5895 5907 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PGALLOC_TRACK_H #define _LINUX_PGALLOC_TRACK_H #if defined(CONFIG_MMU) static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, unsigned long address, pgtbl_mod_mask *mod_mask) { if (unlikely(pgd_none(*pgd))) { if (__p4d_alloc(mm, pgd, address)) return NULL; *mod_mask |= PGTBL_PGD_MODIFIED; } return p4d_offset(pgd, address); } static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, unsigned long address, pgtbl_mod_mask *mod_mask) { if (unlikely(p4d_none(*p4d))) { if (__pud_alloc(mm, p4d, address)) return NULL; *mod_mask |= PGTBL_P4D_MODIFIED; } return pud_offset(p4d, address); } static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud, unsigned long address, pgtbl_mod_mask *mod_mask) { if (unlikely(pud_none(*pud))) { if (__pmd_alloc(mm, pud, address)) return NULL; *mod_mask |= PGTBL_PUD_MODIFIED; } return pmd_offset(pud, address); } #endif /* CONFIG_MMU */ #define pte_alloc_kernel_track(pmd, address, mask) \ ((unlikely(pmd_none(*(pmd))) && \ (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\ NULL: pte_offset_kernel(pmd, address)) #endif /* _LINUX_PGALLOC_TRACK_H */ |
2903 2005 2060 112 1113 3024 11634 2396 2397 2397 1406 2453 11136 2080 11078 11123 11136 11594 11596 11593 11123 11063 11125 138 3384 3129 3126 1069 1068 45 2107 2078 2078 2079 2079 2079 1053 2078 2077 2878 3292 492 124 139 139 139 124 787 154 435 397 154 700 1962 125 125 2693 2692 1872 1871 1871 173 1871 1872 1871 2692 3172 2753 2878 2853 3173 2845 1144 2844 3484 3296 3502 3502 3173 3172 1008 3485 3485 3485 3485 3483 3479 3295 3050 788 787 788 32 32 32 2193 1996 2193 1965 2009 2615 2422 2615 968 1020 990 262 988 987 512 505 512 488 488 474 488 488 488 1615 1616 1617 290 1616 1617 810 1394 1393 1406 1325 585 584 1325 1616 1330 2754 2754 325 2753 2753 2754 540 458 2409 2409 2409 2409 389 389 2409 2407 2408 2132 2077 2363 2362 798 911 2059 2059 2059 1904 1904 1829 1829 9656 9655 827 278 828 828 828 827 827 7 7 7 1 1 7 2059 2059 2 2 2 2 1063 1063 1063 1062 1062 1050 1050 1050 1050 1651 1651 32 32 152 153 23 136 153 152 121 120 119 134 134 134 134 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 | // SPDX-License-Identifier: GPL-2.0+ /* * XArray implementation * Copyright (c) 2017-2018 Microsoft Corporation * Copyright (c) 2018-2020 Oracle * Author: Matthew Wilcox <willy@infradead.org> */ #include <linux/bitmap.h> #include <linux/export.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/xarray.h> #include "radix-tree.h" /* * Coding conventions in this file: * * @xa is used to refer to the entire xarray. * @xas is the 'xarray operation state'. It may be either a pointer to * an xa_state, or an xa_state stored on the stack. This is an unfortunate * ambiguity. * @index is the index of the entry being operated on * @mark is an xa_mark_t; a small number indicating one of the mark bits. * @node refers to an xa_node; usually the primary one being operated on by * this function. * @offset is the index into the slots array inside an xa_node. * @parent refers to the @xa_node closer to the head than @node. * @entry refers to something stored in a slot in the xarray */ static inline unsigned int xa_lock_type(const struct xarray *xa) { return (__force unsigned int)xa->xa_flags & 3; } static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type) { if (lock_type == XA_LOCK_IRQ) xas_lock_irq(xas); else if (lock_type == XA_LOCK_BH) xas_lock_bh(xas); else xas_lock(xas); } static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type) { if (lock_type == XA_LOCK_IRQ) xas_unlock_irq(xas); else if (lock_type == XA_LOCK_BH) xas_unlock_bh(xas); else xas_unlock(xas); } static inline bool xa_track_free(const struct xarray *xa) { return xa->xa_flags & XA_FLAGS_TRACK_FREE; } static inline bool xa_zero_busy(const struct xarray *xa) { return xa->xa_flags & XA_FLAGS_ZERO_BUSY; } static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark) { if (!(xa->xa_flags & XA_FLAGS_MARK(mark))) xa->xa_flags |= XA_FLAGS_MARK(mark); } static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark) { if (xa->xa_flags & XA_FLAGS_MARK(mark)) xa->xa_flags &= ~(XA_FLAGS_MARK(mark)); } static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark) { return node->marks[(__force unsigned)mark]; } static inline bool node_get_mark(struct xa_node *node, unsigned int offset, xa_mark_t mark) { return test_bit(offset, node_marks(node, mark)); } /* returns true if the bit was set */ static inline bool node_set_mark(struct xa_node *node, unsigned int offset, xa_mark_t mark) { return __test_and_set_bit(offset, node_marks(node, mark)); } /* returns true if the bit was set */ static inline bool node_clear_mark(struct xa_node *node, unsigned int offset, xa_mark_t mark) { return __test_and_clear_bit(offset, node_marks(node, mark)); } static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) { return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); } static inline void node_mark_all(struct xa_node *node, xa_mark_t mark) { bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE); } #define mark_inc(mark) do { \ mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \ } while (0) /* * xas_squash_marks() - Merge all marks to the first entry * @xas: Array operation state. * * Set a mark on the first entry if any entry has it set. Clear marks on * all sibling entries. */ static void xas_squash_marks(const struct xa_state *xas) { unsigned int mark = 0; unsigned int limit = xas->xa_offset + xas->xa_sibs + 1; if (!xas->xa_sibs) return; do { unsigned long *marks = xas->xa_node->marks[mark]; if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit) continue; __set_bit(xas->xa_offset, marks); bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); } while (mark++ != (__force unsigned)XA_MARK_MAX); } /* extracts the offset within this node from the index */ static unsigned int get_offset(unsigned long index, struct xa_node *node) { return (index >> node->shift) & XA_CHUNK_MASK; } static void xas_set_offset(struct xa_state *xas) { xas->xa_offset = get_offset(xas->xa_index, xas->xa_node); } /* move the index either forwards (find) or backwards (sibling slot) */ static void xas_move_index(struct xa_state *xas, unsigned long offset) { unsigned int shift = xas->xa_node->shift; xas->xa_index &= ~XA_CHUNK_MASK << shift; xas->xa_index += offset << shift; } static void xas_next_offset(struct xa_state *xas) { xas->xa_offset++; xas_move_index(xas, xas->xa_offset); } static void *set_bounds(struct xa_state *xas) { xas->xa_node = XAS_BOUNDS; return NULL; } /* * Starts a walk. If the @xas is already valid, we assume that it's on * the right path and just return where we've got to. If we're in an * error state, return NULL. If the index is outside the current scope * of the xarray, return NULL without changing @xas->xa_node. Otherwise * set @xas->xa_node to NULL and return the current head of the array. */ static void *xas_start(struct xa_state *xas) { void *entry; if (xas_valid(xas)) return xas_reload(xas); if (xas_error(xas)) return NULL; entry = xa_head(xas->xa); if (!xa_is_node(entry)) { if (xas->xa_index) return set_bounds(xas); } else { if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK) return set_bounds(xas); } xas->xa_node = NULL; return entry; } static void *xas_descend(struct xa_state *xas, struct xa_node *node) { unsigned int offset = get_offset(xas->xa_index, node); void *entry = xa_entry(xas->xa, node, offset); xas->xa_node = node; while (xa_is_sibling(entry)) { offset = xa_to_sibling(entry); entry = xa_entry(xas->xa, node, offset); if (node->shift && xa_is_node(entry)) entry = XA_RETRY_ENTRY; } xas->xa_offset = offset; return entry; } /** * xas_load() - Load an entry from the XArray (advanced). * @xas: XArray operation state. * * Usually walks the @xas to the appropriate state to load the entry * stored at xa_index. However, it will do nothing and return %NULL if * @xas is in an error state. xas_load() will never expand the tree. * * If the xa_state is set up to operate on a multi-index entry, xas_load() * may return %NULL or an internal entry, even if there are entries * present within the range specified by @xas. * * Context: Any context. The caller should hold the xa_lock or the RCU lock. * Return: Usually an entry in the XArray, but see description for exceptions. */ void *xas_load(struct xa_state *xas) { void *entry = xas_start(xas); while (xa_is_node(entry)) { struct xa_node *node = xa_to_node(entry); if (xas->xa_shift > node->shift) break; entry = xas_descend(xas, node); if (node->shift == 0) break; } return entry; } EXPORT_SYMBOL_GPL(xas_load); #define XA_RCU_FREE ((struct xarray *)1) static void xa_node_free(struct xa_node *node) { XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); node->array = XA_RCU_FREE; call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } /* * xas_destroy() - Free any resources allocated during the XArray operation. * @xas: XArray operation state. * * Most users will not need to call this function; it is called for you * by xas_nomem(). */ void xas_destroy(struct xa_state *xas) { struct xa_node *next, *node = xas->xa_alloc; while (node) { XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); next = rcu_dereference_raw(node->parent); radix_tree_node_rcu_free(&node->rcu_head); xas->xa_alloc = node = next; } } /** * xas_nomem() - Allocate memory if needed. * @xas: XArray operation state. * @gfp: Memory allocation flags. * * If we need to add new nodes to the XArray, we try to allocate memory * with GFP_NOWAIT while holding the lock, which will usually succeed. * If it fails, @xas is flagged as needing memory to continue. The caller * should drop the lock and call xas_nomem(). If xas_nomem() succeeds, * the caller should retry the operation. * * Forward progress is guaranteed as one node is allocated here and * stored in the xa_state where it will be found by xas_alloc(). More * nodes will likely be found in the slab allocator, but we do not tie * them up here. * * Return: true if memory was needed, and was successfully allocated. */ bool xas_nomem(struct xa_state *xas, gfp_t gfp) { if (xas->xa_node != XA_ERROR(-ENOMEM)) { xas_destroy(xas); return false; } if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!xas->xa_alloc) return false; xas->xa_alloc->parent = NULL; XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); xas->xa_node = XAS_RESTART; return true; } EXPORT_SYMBOL_GPL(xas_nomem); /* * __xas_nomem() - Drop locks and allocate memory if needed. * @xas: XArray operation state. * @gfp: Memory allocation flags. * * Internal variant of xas_nomem(). * * Return: true if memory was needed, and was successfully allocated. */ static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) __must_hold(xas->xa->xa_lock) { unsigned int lock_type = xa_lock_type(xas->xa); if (xas->xa_node != XA_ERROR(-ENOMEM)) { xas_destroy(xas); return false; } if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; if (gfpflags_allow_blocking(gfp)) { xas_unlock_type(xas, lock_type); xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); xas_lock_type(xas, lock_type); } else { xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); } if (!xas->xa_alloc) return false; xas->xa_alloc->parent = NULL; XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); xas->xa_node = XAS_RESTART; return true; } static void xas_update(struct xa_state *xas, struct xa_node *node) { if (xas->xa_update) xas->xa_update(node); else XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); } static void *xas_alloc(struct xa_state *xas, unsigned int shift) { struct xa_node *parent = xas->xa_node; struct xa_node *node = xas->xa_alloc; if (xas_invalid(xas)) return NULL; if (node) { xas->xa_alloc = NULL; } else { gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!node) { xas_set_err(xas, -ENOMEM); return NULL; } } if (parent) { node->offset = xas->xa_offset; parent->count++; XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE); xas_update(xas, parent); } XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); node->shift = shift; node->count = 0; node->nr_values = 0; RCU_INIT_POINTER(node->parent, xas->xa_node); node->array = xas->xa; return node; } #ifdef CONFIG_XARRAY_MULTI /* Returns the number of indices covered by a given xa_state */ static unsigned long xas_size(const struct xa_state *xas) { return (xas->xa_sibs + 1UL) << xas->xa_shift; } #endif /* * Use this to calculate the maximum index that will need to be created * in order to add the entry described by @xas. Because we cannot store a * multi-index entry at index 0, the calculation is a little more complex * than you might expect. */ static unsigned long xas_max(struct xa_state *xas) { unsigned long max = xas->xa_index; #ifdef CONFIG_XARRAY_MULTI if (xas->xa_shift || xas->xa_sibs) { unsigned long mask = xas_size(xas) - 1; max |= mask; if (mask == max) max++; } #endif return max; } /* The maximum index that can be contained in the array without expanding it */ static unsigned long max_index(void *entry) { if (!xa_is_node(entry)) return 0; return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1; } static void xas_shrink(struct xa_state *xas) { struct xarray *xa = xas->xa; struct xa_node *node = xas->xa_node; for (;;) { void *entry; XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); if (node->count != 1) break; entry = xa_entry_locked(xa, node, 0); if (!entry) break; if (!xa_is_node(entry) && node->shift) break; if (xa_is_zero(entry) && xa_zero_busy(xa)) entry = NULL; xas->xa_node = XAS_BOUNDS; RCU_INIT_POINTER(xa->xa_head, entry); if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK)) xa_mark_clear(xa, XA_FREE_MARK); node->count = 0; node->nr_values = 0; if (!xa_is_node(entry)) RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY); xas_update(xas, node); xa_node_free(node); if (!xa_is_node(entry)) break; node = xa_to_node(entry); node->parent = NULL; } } /* * xas_delete_node() - Attempt to delete an xa_node * @xas: Array operation state. * * Attempts to delete the @xas->xa_node. This will fail if xa->node has * a non-zero reference count. */ static void xas_delete_node(struct xa_state *xas) { struct xa_node *node = xas->xa_node; for (;;) { struct xa_node *parent; XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); if (node->count) break; parent = xa_parent_locked(xas->xa, node); xas->xa_node = parent; xas->xa_offset = node->offset; xa_node_free(node); if (!parent) { xas->xa->xa_head = NULL; xas->xa_node = XAS_BOUNDS; return; } parent->slots[xas->xa_offset] = NULL; parent->count--; XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE); node = parent; xas_update(xas, node); } if (!node->parent) xas_shrink(xas); } /** * xas_free_nodes() - Free this node and all nodes that it references * @xas: Array operation state. * @top: Node to free * * This node has been removed from the tree. We must now free it and all * of its subnodes. There may be RCU walkers with references into the tree, * so we must replace all entries with retry markers. */ static void xas_free_nodes(struct xa_state *xas, struct xa_node *top) { unsigned int offset = 0; struct xa_node *node = top; for (;;) { void *entry = xa_entry_locked(xas->xa, node, offset); if (node->shift && xa_is_node(entry)) { node = xa_to_node(entry); offset = 0; continue; } if (entry) RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY); offset++; while (offset == XA_CHUNK_SIZE) { struct xa_node *parent; parent = xa_parent_locked(xas->xa, node); offset = node->offset + 1; node->count = 0; node->nr_values = 0; xas_update(xas, node); xa_node_free(node); if (node == top) return; node = parent; } } } /* * xas_expand adds nodes to the head of the tree until it has reached * sufficient height to be able to contain @xas->xa_index */ static int xas_expand(struct xa_state *xas, void *head) { struct xarray *xa = xas->xa; struct xa_node *node = NULL; unsigned int shift = 0; unsigned long max = xas_max(xas); if (!head) { if (max == 0) return 0; while ((max >> shift) >= XA_CHUNK_SIZE) shift += XA_CHUNK_SHIFT; return shift + XA_CHUNK_SHIFT; } else if (xa_is_node(head)) { node = xa_to_node(head); shift = node->shift + XA_CHUNK_SHIFT; } xas->xa_node = NULL; while (max > max_index(head)) { xa_mark_t mark = 0; XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); node = xas_alloc(xas, shift); if (!node) return -ENOMEM; node->count = 1; if (xa_is_value(head)) node->nr_values = 1; RCU_INIT_POINTER(node->slots[0], head); /* Propagate the aggregated mark info to the new child */ for (;;) { if (xa_track_free(xa) && mark == XA_FREE_MARK) { node_mark_all(node, XA_FREE_MARK); if (!xa_marked(xa, XA_FREE_MARK)) { node_clear_mark(node, 0, XA_FREE_MARK); xa_mark_set(xa, XA_FREE_MARK); } } else if (xa_marked(xa, mark)) { node_set_mark(node, 0, mark); } if (mark == XA_MARK_MAX) break; mark_inc(mark); } /* * Now that the new node is fully initialised, we can add * it to the tree */ if (xa_is_node(head)) { xa_to_node(head)->offset = 0; rcu_assign_pointer(xa_to_node(head)->parent, node); } head = xa_mk_node(node); rcu_assign_pointer(xa->xa_head, head); xas_update(xas, node); shift += XA_CHUNK_SHIFT; } xas->xa_node = node; return shift; } /* * xas_create() - Create a slot to store an entry in. * @xas: XArray operation state. * @allow_root: %true if we can store the entry in the root directly * * Most users will not need to call this function directly, as it is called * by xas_store(). It is useful for doing conditional store operations * (see the xa_cmpxchg() implementation for an example). * * Return: If the slot already existed, returns the contents of this slot. * If the slot was newly created, returns %NULL. If it failed to create the * slot, returns %NULL and indicates the error in @xas. */ static void *xas_create(struct xa_state *xas, bool allow_root) { struct xarray *xa = xas->xa; void *entry; void __rcu **slot; struct xa_node *node = xas->xa_node; int shift; unsigned int order = xas->xa_shift; if (xas_top(node)) { entry = xa_head_locked(xa); xas->xa_node = NULL; if (!entry && xa_zero_busy(xa)) entry = XA_ZERO_ENTRY; shift = xas_expand(xas, entry); if (shift < 0) return NULL; if (!shift && !allow_root) shift = XA_CHUNK_SHIFT; entry = xa_head_locked(xa); slot = &xa->xa_head; } else if (xas_error(xas)) { return NULL; } else if (node) { unsigned int offset = xas->xa_offset; shift = node->shift; entry = xa_entry_locked(xa, node, offset); slot = &node->slots[offset]; } else { shift = 0; entry = xa_head_locked(xa); slot = &xa->xa_head; } while (shift > order) { shift -= XA_CHUNK_SHIFT; if (!entry) { node = xas_alloc(xas, shift); if (!node) break; if (xa_track_free(xa)) node_mark_all(node, XA_FREE_MARK); rcu_assign_pointer(*slot, xa_mk_node(node)); } else if (xa_is_node(entry)) { node = xa_to_node(entry); } else { break; } entry = xas_descend(xas, node); slot = &node->slots[xas->xa_offset]; } return entry; } /** * xas_create_range() - Ensure that stores to this range will succeed * @xas: XArray operation state. * * Creates all of the slots in the range covered by @xas. Sets @xas to * create single-index entries and positions it at the beginning of the * range. This is for the benefit of users which have not yet been * converted to use multi-index entries. */ void xas_create_range(struct xa_state *xas) { unsigned long index = xas->xa_index; unsigned char shift = xas->xa_shift; unsigned char sibs = xas->xa_sibs; xas->xa_index |= ((sibs + 1UL) << shift) - 1; if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift) xas->xa_offset |= sibs; xas->xa_shift = 0; xas->xa_sibs = 0; for (;;) { xas_create(xas, true); if (xas_error(xas)) goto restore; if (xas->xa_index <= (index | XA_CHUNK_MASK)) goto success; xas->xa_index -= XA_CHUNK_SIZE; for (;;) { struct xa_node *node = xas->xa_node; if (node->shift >= shift) break; xas->xa_node = xa_parent_locked(xas->xa, node); xas->xa_offset = node->offset - 1; if (node->offset != 0) break; } } restore: xas->xa_shift = shift; xas->xa_sibs = sibs; xas->xa_index = index; return; success: xas->xa_index = index; if (xas->xa_node) xas_set_offset(xas); } EXPORT_SYMBOL_GPL(xas_create_range); static void update_node(struct xa_state *xas, struct xa_node *node, int count, int values) { if (!node || (!count && !values)) return; node->count += count; node->nr_values += values; XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE); xas_update(xas, node); if (count < 0) xas_delete_node(xas); } /** * xas_store() - Store this entry in the XArray. * @xas: XArray operation state. * @entry: New entry. * * If @xas is operating on a multi-index entry, the entry returned by this * function is essentially meaningless (it may be an internal entry or it * may be %NULL, even if there are non-NULL entries at some of the indices * covered by the range). This is not a problem for any current users, * and can be changed if needed. * * Return: The old entry at this index. */ void *xas_store(struct xa_state *xas, void *entry) { struct xa_node *node; void __rcu **slot = &xas->xa->xa_head; unsigned int offset, max; int count = 0; int values = 0; void *first, *next; bool value = xa_is_value(entry); if (entry) { bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry); first = xas_create(xas, allow_root); } else { first = xas_load(xas); } if (xas_invalid(xas)) return first; node = xas->xa_node; if (node && (xas->xa_shift < node->shift)) xas->xa_sibs = 0; if ((first == entry) && !xas->xa_sibs) return first; next = first; offset = xas->xa_offset; max = xas->xa_offset + xas->xa_sibs; if (node) { slot = &node->slots[offset]; if (xas->xa_sibs) xas_squash_marks(xas); } if (!entry) xas_init_marks(xas); for (;;) { /* * Must clear the marks before setting the entry to NULL, * otherwise xas_for_each_marked may find a NULL entry and * stop early. rcu_assign_pointer contains a release barrier * so the mark clearing will appear to happen before the * entry is set to NULL. */ rcu_assign_pointer(*slot, entry); if (xa_is_node(next) && (!node || node->shift)) xas_free_nodes(xas, xa_to_node(next)); if (!node) break; count += !next - !entry; values += !xa_is_value(first) - !value; if (entry) { if (offset == max) break; if (!xa_is_sibling(entry)) entry = xa_mk_sibling(xas->xa_offset); } else { if (offset == XA_CHUNK_MASK) break; } next = xa_entry_locked(xas->xa, node, ++offset); if (!xa_is_sibling(next)) { if (!entry && (offset > max)) break; first = next; } slot++; } update_node(xas, node, count, values); return first; } EXPORT_SYMBOL_GPL(xas_store); /** * xas_get_mark() - Returns the state of this mark. * @xas: XArray operation state. * @mark: Mark number. * * Return: true if the mark is set, false if the mark is clear or @xas * is in an error state. */ bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark) { if (xas_invalid(xas)) return false; if (!xas->xa_node) return xa_marked(xas->xa, mark); return node_get_mark(xas->xa_node, xas->xa_offset, mark); } EXPORT_SYMBOL_GPL(xas_get_mark); /** * xas_set_mark() - Sets the mark on this entry and its parents. * @xas: XArray operation state. * @mark: Mark number. * * Sets the specified mark on this entry, and walks up the tree setting it * on all the ancestor entries. Does nothing if @xas has not been walked to * an entry, or is in an error state. */ void xas_set_mark(const struct xa_state *xas, xa_mark_t mark) { struct xa_node *node = xas->xa_node; unsigned int offset = xas->xa_offset; if (xas_invalid(xas)) return; while (node) { if (node_set_mark(node, offset, mark)) return; offset = node->offset; node = xa_parent_locked(xas->xa, node); } if (!xa_marked(xas->xa, mark)) xa_mark_set(xas->xa, mark); } EXPORT_SYMBOL_GPL(xas_set_mark); /** * xas_clear_mark() - Clears the mark on this entry and its parents. * @xas: XArray operation state. * @mark: Mark number. * * Clears the specified mark on this entry, and walks back to the head * attempting to clear it on all the ancestor entries. Does nothing if * @xas has not been walked to an entry, or is in an error state. */ void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) { struct xa_node *node = xas->xa_node; unsigned int offset = xas->xa_offset; if (xas_invalid(xas)) return; while (node) { if (!node_clear_mark(node, offset, mark)) return; if (node_any_mark(node, mark)) return; offset = node->offset; node = xa_parent_locked(xas->xa, node); } if (xa_marked(xas->xa, mark)) xa_mark_clear(xas->xa, mark); } EXPORT_SYMBOL_GPL(xas_clear_mark); /** * xas_init_marks() - Initialise all marks for the entry * @xas: Array operations state. * * Initialise all marks for the entry specified by @xas. If we're tracking * free entries with a mark, we need to set it on all entries. All other * marks are cleared. * * This implementation is not as efficient as it could be; we may walk * up the tree multiple times. */ void xas_init_marks(const struct xa_state *xas) { xa_mark_t mark = 0; for (;;) { if (xa_track_free(xas->xa) && mark == XA_FREE_MARK) xas_set_mark(xas, mark); else xas_clear_mark(xas, mark); if (mark == XA_MARK_MAX) break; mark_inc(mark); } } EXPORT_SYMBOL_GPL(xas_init_marks); #ifdef CONFIG_XARRAY_MULTI static unsigned int node_get_marks(struct xa_node *node, unsigned int offset) { unsigned int marks = 0; xa_mark_t mark = XA_MARK_0; for (;;) { if (node_get_mark(node, offset, mark)) marks |= 1 << (__force unsigned int)mark; if (mark == XA_MARK_MAX) break; mark_inc(mark); } return marks; } static void node_set_marks(struct xa_node *node, unsigned int offset, struct xa_node *child, unsigned int marks) { xa_mark_t mark = XA_MARK_0; for (;;) { if (marks & (1 << (__force unsigned int)mark)) { node_set_mark(node, offset, mark); if (child) node_mark_all(child, mark); } if (mark == XA_MARK_MAX) break; mark_inc(mark); } } /** * xas_split_alloc() - Allocate memory for splitting an entry. * @xas: XArray operation state. * @entry: New entry which will be stored in the array. * @order: Current entry order. * @gfp: Memory allocation flags. * * This function should be called before calling xas_split(). * If necessary, it will allocate new nodes (and fill them with @entry) * to prepare for the upcoming split of an entry of @order size into * entries of the order stored in the @xas. * * Context: May sleep if @gfp flags permit. */ void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; unsigned int mask = xas->xa_sibs; /* XXX: no support for splitting really large entries yet */ if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT < order)) goto nomem; if (xas->xa_shift + XA_CHUNK_SHIFT > order) return; do { unsigned int i; void *sibling = NULL; struct xa_node *node; node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!node) goto nomem; node->array = xas->xa; for (i = 0; i < XA_CHUNK_SIZE; i++) { if ((i & mask) == 0) { RCU_INIT_POINTER(node->slots[i], entry); sibling = xa_mk_sibling(i); } else { RCU_INIT_POINTER(node->slots[i], sibling); } } RCU_INIT_POINTER(node->parent, xas->xa_alloc); xas->xa_alloc = node; } while (sibs-- > 0); return; nomem: xas_destroy(xas); xas_set_err(xas, -ENOMEM); } EXPORT_SYMBOL_GPL(xas_split_alloc); /** * xas_split() - Split a multi-index entry into smaller entries. * @xas: XArray operation state. * @entry: New entry to store in the array. * @order: Current entry order. * * The size of the new entries is set in @xas. The value in @entry is * copied to all the replacement entries. * * Context: Any context. The caller should hold the xa_lock. */ void xas_split(struct xa_state *xas, void *entry, unsigned int order) { unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; unsigned int offset, marks; struct xa_node *node; void *curr = xas_load(xas); int values = 0; node = xas->xa_node; if (xas_top(node)) return; marks = node_get_marks(node, xas->xa_offset); offset = xas->xa_offset + sibs; do { if (xas->xa_shift < node->shift) { struct xa_node *child = xas->xa_alloc; xas->xa_alloc = rcu_dereference_raw(child->parent); child->shift = node->shift - XA_CHUNK_SHIFT; child->offset = offset; child->count = XA_CHUNK_SIZE; child->nr_values = xa_is_value(entry) ? XA_CHUNK_SIZE : 0; RCU_INIT_POINTER(child->parent, node); node_set_marks(node, offset, child, marks); rcu_assign_pointer(node->slots[offset], xa_mk_node(child)); if (xa_is_value(curr)) values--; xas_update(xas, child); } else { unsigned int canon = offset - xas->xa_sibs; node_set_marks(node, canon, NULL, marks); rcu_assign_pointer(node->slots[canon], entry); while (offset > canon) rcu_assign_pointer(node->slots[offset--], xa_mk_sibling(canon)); values += (xa_is_value(entry) - xa_is_value(curr)) * (xas->xa_sibs + 1); } } while (offset-- > xas->xa_offset); node->nr_values += values; xas_update(xas, node); } EXPORT_SYMBOL_GPL(xas_split); #endif /** * xas_pause() - Pause a walk to drop a lock. * @xas: XArray operation state. * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @xas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call xas_pause(), the xa_for_each() * iterator may be more appropriate. * * Note that xas_pause() only works for forward iteration. If a user needs * to pause a reverse iteration, we will need a xas_pause_rev(). */ void xas_pause(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (xas_invalid(xas)) return; xas->xa_node = XAS_RESTART; if (node) { unsigned long offset = xas->xa_offset; while (++offset < XA_CHUNK_SIZE) { if (!xa_is_sibling(xa_entry(xas->xa, node, offset))) break; } xas->xa_index += (offset - xas->xa_offset) << node->shift; if (xas->xa_index == 0) xas->xa_node = XAS_BOUNDS; } else { xas->xa_index++; } } EXPORT_SYMBOL_GPL(xas_pause); /* * __xas_prev() - Find the previous entry in the XArray. * @xas: XArray operation state. * * Helper function for xas_prev() which handles all the complex cases * out of line. */ void *__xas_prev(struct xa_state *xas) { void *entry; if (!xas_frozen(xas->xa_node)) xas->xa_index--; if (!xas->xa_node) return set_bounds(xas); if (xas_not_node(xas->xa_node)) return xas_load(xas); if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) xas->xa_offset--; while (xas->xa_offset == 255) { xas->xa_offset = xas->xa_node->offset - 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); if (!xas->xa_node) return set_bounds(xas); } for (;;) { entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); xas_set_offset(xas); } } EXPORT_SYMBOL_GPL(__xas_prev); /* * __xas_next() - Find the next entry in the XArray. * @xas: XArray operation state. * * Helper function for xas_next() which handles all the complex cases * out of line. */ void *__xas_next(struct xa_state *xas) { void *entry; if (!xas_frozen(xas->xa_node)) xas->xa_index++; if (!xas->xa_node) return set_bounds(xas); if (xas_not_node(xas->xa_node)) return xas_load(xas); if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) xas->xa_offset++; while (xas->xa_offset == XA_CHUNK_SIZE) { xas->xa_offset = xas->xa_node->offset + 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); if (!xas->xa_node) return set_bounds(xas); } for (;;) { entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); xas_set_offset(xas); } } EXPORT_SYMBOL_GPL(__xas_next); /** * xas_find() - Find the next present entry in the XArray. * @xas: XArray operation state. * @max: Highest index to return. * * If the @xas has not yet been walked to an entry, return the entry * which has an index >= xas.xa_index. If it has been walked, the entry * currently being pointed at has been processed, and so we move to the * next entry. * * If no entry is found and the array is smaller than @max, the iterator * is set to the smallest index not yet in the array. This allows @xas * to be immediately passed to xas_store(). * * Return: The entry, if found, otherwise %NULL. */ void *xas_find(struct xa_state *xas, unsigned long max) { void *entry; if (xas_error(xas) || xas->xa_node == XAS_BOUNDS) return NULL; if (xas->xa_index > max) return set_bounds(xas); if (!xas->xa_node) { xas->xa_index = 1; return set_bounds(xas); } else if (xas->xa_node == XAS_RESTART) { entry = xas_load(xas); if (entry || xas_not_node(xas->xa_node)) return entry; } else if (!xas->xa_node->shift && xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) { xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1; } xas_next_offset(xas); while (xas->xa_node && (xas->xa_index <= max)) { if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { xas->xa_offset = xas->xa_node->offset + 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); continue; } entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (xa_is_node(entry)) { xas->xa_node = xa_to_node(entry); xas->xa_offset = 0; continue; } if (entry && !xa_is_sibling(entry)) return entry; xas_next_offset(xas); } if (!xas->xa_node) xas->xa_node = XAS_BOUNDS; return NULL; } EXPORT_SYMBOL_GPL(xas_find); /** * xas_find_marked() - Find the next marked entry in the XArray. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark number to search for. * * If the @xas has not yet been walked to an entry, return the marked entry * which has an index >= xas.xa_index. If it has been walked, the entry * currently being pointed at has been processed, and so we return the * first marked entry with an index > xas.xa_index. * * If no marked entry is found and the array is smaller than @max, @xas is * set to the bounds state and xas->xa_index is set to the smallest index * not yet in the array. This allows @xas to be immediately passed to * xas_store(). * * If no entry is found before @max is reached, @xas is set to the restart * state. * * Return: The entry, if found, otherwise %NULL. */ void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { bool advance = true; unsigned int offset; void *entry; if (xas_error(xas)) return NULL; if (xas->xa_index > max) goto max; if (!xas->xa_node) { xas->xa_index = 1; goto out; } else if (xas_top(xas->xa_node)) { advance = false; entry = xa_head(xas->xa); xas->xa_node = NULL; if (xas->xa_index > max_index(entry)) goto out; if (!xa_is_node(entry)) { if (xa_marked(xas->xa, mark)) return entry; xas->xa_index = 1; goto out; } xas->xa_node = xa_to_node(entry); xas->xa_offset = xas->xa_index >> xas->xa_node->shift; } while (xas->xa_index <= max) { if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { xas->xa_offset = xas->xa_node->offset + 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); if (!xas->xa_node) break; advance = false; continue; } if (!advance) { entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (xa_is_sibling(entry)) { xas->xa_offset = xa_to_sibling(entry); xas_move_index(xas, xas->xa_offset); } } offset = xas_find_chunk(xas, advance, mark); if (offset > xas->xa_offset) { advance = false; xas_move_index(xas, offset); /* Mind the wrap */ if ((xas->xa_index - 1) >= max) goto max; xas->xa_offset = offset; if (offset == XA_CHUNK_SIZE) continue; } entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK)) continue; if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); xas_set_offset(xas); } out: if (xas->xa_index > max) goto max; return set_bounds(xas); max: xas->xa_node = XAS_RESTART; return NULL; } EXPORT_SYMBOL_GPL(xas_find_marked); /** * xas_find_conflict() - Find the next present entry in a range. * @xas: XArray operation state. * * The @xas describes both a range and a position within that range. * * Context: Any context. Expects xa_lock to be held. * Return: The next entry in the range covered by @xas or %NULL. */ void *xas_find_conflict(struct xa_state *xas) { void *curr; if (xas_error(xas)) return NULL; if (!xas->xa_node) return NULL; if (xas_top(xas->xa_node)) { curr = xas_start(xas); if (!curr) return NULL; while (xa_is_node(curr)) { struct xa_node *node = xa_to_node(curr); curr = xas_descend(xas, node); } if (curr) return curr; } if (xas->xa_node->shift > xas->xa_shift) return NULL; for (;;) { if (xas->xa_node->shift == xas->xa_shift) { if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs) break; } else if (xas->xa_offset == XA_CHUNK_MASK) { xas->xa_offset = xas->xa_node->offset; xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node); if (!xas->xa_node) break; continue; } curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset); if (xa_is_sibling(curr)) continue; while (xa_is_node(curr)) { xas->xa_node = xa_to_node(curr); xas->xa_offset = 0; curr = xa_entry_locked(xas->xa, xas->xa_node, 0); } if (curr) return curr; } xas->xa_offset -= xas->xa_sibs; return NULL; } EXPORT_SYMBOL_GPL(xas_find_conflict); /** * xa_load() - Load an entry from an XArray. * @xa: XArray. * @index: index into array. * * Context: Any context. Takes and releases the RCU lock. * Return: The entry at @index in @xa. */ void *xa_load(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); void *entry; rcu_read_lock(); do { entry = xas_load(&xas); if (xa_is_zero(entry)) entry = NULL; } while (xas_retry(&xas, entry)); rcu_read_unlock(); return entry; } EXPORT_SYMBOL(xa_load); static void *xas_result(struct xa_state *xas, void *curr) { if (xa_is_zero(curr)) return NULL; if (xas_error(xas)) curr = xas->xa_node; return curr; } /** * __xa_erase() - Erase this entry from the XArray while locked. * @xa: XArray. * @index: Index into array. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Expects xa_lock to be held on entry. * Return: The entry which used to be at this index. */ void *__xa_erase(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); return xas_result(&xas, xas_store(&xas, NULL)); } EXPORT_SYMBOL(__xa_erase); /** * xa_erase() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock. * Return: The entry which used to be at this index. */ void *xa_erase(struct xarray *xa, unsigned long index) { void *entry; xa_lock(xa); entry = __xa_erase(xa, index); xa_unlock(xa); return entry; } EXPORT_SYMBOL(xa_erase); /** * __xa_store() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * You must already be holding the xa_lock when calling this function. * It will drop the lock if needed to allocate memory, and then reacquire * it afterwards. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: The old entry at this index or xa_err() if an error happened. */ void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { XA_STATE(xas, xa, index); void *curr; if (WARN_ON_ONCE(xa_is_advanced(entry))) return XA_ERROR(-EINVAL); if (xa_track_free(xa) && !entry) entry = XA_ZERO_ENTRY; do { curr = xas_store(&xas, entry); if (xa_track_free(xa)) xas_clear_mark(&xas, XA_FREE_MARK); } while (__xas_nomem(&xas, gfp)); return xas_result(&xas, curr); } EXPORT_SYMBOL(__xa_store); /** * xa_store() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * After this function returns, loads from this index will return @entry. * Storing into an existing multi-index entry updates the entry of every index. * The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation * failed. */ void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; xa_lock(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock(xa); return curr; } EXPORT_SYMBOL(xa_store); /** * __xa_cmpxchg() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New entry. * @gfp: Memory allocation flags. * * You must already be holding the xa_lock when calling this function. * It will drop the lock if needed to allocate memory, and then reacquire * it afterwards. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: The old entry at this index or xa_err() if an error happened. */ void *__xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { XA_STATE(xas, xa, index); void *curr; if (WARN_ON_ONCE(xa_is_advanced(entry))) return XA_ERROR(-EINVAL); do { curr = xas_load(&xas); if (curr == old) { xas_store(&xas, entry); if (xa_track_free(xa) && entry && !curr) xas_clear_mark(&xas, XA_FREE_MARK); } } while (__xas_nomem(&xas, gfp)); return xas_result(&xas, curr); } EXPORT_SYMBOL(__xa_cmpxchg); /** * __xa_insert() - Store this entry in the XArray if no entry is present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { XA_STATE(xas, xa, index); void *curr; if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (!entry) entry = XA_ZERO_ENTRY; do { curr = xas_load(&xas); if (!curr) { xas_store(&xas, entry); if (xa_track_free(xa)) xas_clear_mark(&xas, XA_FREE_MARK); } else { xas_set_err(&xas, -EBUSY); } } while (__xas_nomem(&xas, gfp)); return xas_error(&xas); } EXPORT_SYMBOL(__xa_insert); #ifdef CONFIG_XARRAY_MULTI static void xas_set_range(struct xa_state *xas, unsigned long first, unsigned long last) { unsigned int shift = 0; unsigned long sibs = last - first; unsigned int offset = XA_CHUNK_MASK; xas_set(xas, first); while ((first & XA_CHUNK_MASK) == 0) { if (sibs < XA_CHUNK_MASK) break; if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK)) break; shift += XA_CHUNK_SHIFT; if (offset == XA_CHUNK_MASK) offset = sibs & XA_CHUNK_MASK; sibs >>= XA_CHUNK_SHIFT; first >>= XA_CHUNK_SHIFT; } offset = first & XA_CHUNK_MASK; if (offset + sibs > XA_CHUNK_MASK) sibs = XA_CHUNK_MASK - offset; if ((((first + sibs + 1) << shift) - 1) > last) sibs -= 1; xas->xa_shift = shift; xas->xa_sibs = sibs; } /** * xa_store_range() - Store this entry at a range of indices in the XArray. * @xa: XArray. * @first: First index to affect. * @last: Last index to affect. * @entry: New entry. * @gfp: Memory allocation flags. * * After this function returns, loads from any index between @first and @last, * inclusive will return @entry. * Storing into an existing multi-index entry updates the entry of every index. * The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Process context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in * an XArray, or xa_err(-ENOMEM) if memory allocation failed. */ void *xa_store_range(struct xarray *xa, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { XA_STATE(xas, xa, 0); if (WARN_ON_ONCE(xa_is_internal(entry))) return XA_ERROR(-EINVAL); if (last < first) return XA_ERROR(-EINVAL); do { xas_lock(&xas); if (entry) { unsigned int order = BITS_PER_LONG; if (last + 1) order = __ffs(last + 1); xas_set_order(&xas, last, order); xas_create(&xas, true); if (xas_error(&xas)) goto unlock; } do { xas_set_range(&xas, first, last); xas_store(&xas, entry); if (xas_error(&xas)) goto unlock; first += xas_size(&xas); } while (first <= last); unlock: xas_unlock(&xas); } while (xas_nomem(&xas, gfp)); return xas_result(&xas, NULL); } EXPORT_SYMBOL(xa_store_range); /** * xa_get_order() - Get the order of an entry. * @xa: XArray. * @index: Index of the entry. * * Return: A number between 0 and 63 indicating the order of the entry. */ int xa_get_order(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); void *entry; int order = 0; rcu_read_lock(); entry = xas_load(&xas); if (!entry) goto unlock; if (!xas.xa_node) goto unlock; for (;;) { unsigned int slot = xas.xa_offset + (1 << order); if (slot >= XA_CHUNK_SIZE) break; if (!xa_is_sibling(xas.xa_node->slots[slot])) break; order++; } order += xas.xa_node->shift; unlock: rcu_read_unlock(); return order; } EXPORT_SYMBOL(xa_get_order); #endif /* CONFIG_XARRAY_MULTI */ /** * __xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @limit: Range for allocated ID. * @entry: New entry. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ int __xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { XA_STATE(xas, xa, 0); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (WARN_ON_ONCE(!xa_track_free(xa))) return -EINVAL; if (!entry) entry = XA_ZERO_ENTRY; do { xas.xa_index = limit.min; xas_find_marked(&xas, limit.max, XA_FREE_MARK); if (xas.xa_node == XAS_RESTART) xas_set_err(&xas, -EBUSY); else *id = xas.xa_index; xas_store(&xas, entry); xas_clear_mark(&xas, XA_FREE_MARK); } while (__xas_nomem(&xas, gfp)); return xas_error(&xas); } EXPORT_SYMBOL(__xa_alloc); /** * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { u32 min = limit.min; int ret; limit.min = max(min, *next); ret = __xa_alloc(xa, id, entry, limit, gfp); if ((xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED) && ret == 0) { xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && limit.min > min) { limit.min = min; ret = __xa_alloc(xa, id, entry, limit, gfp); if (ret == 0) ret = 1; } if (ret >= 0) { *next = *id + 1; if (*next == 0) xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED; } return ret; } EXPORT_SYMBOL(__xa_alloc_cyclic); /** * __xa_set_mark() - Set this mark on this entry while locked. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Attempting to set a mark on a %NULL entry does not succeed. * * Context: Any context. Expects xa_lock to be held on entry. */ void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry = xas_load(&xas); if (entry) xas_set_mark(&xas, mark); } EXPORT_SYMBOL(__xa_set_mark); /** * __xa_clear_mark() - Clear this mark on this entry while locked. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Context: Any context. Expects xa_lock to be held on entry. */ void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry = xas_load(&xas); if (entry) xas_clear_mark(&xas, mark); } EXPORT_SYMBOL(__xa_clear_mark); /** * xa_get_mark() - Inquire whether this mark is set on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * This function uses the RCU read lock, so the result may be out of date * by the time it returns. If you need the result to be stable, use a lock. * * Context: Any context. Takes and releases the RCU lock. * Return: True if the entry at @index has this mark set, false if it doesn't. */ bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry; rcu_read_lock(); entry = xas_start(&xas); while (xas_get_mark(&xas, mark)) { if (!xa_is_node(entry)) goto found; entry = xas_descend(&xas, xa_to_node(entry)); } rcu_read_unlock(); return false; found: rcu_read_unlock(); return true; } EXPORT_SYMBOL(xa_get_mark); /** * xa_set_mark() - Set this mark on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Attempting to set a mark on a %NULL entry does not succeed. * * Context: Process context. Takes and releases the xa_lock. */ void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { xa_lock(xa); __xa_set_mark(xa, index, mark); xa_unlock(xa); } EXPORT_SYMBOL(xa_set_mark); /** * xa_clear_mark() - Clear this mark on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Clearing a mark always succeeds. * * Context: Process context. Takes and releases the xa_lock. */ void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { xa_lock(xa); __xa_clear_mark(xa, index, mark); xa_unlock(xa); } EXPORT_SYMBOL(xa_clear_mark); /** * xa_find() - Search the XArray for an entry. * @xa: XArray. * @indexp: Pointer to an index. * @max: Maximum index to search to. * @filter: Selection criterion. * * Finds the entry in @xa which matches the @filter, and has the lowest * index that is at least @indexp and no more than @max. * If an entry is found, @indexp is updated to be the index of the entry. * This function is protected by the RCU read lock, so it may not find * entries which are being simultaneously added. It will not return an * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). * * Context: Any context. Takes and releases the RCU lock. * Return: The entry, if found, otherwise %NULL. */ void *xa_find(struct xarray *xa, unsigned long *indexp, unsigned long max, xa_mark_t filter) { XA_STATE(xas, xa, *indexp); void *entry; rcu_read_lock(); do { if ((__force unsigned int)filter < XA_MAX_MARKS) entry = xas_find_marked(&xas, max, filter); else entry = xas_find(&xas, max); } while (xas_retry(&xas, entry)); rcu_read_unlock(); if (entry) *indexp = xas.xa_index; return entry; } EXPORT_SYMBOL(xa_find); static bool xas_sibling(struct xa_state *xas) { struct xa_node *node = xas->xa_node; unsigned long mask; if (!IS_ENABLED(CONFIG_XARRAY_MULTI) || !node) return false; mask = (XA_CHUNK_SIZE << node->shift) - 1; return (xas->xa_index & mask) > ((unsigned long)xas->xa_offset << node->shift); } /** * xa_find_after() - Search the XArray for a present entry. * @xa: XArray. * @indexp: Pointer to an index. * @max: Maximum index to search to. * @filter: Selection criterion. * * Finds the entry in @xa which matches the @filter and has the lowest * index that is above @indexp and no more than @max. * If an entry is found, @indexp is updated to be the index of the entry. * This function is protected by the RCU read lock, so it may miss entries * which are being simultaneously added. It will not return an * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). * * Context: Any context. Takes and releases the RCU lock. * Return: The pointer, if found, otherwise %NULL. */ void *xa_find_after(struct xarray *xa, unsigned long *indexp, unsigned long max, xa_mark_t filter) { XA_STATE(xas, xa, *indexp + 1); void *entry; if (xas.xa_index == 0) return NULL; rcu_read_lock(); for (;;) { if ((__force unsigned int)filter < XA_MAX_MARKS) entry = xas_find_marked(&xas, max, filter); else entry = xas_find(&xas, max); if (xas_invalid(&xas)) break; if (xas_sibling(&xas)) continue; if (!xas_retry(&xas, entry)) break; } rcu_read_unlock(); if (entry) *indexp = xas.xa_index; return entry; } EXPORT_SYMBOL(xa_find_after); static unsigned int xas_extract_present(struct xa_state *xas, void **dst, unsigned long max, unsigned int n) { void *entry; unsigned int i = 0; rcu_read_lock(); xas_for_each(xas, entry, max) { if (xas_retry(xas, entry)) continue; dst[i++] = entry; if (i == n) break; } rcu_read_unlock(); return i; } static unsigned int xas_extract_marked(struct xa_state *xas, void **dst, unsigned long max, unsigned int n, xa_mark_t mark) { void *entry; unsigned int i = 0; rcu_read_lock(); xas_for_each_marked(xas, entry, max, mark) { if (xas_retry(xas, entry)) continue; dst[i++] = entry; if (i == n) break; } rcu_read_unlock(); return i; } /** * xa_extract() - Copy selected entries from the XArray into a normal array. * @xa: The source XArray to copy from. * @dst: The buffer to copy entries into. * @start: The first index in the XArray eligible to be selected. * @max: The last index in the XArray eligible to be selected. * @n: The maximum number of entries to copy. * @filter: Selection criterion. * * Copies up to @n entries that match @filter from the XArray. The * copied entries will have indices between @start and @max, inclusive. * * The @filter may be an XArray mark value, in which case entries which are * marked with that mark will be copied. It may also be %XA_PRESENT, in * which case all entries which are not %NULL will be copied. * * The entries returned may not represent a snapshot of the XArray at a * moment in time. For example, if another thread stores to index 5, then * index 10, calling xa_extract() may return the old contents of index 5 * and the new contents of index 10. Indices not modified while this * function is running will not be skipped. * * If you need stronger guarantees, holding the xa_lock across calls to this * function will prevent concurrent modification. * * Context: Any context. Takes and releases the RCU lock. * Return: The number of entries copied. */ unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t filter) { XA_STATE(xas, xa, start); if (!n) return 0; if ((__force unsigned int)filter < XA_MAX_MARKS) return xas_extract_marked(&xas, dst, max, n, filter); return xas_extract_present(&xas, dst, max, n); } EXPORT_SYMBOL(xa_extract); /** * xa_delete_node() - Private interface for workingset code. * @node: Node to be removed from the tree. * @update: Function to call to update ancestor nodes. * * Context: xa_lock must be held on entry and will not be released. */ void xa_delete_node(struct xa_node *node, xa_update_node_t update) { struct xa_state xas = { .xa = node->array, .xa_index = (unsigned long)node->offset << (node->shift + XA_CHUNK_SHIFT), .xa_shift = node->shift + XA_CHUNK_SHIFT, .xa_offset = node->offset, .xa_node = xa_parent_locked(node->array, node), .xa_update = update, }; xas_store(&xas, NULL); } EXPORT_SYMBOL_GPL(xa_delete_node); /* For the benefit of the test suite */ /** * xa_destroy() - Free all internal data structures. * @xa: XArray. * * After calling this function, the XArray is empty and has freed all memory * allocated for its internal data structures. You are responsible for * freeing the objects referenced by the XArray. * * Context: Any context. Takes and releases the xa_lock, interrupt-safe. */ void xa_destroy(struct xarray *xa) { XA_STATE(xas, xa, 0); unsigned long flags; void *entry; xas.xa_node = NULL; xas_lock_irqsave(&xas, flags); entry = xa_head_locked(xa); RCU_INIT_POINTER(xa->xa_head, NULL); xas_init_marks(&xas); if (xa_zero_busy(xa)) xa_mark_clear(xa, XA_FREE_MARK); /* lockdep checks we're still holding the lock in xas_free_nodes() */ if (xa_is_node(entry)) xas_free_nodes(&xas, xa_to_node(entry)); xas_unlock_irqrestore(&xas, flags); } EXPORT_SYMBOL(xa_destroy); #ifdef XA_DEBUG void xa_dump_node(const struct xa_node *node) { unsigned i, j; if (!node) return; if ((unsigned long)node & 3) { pr_cont("node %px\n", node); return; } pr_cont("node %px %s %d parent %px shift %d count %d values %d " "array %px list %px %px marks", node, node->parent ? "offset" : "max", node->offset, node->parent, node->shift, node->count, node->nr_values, node->array, node->private_list.prev, node->private_list.next); for (i = 0; i < XA_MAX_MARKS; i++) for (j = 0; j < XA_MARK_LONGS; j++) pr_cont(" %lx", node->marks[i][j]); pr_cont("\n"); } void xa_dump_index(unsigned long index, unsigned int shift) { if (!shift) pr_info("%lu: ", index); else if (shift >= BITS_PER_LONG) pr_info("0-%lu: ", ~0UL); else pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1)); } void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift) { if (!entry) return; xa_dump_index(index, shift); if (xa_is_node(entry)) { if (shift == 0) { pr_cont("%px\n", entry); } else { unsigned long i; struct xa_node *node = xa_to_node(entry); xa_dump_node(node); for (i = 0; i < XA_CHUNK_SIZE; i++) xa_dump_entry(node->slots[i], index + (i << node->shift), node->shift); } } else if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (!xa_is_internal(entry)) pr_cont("%px\n", entry); else if (xa_is_retry(entry)) pr_cont("retry (%ld)\n", xa_to_internal(entry)); else if (xa_is_sibling(entry)) pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry)); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else pr_cont("UNKNOWN ENTRY (%px)\n", entry); } void xa_dump(const struct xarray *xa) { void *entry = xa->xa_head; unsigned int shift = 0; pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, xa->xa_flags, xa_marked(xa, XA_MARK_0), xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2)); if (xa_is_node(entry)) shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; xa_dump_entry(entry, 0, shift); } #endif |
1365 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COMPLETION_H #define __LINUX_COMPLETION_H /* * (C) Copyright 2001 Linus Torvalds * * Atomic wait-for-completion handler data structures. * See kernel/sched/completion.c for details. */ #include <linux/swait.h> /* * struct completion - structure used to maintain state for a "completion" * * This is the opaque structure used to maintain the state for a "completion". * Completions currently use a FIFO to queue threads that have to wait for * the "completion" event. * * See also: complete(), wait_for_completion() (and friends _timeout, * _interruptible, _interruptible_timeout, and _killable), init_completion(), * reinit_completion(), and macros DECLARE_COMPLETION(), * DECLARE_COMPLETION_ONSTACK(). */ struct completion { unsigned int done; struct swait_queue_head wait; }; #define init_completion_map(x, m) init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} #define COMPLETION_INITIALIZER(work) \ { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) } #define COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) \ (*({ init_completion_map(&(work), &(map)); &(work); })) #define COMPLETION_INITIALIZER_ONSTACK(work) \ (*({ init_completion(&work); &work; })) /** * DECLARE_COMPLETION - declare and initialize a completion structure * @work: identifier for the completion structure * * This macro declares and initializes a completion structure. Generally used * for static declarations. You should use the _ONSTACK variant for automatic * variables. */ #define DECLARE_COMPLETION(work) \ struct completion work = COMPLETION_INITIALIZER(work) /* * Lockdep needs to run a non-constant initializer for on-stack * completions - so we use the _ONSTACK() variant for those that * are on the kernel stack: */ /** * DECLARE_COMPLETION_ONSTACK - declare and initialize a completion structure * @work: identifier for the completion structure * * This macro declares and initializes a completion structure on the kernel * stack. */ #ifdef CONFIG_LOCKDEP # define DECLARE_COMPLETION_ONSTACK(work) \ struct completion work = COMPLETION_INITIALIZER_ONSTACK(work) # define DECLARE_COMPLETION_ONSTACK_MAP(work, map) \ struct completion work = COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) #else # define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work) # define DECLARE_COMPLETION_ONSTACK_MAP(work, map) DECLARE_COMPLETION(work) #endif /** * init_completion - Initialize a dynamically allocated completion * @x: pointer to completion structure that is to be initialized * * This inline function will initialize a dynamically created completion * structure. */ static inline void init_completion(struct completion *x) { x->done = 0; init_swait_queue_head(&x->wait); } /** * reinit_completion - reinitialize a completion structure * @x: pointer to completion structure that is to be reinitialized * * This inline function should be used to reinitialize a completion structure so it can * be reused. This is especially important after complete_all() is used. */ static inline void reinit_completion(struct completion *x) { x->done = 0; } extern void wait_for_completion(struct completion *); extern void wait_for_completion_io(struct completion *); extern int wait_for_completion_interruptible(struct completion *x); extern int wait_for_completion_killable(struct completion *x); extern int wait_for_completion_state(struct completion *x, unsigned int state); extern unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout); extern unsigned long wait_for_completion_io_timeout(struct completion *x, unsigned long timeout); extern long wait_for_completion_interruptible_timeout( struct completion *x, unsigned long timeout); extern long wait_for_completion_killable_timeout( struct completion *x, unsigned long timeout); extern bool try_wait_for_completion(struct completion *x); extern bool completion_done(struct completion *x); extern void complete(struct completion *); extern void complete_on_current_cpu(struct completion *x); extern void complete_all(struct completion *); #endif |
24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 56 56 56 56 56 56 60 60 60 60 60 59 58 58 58 58 58 55 55 56 56 56 56 56 56 57 5 2 246 245 245 245 245 26 246 246 27 245 246 246 23 245 245 245 245 12 245 245 245 245 245 245 245 245 243 245 22 22 211 2 209 246 243 26 26 26 73 68 67 66 66 63 58 23 61 73 31 32 32 32 32 7 255 255 255 248 248 247 243 248 7 8 298 298 297 296 263 296 294 2 2 294 24 24 1 1 4 1 3 1 1 1 1030 73 3 4 4 4 188 187 184 184 184 3 180 339 338 337 335 335 335 75 32 32 19 19 215 215 214 61 59 58 176 176 176 175 4 8 1 2 1 77 21 6 6 1 5 5 6 1 6 8 299 1 31 5 1 1553 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/ioctl.c * * Copyright (C) 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) */ #include <linux/fs.h> #include <linux/capability.h> #include <linux/time.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/file.h> #include <linux/quotaops.h> #include <linux/random.h> #include <linux/uaccess.h> #include <linux/delay.h> #include <linux/iversion.h> #include <linux/fileattr.h> #include <linux/uuid.h> #include "ext4_jbd2.h" #include "ext4.h" #include <linux/fsmap.h> #include "fsmap.h" #include <trace/events/ext4.h> typedef void ext4_update_sb_callback(struct ext4_super_block *es, const void *arg); /* * Superblock modification callback function for changing file system * label */ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) { /* Sanity check, this should never happen */ BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX); memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX); } /* * Superblock modification callback function for changing file system * UUID. */ static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg) { memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE); } static int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, ext4_update_sb_callback func, const void *arg) { int err = 0; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh = sbi->s_sbh; struct ext4_super_block *es = sbi->s_es; trace_ext4_update_sb(sb, bh->b_blocknr, 1); BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto out_err; lock_buffer(bh); func(es, arg); ext4_superblock_csum_set(sb); unlock_buffer(bh); if (buffer_write_io_error(bh) || !buffer_uptodate(bh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_err; err = sync_dirty_buffer(bh); out_err: ext4_std_error(sb, err); return err; } /* * Update one backup superblock in the group 'grp' using the callback * function 'func' and argument 'arg'. If the handle is NULL the * modification is not journalled. * * Returns: 0 when no modification was done (no superblock in the group) * 1 when the modification was successful * <0 on error */ static int ext4_update_backup_sb(struct super_block *sb, handle_t *handle, ext4_group_t grp, ext4_update_sb_callback func, const void *arg) { int err = 0; ext4_fsblk_t sb_block; struct buffer_head *bh; unsigned long offset = 0; struct ext4_super_block *es; if (!ext4_bg_has_super(sb, grp)) return 0; /* * For the group 0 there is always 1k padding, so we have * either adjust offset, or sb_block depending on blocksize */ if (grp == 0) { sb_block = 1 * EXT4_MIN_BLOCK_SIZE; offset = do_div(sb_block, sb->s_blocksize); } else { sb_block = ext4_group_first_block_no(sb, grp); offset = 0; } trace_ext4_update_sb(sb, sb_block, handle ? 1 : 0); bh = ext4_sb_bread(sb, sb_block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (handle) { BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto out_bh; } es = (struct ext4_super_block *) (bh->b_data + offset); lock_buffer(bh); if (ext4_has_metadata_csum(sb) && es->s_checksum != ext4_superblock_csum(sb, es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " "superblock %llu", sb_block); unlock_buffer(bh); goto out_bh; } func(es, arg); if (ext4_has_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(sb, es); set_buffer_uptodate(bh); unlock_buffer(bh); if (handle) { err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_bh; } else { BUFFER_TRACE(bh, "marking dirty"); mark_buffer_dirty(bh); } err = sync_dirty_buffer(bh); out_bh: brelse(bh); ext4_std_error(sb, err); return (err) ? err : 1; } /* * Update primary and backup superblocks using the provided function * func and argument arg. * * Only the primary superblock and at most two backup superblock * modifications are journalled; the rest is modified without journal. * This is safe because e2fsck will re-write them if there is a problem, * and we're very unlikely to ever need more than two backups. */ static int ext4_update_superblocks_fn(struct super_block *sb, ext4_update_sb_callback func, const void *arg) { handle_t *handle; ext4_group_t ngroups; unsigned int three = 1; unsigned int five = 5; unsigned int seven = 7; int err = 0, ret, i; ext4_group_t grp, primary_grp; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * We can't update superblocks while the online resize is running */ if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "Can't modify superblock while" "performing online resize"); return -EBUSY; } /* * We're only going to update primary superblock and two * backup superblocks in this transaction. */ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out; } /* Update primary superblock */ err = ext4_update_primary_sb(sb, handle, func, arg); if (err) { ext4_msg(sb, KERN_ERR, "Failed to update primary " "superblock"); goto out_journal; } primary_grp = ext4_get_group_number(sb, sbi->s_sbh->b_blocknr); ngroups = ext4_get_groups_count(sb); /* * Update backup superblocks. We have to start from group 0 * because it might not be where the primary superblock is * if the fs is mounted with -o sb=<backup_sb_block> */ i = 0; grp = 0; while (grp < ngroups) { /* Skip primary superblock */ if (grp == primary_grp) goto next_grp; ret = ext4_update_backup_sb(sb, handle, grp, func, arg); if (ret < 0) { /* Ignore bad checksum; try to update next sb */ if (ret == -EFSBADCRC) goto next_grp; err = ret; goto out_journal; } i += ret; if (handle && i > 1) { /* * We're only journalling primary superblock and * two backup superblocks; the rest is not * journalled. */ err = ext4_journal_stop(handle); if (err) goto out; handle = NULL; } next_grp: grp = ext4_list_backups(sb, &three, &five, &seven); } out_journal: if (handle) { ret = ext4_journal_stop(handle); if (ret && !err) err = ret; } out: clear_bit_unlock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags); smp_mb__after_atomic(); return err ? err : 0; } /* * Swap memory between @a and @b for @len bytes. * * @a: pointer to first memory area * @b: pointer to second memory area * @len: number of bytes to swap * */ static void memswap(void *a, void *b, size_t len) { unsigned char *ap, *bp; ap = (unsigned char *)a; bp = (unsigned char *)b; while (len-- > 0) { swap(*ap, *bp); ap++; bp++; } } /* * Swap i_data and associated attributes between @inode1 and @inode2. * This function is used for the primary swap between inode1 and inode2 * and also to revert this primary swap in case of errors. * * Therefore you have to make sure, that calling this method twice * will revert all changes. * * @inode1: pointer to first inode * @inode2: pointer to second inode */ static void swap_inode_data(struct inode *inode1, struct inode *inode2) { loff_t isize; struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; unsigned long tmp; struct timespec64 ts1, ts2; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); ts1 = inode_get_atime(inode1); ts2 = inode_get_atime(inode2); inode_set_atime_to_ts(inode1, ts2); inode_set_atime_to_ts(inode2, ts1); ts1 = inode_get_mtime(inode1); ts2 = inode_get_mtime(inode2); inode_set_mtime_to_ts(inode1, ts2); inode_set_mtime_to_ts(inode2, ts1); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) | (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP); ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP); swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); isize = i_size_read(inode1); i_size_write(inode1, i_size_read(inode2)); i_size_write(inode2, isize); } void ext4_reset_inode_seed(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); __u32 csum; if (!ext4_has_metadata_csum(inode->i_sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); } /* * Swap the information from the given @inode and the inode * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other * important fields of the inodes. * * @sb: the super block of the filesystem * @idmap: idmap of the mount the inode was found from * @inode: the inode to swap with EXT4_BOOT_LOADER_INO * */ static long swap_inode_boot_loader(struct super_block *sb, struct mnt_idmap *idmap, struct inode *inode) { handle_t *handle; int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; qsize_t size, size_bl, diff; blkcnt_t blocks; unsigned short bytes; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL | EXT4_IGET_BAD); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); /* Protect orig inodes against a truncate and make sure, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) || ext4_has_inline_data(inode)) { err = -EINVAL; goto journal_err_out; } if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || !inode_owner_or_capable(idmap, inode) || !capable(CAP_SYS_ADMIN)) { err = -EPERM; goto journal_err_out; } filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err) goto err_out; err = filemap_write_and_wait(inode_bl->i_mapping); if (err) goto err_out; /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); truncate_inode_pages(&inode->i_data, 0); truncate_inode_pages(&inode_bl->i_data, 0); handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; goto err_out; } ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) { /* this inode has never been used as a BOOT_LOADER */ set_nlink(inode_bl, 1); i_uid_write(inode_bl, 0); i_gid_write(inode_bl, 0); inode_bl->i_flags = 0; ei_bl->i_flags = 0; inode_set_iversion(inode_bl, 1); i_size_write(inode_bl, 0); EXT4_I(inode_bl)->i_disksize = inode_bl->i_size; inode_bl->i_mode = S_IFREG; if (ext4_has_feature_extents(sb)) { ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode_bl); } else memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); } err = dquot_initialize(inode); if (err) goto err_out1; size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes; size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes; diff = size - size_bl; swap_inode_data(inode, inode_bl); inode_set_ctime_current(inode); inode_set_ctime_current(inode_bl); inode_inc_iversion(inode); inode->i_generation = get_random_u32(); inode_bl->i_generation = get_random_u32(); ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); ext4_discard_preallocations(inode, 0); err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { /* No need to update quota information. */ ext4_warning(inode->i_sb, "couldn't mark inode #%lu dirty (err %d)", inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); goto err_out1; } blocks = inode_bl->i_blocks; bytes = inode_bl->i_bytes; inode_bl->i_blocks = inode->i_blocks; inode_bl->i_bytes = inode->i_bytes; err = ext4_mark_inode_dirty(handle, inode_bl); if (err < 0) { /* No need to update quota information. */ ext4_warning(inode_bl->i_sb, "couldn't mark inode #%lu dirty (err %d)", inode_bl->i_ino, err); goto revert; } /* Bootloader inode should not be counted into quota information. */ if (diff > 0) dquot_free_space(inode, diff); else err = dquot_alloc_space(inode, -1 * diff); if (err < 0) { revert: /* Revert all changes: */ inode_bl->i_blocks = blocks; inode_bl->i_bytes = bytes; swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); ext4_mark_inode_dirty(handle, inode_bl); } err_out1: ext4_journal_stop(handle); ext4_double_up_write_data_sem(inode, inode_bl); err_out: filemap_invalidate_unlock(inode->i_mapping); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); return err; } /* * If immutable is set and we are not clearing it, we're not allowed to change * anything else in the inode. Don't error out if we're only trying to set * immutable on an immutable file. */ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid, unsigned int flags) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int oldflags = ei->i_flags; if (!(oldflags & EXT4_IMMUTABLE_FL) || !(flags & EXT4_IMMUTABLE_FL)) return 0; if ((oldflags & ~EXT4_IMMUTABLE_FL) != (flags & ~EXT4_IMMUTABLE_FL)) return -EPERM; if (ext4_has_feature_project(inode->i_sb) && __kprojid_val(ei->i_projid) != new_projid) return -EPERM; return 0; } static void ext4_dax_dontcache(struct inode *inode, unsigned int flags) { struct ext4_inode_info *ei = EXT4_I(inode); if (S_ISDIR(inode->i_mode)) return; if (test_opt2(inode->i_sb, DAX_NEVER) || test_opt(inode->i_sb, DAX_ALWAYS)) return; if ((ei->i_flags ^ flags) & EXT4_DAX_FL) d_mark_dontcache(inode); } static bool dax_compatible(struct inode *inode, unsigned int oldflags, unsigned int flags) { /* Allow the DAX flag to be changed on inline directories */ if (S_ISDIR(inode->i_mode)) { flags &= ~EXT4_INLINE_DATA_FL; oldflags &= ~EXT4_INLINE_DATA_FL; } if (flags & EXT4_DAX_FL) { if ((oldflags & EXT4_DAX_MUT_EXCL) || ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS)) { return false; } } if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL)) return false; return true; } static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) { struct ext4_inode_info *ei = EXT4_I(inode); handle_t *handle = NULL; int err = -EPERM, migrate = 0; struct ext4_iloc iloc; unsigned int oldflags, mask, i; struct super_block *sb = inode->i_sb; /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) goto flags_out; oldflags = ei->i_flags; /* * The JOURNAL_DATA flag can only be changed by * the relevant capability. */ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } if (!dax_compatible(inode, oldflags, flags)) { err = -EOPNOTSUPP; goto flags_out; } if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) { if (!ext4_has_feature_casefold(sb)) { err = -EOPNOTSUPP; goto flags_out; } if (!S_ISDIR(inode->i_mode)) { err = -ENOTDIR; goto flags_out; } if (!ext4_empty_dir(inode)) { err = -ENOTEMPTY; goto flags_out; } } /* * Wait for all pending directio and then flush all the dirty pages * for this file. The flush marks all the pages readonly, so any * subsequent attempt to write to the file (particularly mmap pages) * will come through the filesystem and fail. */ if (S_ISREG(inode->i_mode) && !IS_IMMUTABLE(inode) && (flags & EXT4_IMMUTABLE_FL)) { inode_dio_wait(inode); err = filemap_write_and_wait(inode->i_mapping); if (err) goto flags_out; } handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto flags_out; } if (IS_SYNC(inode)) ext4_handle_sync(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto flags_err; ext4_dax_dontcache(inode, flags); for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; /* These flags get special treatment later */ if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL) continue; if (mask & flags) ext4_set_inode_flag(inode, i); else ext4_clear_inode_flag(inode, i); } ext4_set_inode_flags(inode, false); inode_set_ctime_current(inode); inode_inc_iversion(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext4_journal_stop(handle); if (err) goto flags_out; if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { /* * Changes to the journaling mode can cause unsafe changes to * S_DAX if the inode is DAX */ if (IS_DAX(inode)) { err = -EBUSY; goto flags_out; } err = ext4_change_inode_journal_flag(inode, flags & EXT4_JOURNAL_DATA_FL); if (err) goto flags_out; } if (migrate) { if (flags & EXT4_EXTENTS_FL) err = ext4_ext_migrate(inode); else err = ext4_ind_migrate(inode); } flags_out: return err; } #ifdef CONFIG_QUOTA static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) { struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); int err, rc; handle_t *handle; kprojid_t kprojid; struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct dquot *transfer_to[MAXQUOTAS] = { }; if (!ext4_has_feature_project(sb)) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; else return 0; } if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) return -EOPNOTSUPP; kprojid = make_kprojid(&init_user_ns, (projid_t)projid); if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) return 0; err = -EPERM; /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) return err; err = dquot_initialize(inode); if (err) return err; err = ext4_get_inode_loc(inode, &iloc); if (err) return err; raw_inode = ext4_raw_inode(&iloc); if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { err = ext4_expand_extra_isize(inode, EXT4_SB(sb)->s_want_extra_isize, &iloc); if (err) return err; } else { brelse(iloc.bh); } handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); if (IS_ERR(handle)) return PTR_ERR(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out_stop; transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); if (!IS_ERR(transfer_to[PRJQUOTA])) { /* __dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); err = __dquot_transfer(inode, transfer_to); up_read(&EXT4_I(inode)->xattr_sem); dqput(transfer_to[PRJQUOTA]); if (err) goto out_dirty; } EXT4_I(inode)->i_projid = kprojid; inode_set_ctime_current(inode); inode_inc_iversion(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; out_stop: ext4_journal_stop(handle); return err; } #else static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; return 0; } #endif int ext4_force_shutdown(struct super_block *sb, u32 flags) { struct ext4_sb_info *sbi = EXT4_SB(sb); int ret; if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) return -EINVAL; if (ext4_forced_shutdown(sb)) return 0; ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); trace_ext4_shutdown(sb, flags); switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: ret = bdev_freeze(sb->s_bdev); if (ret) return ret; set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); bdev_thaw(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { (void) ext4_force_commit(sb); jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); } break; case EXT4_GOING_FLAGS_NOLOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); break; default: return -EINVAL; } clear_opt(sb, DISCARD); return 0; } static int ext4_ioctl_shutdown(struct super_block *sb, unsigned long arg) { u32 flags; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(flags, (__u32 __user *)arg)) return -EFAULT; return ext4_force_shutdown(sb, flags); } struct getfsmap_info { struct super_block *gi_sb; struct fsmap_head __user *gi_data; unsigned int gi_idx; __u32 gi_last_flags; }; static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv) { struct getfsmap_info *info = priv; struct fsmap fm; trace_ext4_getfsmap_mapping(info->gi_sb, xfm); info->gi_last_flags = xfm->fmr_flags; ext4_fsmap_from_internal(info->gi_sb, &fm, xfm); if (copy_to_user(&info->gi_data->fmh_recs[info->gi_idx++], &fm, sizeof(struct fsmap))) return -EFAULT; return 0; } static int ext4_ioc_getfsmap(struct super_block *sb, struct fsmap_head __user *arg) { struct getfsmap_info info = { NULL }; struct ext4_fsmap_head xhead = {0}; struct fsmap_head head; bool aborted = false; int error; if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) return -EFAULT; if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || memchr_inv(head.fmh_keys[0].fmr_reserved, 0, sizeof(head.fmh_keys[0].fmr_reserved)) || memchr_inv(head.fmh_keys[1].fmr_reserved, 0, sizeof(head.fmh_keys[1].fmr_reserved))) return -EINVAL; /* * ext4 doesn't report file extents at all, so the only valid * file offsets are the magic ones (all zeroes or all ones). */ if (head.fmh_keys[0].fmr_offset || (head.fmh_keys[1].fmr_offset != 0 && head.fmh_keys[1].fmr_offset != -1ULL)) return -EINVAL; xhead.fmh_iflags = head.fmh_iflags; xhead.fmh_count = head.fmh_count; ext4_fsmap_to_internal(sb, &xhead.fmh_keys[0], &head.fmh_keys[0]); ext4_fsmap_to_internal(sb, &xhead.fmh_keys[1], &head.fmh_keys[1]); trace_ext4_getfsmap_low_key(sb, &xhead.fmh_keys[0]); trace_ext4_getfsmap_high_key(sb, &xhead.fmh_keys[1]); info.gi_sb = sb; info.gi_data = arg; error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info); if (error == EXT4_QUERY_RANGE_ABORT) aborted = true; else if (error) return error; /* If we didn't abort, set the "last" flag in the last fmx */ if (!aborted && info.gi_idx) { info.gi_last_flags |= FMR_OF_LAST; if (copy_to_user(&info.gi_data->fmh_recs[info.gi_idx - 1].fmr_flags, &info.gi_last_flags, sizeof(info.gi_last_flags))) return -EFAULT; } /* copy back header */ head.fmh_entries = xhead.fmh_entries; head.fmh_oflags = xhead.fmh_oflags; if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) return -EFAULT; return 0; } static long ext4_ioctl_group_add(struct file *file, struct ext4_new_group_data *input) { struct super_block *sb = file_inode(file)->i_sb; int err, err2=0; err = ext4_resize_begin(sb); if (err) return err; if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); err = -EOPNOTSUPP; goto group_add_out; } err = mnt_want_write_file(file); if (err) goto group_add_out; err = ext4_group_add(sb, input); if (EXT4_SB(sb)->s_journal) { jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) err = err2; mnt_drop_write_file(file); if (!err && ext4_has_group_desc_csum(sb) && test_opt(sb, INIT_INODE_TABLE)) err = ext4_register_li_request(sb, input->group); group_add_out: err2 = ext4_resize_end(sb, false); if (err == 0) err = err2; return err; } int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); struct ext4_inode_info *ei = EXT4_I(inode); u32 flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (S_ISREG(inode->i_mode)) flags &= ~FS_PROJINHERIT_FL; fileattr_fill_flags(fa, flags); if (ext4_has_feature_project(inode->i_sb)) fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); return 0; } int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); u32 flags = fa->flags; int err = -EOPNOTSUPP; if (flags & ~EXT4_FL_USER_VISIBLE) goto out; /* * chattr(1) grabs flags via GETFLAGS, modifies the result and * passes that to SETFLAGS. So we cannot easily make SETFLAGS * more restrictive than just silently masking off visible but * not settable flags as we always did. */ flags &= EXT4_FL_USER_MODIFIABLE; if (ext4_mask_flags(inode->i_mode, flags) != flags) goto out; err = ext4_ioctl_check_immutable(inode, fa->fsx_projid, flags); if (err) goto out; err = ext4_ioctl_setflags(inode, flags); if (err) goto out; err = ext4_ioctl_setproject(inode, fa->fsx_projid); out: return err; } /* So that the fiemap access checks can't overflow on 32 bit machines. */ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) { struct fiemap fiemap; struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); int error; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; } static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) { int err = 0; __u32 flags = 0; unsigned int flush_flags = 0; struct super_block *sb = file_inode(filp)->i_sb; if (copy_from_user(&flags, (__u32 __user *)arg, sizeof(__u32))) return -EFAULT; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* check for invalid bits set */ if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) || ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) return -EINVAL; if (!EXT4_SB(sb)->s_journal) return -ENODEV; if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev)) return -EOPNOTSUPP; if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) return 0; if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD) flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD; if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) { flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT; pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow"); } jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return err; } static int ext4_ioctl_setlabel(struct file *filp, const char __user *user_label) { size_t len; int ret = 0; char new_label[EXT4_LABEL_MAX + 1]; struct super_block *sb = file_inode(filp)->i_sb; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * Copy the maximum length allowed for ext4 label with one more to * find the required terminating null byte in order to test the * label length. The on disk label doesn't need to be null terminated. */ if (copy_from_user(new_label, user_label, EXT4_LABEL_MAX + 1)) return -EFAULT; len = strnlen(new_label, EXT4_LABEL_MAX + 1); if (len > EXT4_LABEL_MAX) return -EINVAL; /* * Clear the buffer after the new label */ memset(new_label + len, 0, EXT4_LABEL_MAX - len); ret = mnt_want_write_file(filp); if (ret) return ret; ret = ext4_update_superblocks_fn(sb, ext4_sb_setlabel, new_label); mnt_drop_write_file(filp); return ret; } static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label) { char label[EXT4_LABEL_MAX + 1]; /* * EXT4_LABEL_MAX must always be smaller than FSLABEL_MAX because * FSLABEL_MAX must include terminating null byte, while s_volume_name * does not have to. */ BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); memset(label, 0, sizeof(label)); lock_buffer(sbi->s_sbh); strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX); unlock_buffer(sbi->s_sbh); if (copy_to_user(user_label, label, sizeof(label))) return -EFAULT; return 0; } static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, struct fsuuid __user *ufsuuid) { struct fsuuid fsuuid; __u8 uuid[UUID_SIZE]; if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid))) return -EFAULT; if (fsuuid.fsu_len == 0) { fsuuid.fsu_len = UUID_SIZE; if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len, sizeof(fsuuid.fsu_len))) return -EFAULT; return 0; } if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0) return -EINVAL; lock_buffer(sbi->s_sbh); memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE); unlock_buffer(sbi->s_sbh); fsuuid.fsu_len = UUID_SIZE; if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) || copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) return -EFAULT; return 0; } static int ext4_ioctl_setuuid(struct file *filp, const struct fsuuid __user *ufsuuid) { int ret = 0; struct super_block *sb = file_inode(filp)->i_sb; struct fsuuid fsuuid; __u8 uuid[UUID_SIZE]; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * If any checksums (group descriptors or metadata) are being used * then the checksum seed feature is required to change the UUID. */ if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb)) && !ext4_has_feature_csum_seed(sb)) || ext4_has_feature_stable_inodes(sb)) return -EOPNOTSUPP; if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid))) return -EFAULT; if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) return -EINVAL; if (copy_from_user(uuid, &ufsuuid->fsu_uuid[0], UUID_SIZE)) return -EFAULT; ret = mnt_want_write_file(filp); if (ret) return ret; ret = ext4_update_superblocks_fn(sb, ext4_sb_setuuid, &uuid); mnt_drop_write_file(filp); return ret; } static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct mnt_idmap *idmap = file_mnt_idmap(filp); ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { case FS_IOC_GETFSMAP: return ext4_ioc_getfsmap(sb, (void __user *)arg); case EXT4_IOC_GETVERSION: case EXT4_IOC_GETVERSION_OLD: return put_user(inode->i_generation, (int __user *) arg); case EXT4_IOC_SETVERSION: case EXT4_IOC_SETVERSION_OLD: { handle_t *handle; struct ext4_iloc iloc; __u32 generation; int err; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ext4_has_metadata_csum(inode->i_sb)) { ext4_warning(sb, "Setting inode version is not " "supported with metadata_csum enabled."); return -ENOTTY; } err = mnt_want_write_file(filp); if (err) return err; if (get_user(generation, (int __user *) arg)) { err = -EFAULT; goto setversion_out; } inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto unlock_out; } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { inode_set_ctime_current(inode); inode_inc_iversion(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } ext4_journal_stop(handle); unlock_out: inode_unlock(inode); setversion_out: mnt_drop_write_file(filp); return err; } case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; int err, err2=0; err = ext4_resize_begin(sb); if (err) return err; if (get_user(n_blocks_count, (__u32 __user *)arg)) { err = -EFAULT; goto group_extend_out; } if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); err = -EOPNOTSUPP; goto group_extend_out; } err = mnt_want_write_file(filp); if (err) goto group_extend_out; err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) err = err2; mnt_drop_write_file(filp); group_extend_out: err2 = ext4_resize_end(sb, false); if (err == 0) err = err2; return err; } case EXT4_IOC_MOVE_EXT: { struct move_extent me; struct fd donor; int err; if (!(filp->f_mode & FMODE_READ) || !(filp->f_mode & FMODE_WRITE)) return -EBADF; if (copy_from_user(&me, (struct move_extent __user *)arg, sizeof(me))) return -EFAULT; me.moved_len = 0; donor = fdget(me.donor_fd); if (!donor.file) return -EBADF; if (!(donor.file->f_mode & FMODE_WRITE)) { err = -EBADF; goto mext_out; } if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with bigalloc"); err = -EOPNOTSUPP; goto mext_out; } else if (IS_DAX(inode)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with DAX"); err = -EOPNOTSUPP; goto mext_out; } err = mnt_want_write_file(filp); if (err) goto mext_out; err = ext4_move_extents(filp, donor.file, me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write_file(filp); if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) err = -EFAULT; mext_out: fdput(donor); return err; } case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, sizeof(input))) return -EFAULT; return ext4_ioctl_group_add(filp, &input); } case EXT4_IOC_MIGRATE: { int err; if (!inode_owner_or_capable(idmap, inode)) return -EACCES; err = mnt_want_write_file(filp); if (err) return err; /* * inode_mutex prevent write and truncate on the file. * Read still goes through. We take i_data_sem in * ext4_ext_swap_inode_data before we switch the * inode format to prevent read. */ inode_lock((inode)); err = ext4_ext_migrate(inode); inode_unlock((inode)); mnt_drop_write_file(filp); return err; } case EXT4_IOC_ALLOC_DA_BLKS: { int err; if (!inode_owner_or_capable(idmap, inode)) return -EACCES; err = mnt_want_write_file(filp); if (err) return err; err = ext4_alloc_da_blocks(inode); mnt_drop_write_file(filp); return err; } case EXT4_IOC_SWAP_BOOT: { int err; if (!(filp->f_mode & FMODE_WRITE)) return -EBADF; err = mnt_want_write_file(filp); if (err) return err; err = swap_inode_boot_loader(sb, idmap, inode); mnt_drop_write_file(filp); return err; } case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, sizeof(__u64))) { return -EFAULT; } err = ext4_resize_begin(sb); if (err) return err; err = mnt_want_write_file(filp); if (err) goto resizefs_out; err = ext4_resize_fs(sb, n_blocks_count); if (EXT4_SB(sb)->s_journal) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) err = err2; mnt_drop_write_file(filp); if (!err && (o_group < EXT4_SB(sb)->s_groups_count) && ext4_has_group_desc_csum(sb) && test_opt(sb, INIT_INODE_TABLE)) err = ext4_register_li_request(sb, o_group); resizefs_out: err2 = ext4_resize_end(sb, true); if (err == 0) err = err2; return err; } case FITRIM: { struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; /* * We haven't replayed the journal, so we cannot use our * block-bitmap-guided storage zapping commands. */ if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) return -EROFS; if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; ret = ext4_trim_fs(sb, &range); if (ret < 0) return ret; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; return 0; } case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); case FS_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); case FS_IOC_GET_ENCRYPTION_PWSALT: return ext4_ioctl_get_encryption_pwsalt(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_POLICY_EX: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); case FS_IOC_ADD_ENCRYPTION_KEY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_add_key(filp, (void __user *)arg); case FS_IOC_REMOVE_ENCRYPTION_KEY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_remove_key(filp, (void __user *)arg); case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_remove_key_all_users(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_KEY_STATUS: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_NONCE: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_nonce(filp, (void __user *)arg); case EXT4_IOC_CLEAR_ES_CACHE: { if (!inode_owner_or_capable(idmap, inode)) return -EACCES; ext4_clear_inode_es(inode); return 0; } case EXT4_IOC_GETSTATE: { __u32 state = 0; if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED)) state |= EXT4_STATE_FLAG_EXT_PRECACHED; if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) state |= EXT4_STATE_FLAG_NEW; if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) state |= EXT4_STATE_FLAG_NEWENTRY; if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE; return put_user(state, (__u32 __user *) arg); } case EXT4_IOC_GET_ES_CACHE: return ext4_ioctl_get_es_cache(filp, arg); case EXT4_IOC_SHUTDOWN: return ext4_ioctl_shutdown(sb, arg); case FS_IOC_ENABLE_VERITY: if (!ext4_has_feature_verity(sb)) return -EOPNOTSUPP; return fsverity_ioctl_enable(filp, (const void __user *)arg); case FS_IOC_MEASURE_VERITY: if (!ext4_has_feature_verity(sb)) return -EOPNOTSUPP; return fsverity_ioctl_measure(filp, (void __user *)arg); case FS_IOC_READ_VERITY_METADATA: if (!ext4_has_feature_verity(sb)) return -EOPNOTSUPP; return fsverity_ioctl_read_metadata(filp, (const void __user *)arg); case EXT4_IOC_CHECKPOINT: return ext4_ioctl_checkpoint(filp, arg); case FS_IOC_GETFSLABEL: return ext4_ioctl_getlabel(EXT4_SB(sb), (void __user *)arg); case FS_IOC_SETFSLABEL: return ext4_ioctl_setlabel(filp, (const void __user *)arg); case EXT4_IOC_GETFSUUID: return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg); case EXT4_IOC_SETFSUUID: return ext4_ioctl_setuuid(filp, (const void __user *)arg); default: return -ENOTTY; } } long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { return __ext4_ioctl(filp, cmd, arg); } #ifdef CONFIG_COMPAT long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { case EXT4_IOC32_GETVERSION: cmd = EXT4_IOC_GETVERSION; break; case EXT4_IOC32_SETVERSION: cmd = EXT4_IOC_SETVERSION; break; case EXT4_IOC32_GROUP_EXTEND: cmd = EXT4_IOC_GROUP_EXTEND; break; case EXT4_IOC32_GETVERSION_OLD: cmd = EXT4_IOC_GETVERSION_OLD; break; case EXT4_IOC32_SETVERSION_OLD: cmd = EXT4_IOC_SETVERSION_OLD; break; case EXT4_IOC32_GETRSVSZ: cmd = EXT4_IOC_GETRSVSZ; break; case EXT4_IOC32_SETRSVSZ: cmd = EXT4_IOC_SETRSVSZ; break; case EXT4_IOC32_GROUP_ADD: { struct compat_ext4_new_group_input __user *uinput; struct ext4_new_group_data input; int err; uinput = compat_ptr(arg); err = get_user(input.group, &uinput->group); err |= get_user(input.block_bitmap, &uinput->block_bitmap); err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); err |= get_user(input.inode_table, &uinput->inode_table); err |= get_user(input.blocks_count, &uinput->blocks_count); err |= get_user(input.reserved_blocks, &uinput->reserved_blocks); if (err) return -EFAULT; return ext4_ioctl_group_add(file, &input); } case EXT4_IOC_MOVE_EXT: case EXT4_IOC_RESIZE_FS: case FITRIM: case EXT4_IOC_PRECACHE_EXTENTS: case FS_IOC_SET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_PWSALT: case FS_IOC_GET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_POLICY_EX: case FS_IOC_ADD_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: case FS_IOC_GET_ENCRYPTION_KEY_STATUS: case FS_IOC_GET_ENCRYPTION_NONCE: case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: case FS_IOC_READ_VERITY_METADATA: case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: case EXT4_IOC_CHECKPOINT: case FS_IOC_GETFSLABEL: case FS_IOC_SETFSLABEL: case EXT4_IOC_GETFSUUID: case EXT4_IOC_SETFSUUID: break; default: return -ENOIOCTLCMD; } return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif static void set_overhead(struct ext4_super_block *es, const void *arg) { es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); } int ext4_update_overhead(struct super_block *sb, bool force) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (sb_rdonly(sb)) return 0; if (!force && (sbi->s_overhead == 0 || sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters))) return 0; return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead); } |
3 1 1 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 | /* * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin * cleaned up code to current version of sparse and added the slicing-by-8 * algorithm to the closely similar existing slicing-by-4 algorithm. * * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com> * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks! * Code was from the public domain, copyright abandoned. Code was * subsequently included in the kernel, thus was re-licensed under the * GNU GPL v2. * * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com> * Same crc32 function was used in 5 other places in the kernel. * I made one version, and deleted the others. * There are various incantations of crc32(). Some use a seed of 0 or ~0. * Some xor at the end with ~0. The generic crc32() function takes * seed as an argument, and doesn't xor at the end. Then individual * users can do whatever they need. * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. * fs/jffs2 uses seed 0, doesn't xor with ~0. * fs/partitions/efi.c uses seed ~0, xor's with ~0. * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. */ /* see: Documentation/staging/crc32.rst for a description of algorithms */ #include <linux/crc32.h> #include <linux/crc32poly.h> #include <linux/module.h> #include <linux/types.h> #include <linux/sched.h> #include "crc32defs.h" #if CRC_LE_BITS > 8 # define tole(x) ((__force u32) cpu_to_le32(x)) #else # define tole(x) (x) #endif #if CRC_BE_BITS > 8 # define tobe(x) ((__force u32) cpu_to_be32(x)) #else # define tobe(x) (x) #endif #include "crc32table.h" MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>"); MODULE_DESCRIPTION("Various CRC32 calculations"); MODULE_LICENSE("GPL"); #if CRC_LE_BITS > 8 || CRC_BE_BITS > 8 /* implements slicing-by-4 or slicing-by-8 algorithm */ static inline u32 __pure crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN # define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8) # define DO_CRC4 (t3[(q) & 255] ^ t2[(q >> 8) & 255] ^ \ t1[(q >> 16) & 255] ^ t0[(q >> 24) & 255]) # define DO_CRC8 (t7[(q) & 255] ^ t6[(q >> 8) & 255] ^ \ t5[(q >> 16) & 255] ^ t4[(q >> 24) & 255]) # else # define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8) # define DO_CRC4 (t0[(q) & 255] ^ t1[(q >> 8) & 255] ^ \ t2[(q >> 16) & 255] ^ t3[(q >> 24) & 255]) # define DO_CRC8 (t4[(q) & 255] ^ t5[(q >> 8) & 255] ^ \ t6[(q >> 16) & 255] ^ t7[(q >> 24) & 255]) # endif const u32 *b; size_t rem_len; # ifdef CONFIG_X86 size_t i; # endif const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3]; # if CRC_LE_BITS != 32 const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; # endif u32 q; /* Align it */ if (unlikely((long)buf & 3 && len)) { do { DO_CRC(*buf++); } while ((--len) && ((long)buf)&3); } # if CRC_LE_BITS == 32 rem_len = len & 3; len = len >> 2; # else rem_len = len & 7; len = len >> 3; # endif b = (const u32 *)buf; # ifdef CONFIG_X86 --b; for (i = 0; i < len; i++) { # else for (--b; len; --len) { # endif q = crc ^ *++b; /* use pre increment for speed */ # if CRC_LE_BITS == 32 crc = DO_CRC4; # else crc = DO_CRC8; q = *++b; crc ^= DO_CRC4; # endif } len = rem_len; /* And the last few bytes */ if (len) { u8 *p = (u8 *)(b + 1) - 1; # ifdef CONFIG_X86 for (i = 0; i < len; i++) DO_CRC(*++p); /* use pre increment for speed */ # else do { DO_CRC(*++p); /* use pre increment for speed */ } while (--len); # endif } return crc; #undef DO_CRC #undef DO_CRC4 #undef DO_CRC8 } #endif /** * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II * CRC32/CRC32C * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for other * uses, or the previous crc32/crc32c value if computing incrementally. * @p: pointer to buffer over which CRC32/CRC32C is run * @len: length of buffer @p * @tab: little-endian Ethernet table * @polynomial: CRC32/CRC32c LE polynomial */ static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, size_t len, const u32 (*tab)[256], u32 polynomial) { #if CRC_LE_BITS == 1 int i; while (len--) { crc ^= *p++; for (i = 0; i < 8; i++) crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0); } # elif CRC_LE_BITS == 2 while (len--) { crc ^= *p++; crc = (crc >> 2) ^ tab[0][crc & 3]; crc = (crc >> 2) ^ tab[0][crc & 3]; crc = (crc >> 2) ^ tab[0][crc & 3]; crc = (crc >> 2) ^ tab[0][crc & 3]; } # elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; crc = (crc >> 4) ^ tab[0][crc & 15]; crc = (crc >> 4) ^ tab[0][crc & 15]; } # elif CRC_LE_BITS == 8 /* aka Sarwate algorithm */ while (len--) { crc ^= *p++; crc = (crc >> 8) ^ tab[0][crc & 255]; } # else crc = (__force u32) __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); crc = __le32_to_cpu((__force __le32)crc); #endif return crc; } #if CRC_LE_BITS == 1 u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, NULL, CRC32_POLY_LE); } u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, NULL, CRC32C_POLY_LE); } #else u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, crc32table_le, CRC32_POLY_LE); } u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE); } #endif EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(__crc32c_le); u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le); u32 __pure __crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le); u32 __pure crc32_be_base(u32, unsigned char const *, size_t) __alias(crc32_be); /* * This multiplies the polynomials x and y modulo the given modulus. * This follows the "little-endian" CRC convention that the lsbit * represents the highest power of x, and the msbit represents x^0. */ static u32 __attribute_const__ gf2_multiply(u32 x, u32 y, u32 modulus) { u32 product = x & 1 ? y : 0; int i; for (i = 0; i < 31; i++) { product = (product >> 1) ^ (product & 1 ? modulus : 0); x >>= 1; product ^= x & 1 ? y : 0; } return product; } /** * crc32_generic_shift - Append @len 0 bytes to crc, in logarithmic time * @crc: The original little-endian CRC (i.e. lsbit is x^31 coefficient) * @len: The number of bytes. @crc is multiplied by x^(8*@len) * @polynomial: The modulus used to reduce the result to 32 bits. * * It's possible to parallelize CRC computations by computing a CRC * over separate ranges of a buffer, then summing them. * This shifts the given CRC by 8*len bits (i.e. produces the same effect * as appending len bytes of zero to the data), in time proportional * to log(len). */ static u32 __attribute_const__ crc32_generic_shift(u32 crc, size_t len, u32 polynomial) { u32 power = polynomial; /* CRC of x^32 */ int i; /* Shift up to 32 bits in the simple linear way */ for (i = 0; i < 8 * (int)(len & 3); i++) crc = (crc >> 1) ^ (crc & 1 ? polynomial : 0); len >>= 2; if (!len) return crc; for (;;) { /* "power" is x^(2^i), modulo the polynomial */ if (len & 1) crc = gf2_multiply(crc, power, polynomial); len >>= 1; if (!len) break; /* Square power, advancing to x^(2^(i+1)) */ power = gf2_multiply(power, power, polynomial); } return crc; } u32 __attribute_const__ crc32_le_shift(u32 crc, size_t len) { return crc32_generic_shift(crc, len, CRC32_POLY_LE); } u32 __attribute_const__ __crc32c_le_shift(u32 crc, size_t len) { return crc32_generic_shift(crc, len, CRC32C_POLY_LE); } EXPORT_SYMBOL(crc32_le_shift); EXPORT_SYMBOL(__crc32c_le_shift); /** * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for * other uses, or the previous crc32 value if computing incrementally. * @p: pointer to buffer over which CRC32 is run * @len: length of buffer @p * @tab: big-endian Ethernet table * @polynomial: CRC32 BE polynomial */ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, size_t len, const u32 (*tab)[256], u32 polynomial) { #if CRC_BE_BITS == 1 int i; while (len--) { crc ^= *p++ << 24; for (i = 0; i < 8; i++) crc = (crc << 1) ^ ((crc & 0x80000000) ? polynomial : 0); } # elif CRC_BE_BITS == 2 while (len--) { crc ^= *p++ << 24; crc = (crc << 2) ^ tab[0][crc >> 30]; crc = (crc << 2) ^ tab[0][crc >> 30]; crc = (crc << 2) ^ tab[0][crc >> 30]; crc = (crc << 2) ^ tab[0][crc >> 30]; } # elif CRC_BE_BITS == 4 while (len--) { crc ^= *p++ << 24; crc = (crc << 4) ^ tab[0][crc >> 28]; crc = (crc << 4) ^ tab[0][crc >> 28]; } # elif CRC_BE_BITS == 8 while (len--) { crc ^= *p++ << 24; crc = (crc << 8) ^ tab[0][crc >> 24]; } # else crc = (__force u32) __cpu_to_be32(crc); crc = crc32_body(crc, p, len, tab); crc = __be32_to_cpu((__force __be32)crc); # endif return crc; } #if CRC_BE_BITS == 1 u32 __pure __weak crc32_be(u32 crc, unsigned char const *p, size_t len) { return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE); } #else u32 __pure __weak crc32_be(u32 crc, unsigned char const *p, size_t len) { return crc32_be_generic(crc, p, len, crc32table_be, CRC32_POLY_BE); } #endif EXPORT_SYMBOL(crc32_be); |
716 714 459 459 458 285 320 320 320 46 51 2 300 300 300 347 346 346 347 344 166 5 11 21 20 13 21 20 21 7 7 7 7 19 11 5 5 21 21 17 7 21 19 19 21 21 21 21 21 21 15 21 21 21 293 293 292 43 1 1 9 293 293 265 18 261 15 12 229 94 280 32 8 8 24 42 40 40 14 36 40 30 11 2 2 2 310 311 310 103 103 311 310 310 168 169 142 41 169 311 284 16 284 210 251 252 16 247 9 33 33 1 1 210 210 219 219 63 270 270 261 261 262 261 55 262 262 261 45 9 1 9 4 9 4 9 9 9 257 2 339 338 339 279 293 25 25 2 293 289 22 8 8 5 3 15 17 17 1 16 269 269 38 6 5 6 5 5 5 11 292 43 43 6 5 770 770 769 570 704 770 771 771 770 771 771 770 771 7 770 355 355 770 769 769 771 38 38 38 38 38 38 15 14 13 3 38 38 4 4 4 2 38 153 153 153 153 153 6 1 5 153 168 11 169 169 169 169 169 169 169 169 169 169 169 83 165 148 148 168 165 2 2 165 165 129 128 128 165 165 165 145 145 143 145 144 85 80 165 165 165 302 302 170 170 11 302 302 300 277 277 276 265 302 169 164 301 302 263 302 301 300 298 302 302 302 302 302 2 257 719 720 720 719 717 73 717 718 718 717 720 716 718 720 719 718 719 718 718 720 719 73 73 719 720 710 713 717 716 699 219 220 676 699 698 678 677 276 677 2 2 677 699 699 699 698 619 220 238 18 18 18 237 238 237 238 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. */ static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) static void fib6_walker_link(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_add(&w->lh, &net->ipv6.fib6_walkers); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old = atomic_read(&net->ipv6.fib6_sernum); do { new = old < INT_MAX ? old + 1 : 1; } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although it will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. */ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; size_t sz = sizeof(*f6i); if (with_fib6_nh) sz += sizeof(struct fib6_nh); f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); return f6i; } void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); WARN_ON(f6i->fib6_node); if (f6i->nh) nexthop_put(f6i->nh); else fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); if (fn) net->ipv6.rt6_stats->fib_nodes++; return fn; } static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); net->ipv6.rt6_stats->fib_nodes--; } static void node_free_rcu(struct rcu_head *head) { struct fib6_node *fn = container_of(head, struct fib6_node, rcu); kmem_cache_free(fib6_node_kmem, fn); } static void node_free(struct net *net, struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); net->ipv6.rt6_stats->fib_nodes--; } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. */ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb; if (id == 0) id = RT6_TABLE_MAIN; tb = fib6_get_table(net, id); if (tb) return tb; tb = fib6_alloc_table(net, id); if (tb) fib6_link_table(net, tb); return tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct fib6_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT6_TABLE_MAIN; h = id & (FIB6_TABLE_HASHSZ - 1); rcu_read_lock(); head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (tb->tb6_id == id) { rcu_read_unlock(); return tb; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } return &rt->dst; } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif unsigned int fib6_tables_seq_read(struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += tb->fib_seq; } rcu_read_unlock(); return fib_seq; } static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; return call_fib6_notifier(nb, event_type, &info.info); } static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, .nsiblings = rt->fib6_nsiblings, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { struct net *net; struct notifier_block *nb; struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; if (rt->fib6_nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, rt->fib6_nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, arg->extack); return err; } static int fib6_node_dump(struct fib6_walker *w) { int err; err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } static int fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { int err; w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); return err; } /* Called with rcu_read_lock() */ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_node_dump; arg.net = net; arg.nb = nb; arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); if (err) goto out; } } out: kfree(w); /* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) { int res; struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; /* We'll restart from this node, so if some routes were * already dumped, skip them next time. */ w->skip_in_node += res; return 1; } w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again) */ if (rt->fib6_nsiblings) rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); } w->leaf = NULL; return 0; } static void fib6_dump_end(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { w->count = 0; w->skip = 0; w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { int sernum = READ_ONCE(w->root->fn_sernum); if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; w->skip_in_node = 0; } else w->skip = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib6_walker *w; struct fib6_table *tb; struct hlist_head *head; int res = 0; if (cb->strict_check) { int err; err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) return err; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); if (rtm->rtm_flags & RTM_F_PREFIX) arg.filter.flags = RTM_F_PREFIX; } w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. hook callback destructor. */ cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; /* * 2. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_dump_node; cb->args[2] = (long)w; } arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) goto out; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); return -ENOENT; } if (!cb->args[0]) { res = fib6_dump_table(tb, skb, cb); if (!res) cb->args[0] = 1; } goto out; } s_h = cb->args[0]; s_e = cb->args[1]; rcu_read_lock(); for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; res = fib6_dump_table(tb, skb, cb); if (res != 0) goto out_unlock; next: e++; } } out_unlock: rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; out: res = res < 0 ? res : skb->len; if (res <= 0) fib6_dump_end(cb); return res; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; if (f6i->fib6_metrics == &dst_default_metrics) { struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return; refcount_set(&p->refcnt, 1); f6i->fib6_metrics = p; } f6i->fib6_metrics->metrics[metric - 1] = val; } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. */ static struct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; RT6_TRACE("fib6_add_1\n"); /* insert node in tree */ fn = root; do { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match ? */ if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } return fn; } /* * We have more bits to go */ /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)) : rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4 */ if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of tree. * Create new leaf node without children. */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); return ln; insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current */ pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { in = node_alloc(net); ln = node_alloc(net); if (!in || !ln) { if (in) node_free_immediate(net, in); if (ln) node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node. * RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch */ in->fn_bit = bit; RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; fib6_info_hold(rcu_dereference_protected(in->leaf, lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) rcu_assign_pointer(pn->right, in); else rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, in); rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { rcu_assign_pointer(in->right, ln); rcu_assign_pointer(in->left, fn); } else { rcu_assign_pointer(in->left, ln); rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) RCU_INIT_POINTER(ln->right, fn); else RCU_INIT_POINTER(ln->left, fn); rcu_assign_pointer(fn->parent, ln); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); } return ln; } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, const struct fib6_info *match, const struct fib6_table *table) { int cpu; if (!fib6_nh->rt6i_pcpu) return; /* release the reference to this fib entry from * all of its cached pcpu routes */ for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another */ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); fib6_info_release(from); } } } struct fib6_nh_pcpu_arg { struct fib6_info *from; const struct fib6_table *table; }; static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { struct fib6_nh_pcpu_arg *arg = _arg; __fib6_drop_pcpu_from(nh, arg->from, arg->table); return 0; } static void fib6_drop_pcpu_from(struct fib6_info *f6i, const struct fib6_table *table) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. */ f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { struct fib6_nh_pcpu_arg arg = { .from = f6i, .table = table }; nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i, table); } } static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones. */ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } } } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; for (iter = leaf; iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } fallback_ins = fallback_ins ?: ins; goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) rt->fib6_nsiblings = 0; if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) fib6_clean_expires(iter); else fib6_set_expires(iter, rt->expires); if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) rt->fib6_nsiblings++; } if (iter->fib6_metric > rt->fib6_metric) break; next_iter: ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ if (rt->fib6_nsiblings) { unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail(&rt->fib6_siblings, &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rt6_multipath_rebalance(temp_sibling); } /* * insert node */ if (!replace) { if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; /* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node. */ if (!info->skip_notify_kernel && (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, fib_event, rt, extack); if (err) { struct fib6_info *sibling, *next_sibling; /* If the route has siblings, then it first * needs to be unlinked from them. */ if (!rt->fib6_nsiblings) return err; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); return err; } } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } } return 0; } static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } } void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } /* allow ipv4 to update sernum via ipv6_stub */ void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum_upto_root(net, f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; int replace_required = 0; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, &rt->fib6_dst.addr, rt->fib6_dst.plen, offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } pn = fn; #ifdef CONFIG_IPV6_SUBTREES if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* * Create subtree. * * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(info->nl_net); if (!sfn) goto failure; fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. */ node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } fn = sn; } #endif err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); fib6_start_gc(info->nl_net, rt); } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ if (pn != fn) { struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); if (!pn_leaf) pn_leaf = info->nl_net->ipv6.fib6_null_entry; fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } #endif goto failure; } else if (fib6_requires_src(rt)) { fib6_routes_require_src_inc(info->nl_net); } return err; failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it in both subtree creation failure and fib6_add_rt2node() * failure case. * 2. fn is the root node in the table and we fail to add the first * default route to it. */ if (fn && (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend on a tree */ fn = root; for (;;) { struct fib6_node *next; dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ? rcu_dereference(fn->right) : rcu_dereference(fn->left); if (next) { fn = next; continue; } break; } while (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) goto backtrack; key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; sfn = fib6_node_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } backtrack: if (fn->fn_flags & RTN_ROOT) break; fn = rcu_dereference(fn->parent); } return NULL; } /* called with rcu_read_lock() held */ struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset, bool exact_match) { struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next; } key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out; if (plen == fn->fn_bit) return fn; if (fn->fn_flags & RTN_RTINFO) prev = fn; next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) fn = rcu_dereference(fn->right); else fn = rcu_dereference(fn->left); } out: if (exact_match) return NULL; else return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct fib6_info, fib6_src), exact_match); } } } #endif if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; } /* * Deletion * */ static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); child_right = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf, lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf, lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0; /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *fn_l = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_r = rcu_dereference_protected(pn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); WARN_ON(fn_leaf); children = 0; child = NULL; if (fn_r) { child = fn_r; children |= 1; } if (fn_l) { child = fn_l; children |= 2; } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); new_fn_leaf = net->ipv6.fib6_null_entry; } #endif fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn_r == fn) rcu_assign_pointer(pn->right, child); else if (pn_l == fn) rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { if (w->node == fn) { w->node = child; if (children&2) { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } } read_unlock(&net->ipv6.fib6_walker_lock); node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; bool notify_del = false; RT6_TRACE("fib6_del_route\n"); /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. */ leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next)) replace_rt = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); else notify_del = true; } /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; /* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification. */ if (rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf)) notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); /* If it was last route, call fib6_repair_tree() to: * 1. For root node, put back null_entry as how the table was created. * 2. For other nodes, expunge its radix tree node. */ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; } fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); if (!info->skip_notify_kernel) { if (notify_del) call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); fib6_info_release(rt); } /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn; if (rt == net->ipv6.fib6_null_entry) return -ENOENT; table = rt->fib6_table; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); /* * Walk the leaf entries looking for ourself */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur)) fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } rtp_next = &cur->fib6_next; } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; fallthrough; #endif case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; fallthrough; case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; goto skip; } err = w->func(w); if (err) return err; w->count++; continue; } skip: w->state = FWS_U; fallthrough; case FWS_U: if (fn == w->root) return 0; pn = rcu_dereference_protected(fn->parent, 1); left = rcu_dereference_protected(pn->left, 1); right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (left == fn) { w->state = FWS_R; continue; } if (right == fn) { w->state = FWS_C; w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) fib6_walker_unlink(net, w); return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && READ_ONCE(w->node->fn_sernum) != c->sernum) WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); w->leaf = NULL; return 0; } for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { if (WARN_ON(!rt->fib6_nsiblings)) continue; rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. * 0 -> continue walking */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ static int fib6_age(struct fib6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. */ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { RT6_TRACE("expiring %p\n", rt); return -1; } gc_args->more++; } /* Also age clones in the exception table. * Note, that clones are aged out * only if they are not in use now. */ rt6_age_exceptions(rt, gc_args, now); return 0; } void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; unsigned long now; if (force) { spin_lock_bh(&net->ipv6.fib6_gc_lock); } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } gc_args.timeout = expires ? (int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; fib6_clean_all(net, fib6_age, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else del_timer(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; int err; err = fib6_notifier_init(net); if (err) return err; /* Default to 3-tuple */ net->ipv6.sysctl.multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); #endif fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_notifier: fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; del_timer_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!fib6_node_kmem) goto out; ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, 0); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; if (rt->nh) fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; if (!iter->skip) return 1; do { iter->w.leaf = rcu_dereference_protected( iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { int sernum = READ_ONCE(iter->w.root->fn_sernum); if (iter->sernum != sernum) { iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; ++(*pos); if (!v) goto iter_table; n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; iter_table: ipv6_route_check_sernum(iter); spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); return NULL; } fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; ipv6_route_seq_setup_walk(iter, net); goto iter_table; } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { loff_t p = 0; ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__ipv6_route ctx; ctx.meta = meta; ctx.rt = v; return bpf_iter_run_prog(prog, &ctx); } static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct ipv6_route_iter *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return ipv6_route_native_seq_show(seq, v); ret = ipv6_route_prog_seq_show(prog, &meta, v); iter->w.leaf = NULL; return ret; } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)ipv6_route_prog_seq_show(prog, &meta, v); } ipv6_route_native_seq_stop(seq, v); } #else static int ipv6_route_seq_show(struct seq_file *seq, void *v) { return ipv6_route_native_seq_show(seq, v); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { ipv6_route_native_seq_stop(seq, v); } #endif const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; #endif /* CONFIG_PROC_FS */ |
272 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 | /* SPDX-License-Identifier: GPL-2.0 */ /* linux/net/inet/arp.h */ #ifndef _ARP_H #define _ARP_H #include <linux/if_arp.h> #include <linux/hash.h> #include <net/neighbour.h> extern struct neigh_table arp_tbl; static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd) { u32 key = *(const u32 *)pkey; u32 val = key ^ hash32_ptr(dev); return val * hash_rnd[0]; } #ifdef CONFIG_INET static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) key = INADDR_ANY; return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } #else static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { return NULL; } #endif static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); neigh_confirm(n); rcu_read_unlock(); } void arp_init(void); int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg); void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); int arp_invalidate(struct net_device *dev, __be32 ip, bool force); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw); void arp_xmit(struct sk_buff *skb); #endif /* _ARP_H */ |
830 830 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_FLOW_DISSECTOR_H #define _NET_FLOW_DISSECTOR_H #include <linux/types.h> #include <linux/in6.h> #include <linux/siphash.h> #include <linux/string.h> #include <uapi/linux/if_ether.h> struct bpf_prog; struct net; struct sk_buff; /** * struct flow_dissector_key_control: * @thoff: Transport header offset * @addr_type: Type of key. One of FLOW_DISSECTOR_KEY_* * @flags: Key flags. Any of FLOW_DIS_(IS_FRAGMENT|FIRST_FRAGENCAPSULATION) */ struct flow_dissector_key_control { u16 thoff; u16 addr_type; u32 flags; }; #define FLOW_DIS_IS_FRAGMENT BIT(0) #define FLOW_DIS_FIRST_FRAG BIT(1) #define FLOW_DIS_ENCAPSULATION BIT(2) enum flow_dissect_ret { FLOW_DISSECT_RET_OUT_GOOD, FLOW_DISSECT_RET_OUT_BAD, FLOW_DISSECT_RET_PROTO_AGAIN, FLOW_DISSECT_RET_IPPROTO_AGAIN, FLOW_DISSECT_RET_CONTINUE, }; /** * struct flow_dissector_key_basic: * @n_proto: Network header protocol (eg. IPv4/IPv6) * @ip_proto: Transport header protocol (eg. TCP/UDP) * @padding: Unused */ struct flow_dissector_key_basic { __be16 n_proto; u8 ip_proto; u8 padding; }; struct flow_dissector_key_tags { u32 flow_label; }; struct flow_dissector_key_vlan { union { struct { u16 vlan_id:12, vlan_dei:1, vlan_priority:3; }; __be16 vlan_tci; }; __be16 vlan_tpid; __be16 vlan_eth_type; u16 padding; }; struct flow_dissector_mpls_lse { u32 mpls_ttl:8, mpls_bos:1, mpls_tc:3, mpls_label:20; }; #define FLOW_DIS_MPLS_MAX 7 struct flow_dissector_key_mpls { struct flow_dissector_mpls_lse ls[FLOW_DIS_MPLS_MAX]; /* Label Stack */ u8 used_lses; /* One bit set for each Label Stack Entry in use */ }; static inline void dissector_set_mpls_lse(struct flow_dissector_key_mpls *mpls, int lse_index) { mpls->used_lses |= 1 << lse_index; } #define FLOW_DIS_TUN_OPTS_MAX 255 /** * struct flow_dissector_key_enc_opts: * @data: tunnel option data * @len: length of tunnel option data * @dst_opt_type: tunnel option type */ struct flow_dissector_key_enc_opts { u8 data[FLOW_DIS_TUN_OPTS_MAX]; /* Using IP_TUNNEL_OPTS_MAX is desired * here but seems difficult to #include */ u8 len; __be16 dst_opt_type; }; struct flow_dissector_key_keyid { __be32 keyid; }; /** * struct flow_dissector_key_ipv4_addrs: * @src: source ip address * @dst: destination ip address */ struct flow_dissector_key_ipv4_addrs { /* (src,dst) must be grouped, in the same way than in IP header */ __be32 src; __be32 dst; }; /** * struct flow_dissector_key_ipv6_addrs: * @src: source ip address * @dst: destination ip address */ struct flow_dissector_key_ipv6_addrs { /* (src,dst) must be grouped, in the same way than in IP header */ struct in6_addr src; struct in6_addr dst; }; /** * struct flow_dissector_key_tipc: * @key: source node address combined with selector */ struct flow_dissector_key_tipc { __be32 key; }; /** * struct flow_dissector_key_addrs: * @v4addrs: IPv4 addresses * @v6addrs: IPv6 addresses * @tipckey: TIPC key */ struct flow_dissector_key_addrs { union { struct flow_dissector_key_ipv4_addrs v4addrs; struct flow_dissector_key_ipv6_addrs v6addrs; struct flow_dissector_key_tipc tipckey; }; }; /** * struct flow_dissector_key_arp: * @sip: Sender IP address * @tip: Target IP address * @op: Operation * @sha: Sender hardware address * @tha: Target hardware address */ struct flow_dissector_key_arp { __u32 sip; __u32 tip; __u8 op; unsigned char sha[ETH_ALEN]; unsigned char tha[ETH_ALEN]; }; /** * struct flow_dissector_key_ports: * @ports: port numbers of Transport header * @src: source port number * @dst: destination port number */ struct flow_dissector_key_ports { union { __be32 ports; struct { __be16 src; __be16 dst; }; }; }; /** * struct flow_dissector_key_ports_range * @tp: port number from packet * @tp_min: min port number in range * @tp_max: max port number in range */ struct flow_dissector_key_ports_range { union { struct flow_dissector_key_ports tp; struct { struct flow_dissector_key_ports tp_min; struct flow_dissector_key_ports tp_max; }; }; }; /** * struct flow_dissector_key_icmp: * @type: ICMP type * @code: ICMP code * @id: Session identifier */ struct flow_dissector_key_icmp { struct { u8 type; u8 code; }; u16 id; }; /** * struct flow_dissector_key_eth_addrs: * @src: source Ethernet address * @dst: destination Ethernet address */ struct flow_dissector_key_eth_addrs { /* (dst,src) must be grouped, in the same way than in ETH header */ unsigned char dst[ETH_ALEN]; unsigned char src[ETH_ALEN]; }; /** * struct flow_dissector_key_tcp: * @flags: flags */ struct flow_dissector_key_tcp { __be16 flags; }; /** * struct flow_dissector_key_ip: * @tos: tos * @ttl: ttl */ struct flow_dissector_key_ip { __u8 tos; __u8 ttl; }; /** * struct flow_dissector_key_meta: * @ingress_ifindex: ingress ifindex * @ingress_iftype: ingress interface type * @l2_miss: packet did not match an L2 entry during forwarding */ struct flow_dissector_key_meta { int ingress_ifindex; u16 ingress_iftype; u8 l2_miss; }; /** * struct flow_dissector_key_ct: * @ct_state: conntrack state after converting with map * @ct_mark: conttrack mark * @ct_zone: conntrack zone * @ct_labels: conntrack labels */ struct flow_dissector_key_ct { u16 ct_state; u16 ct_zone; u32 ct_mark; u32 ct_labels[4]; }; /** * struct flow_dissector_key_hash: * @hash: hash value */ struct flow_dissector_key_hash { u32 hash; }; /** * struct flow_dissector_key_num_of_vlans: * @num_of_vlans: num_of_vlans value */ struct flow_dissector_key_num_of_vlans { u8 num_of_vlans; }; /** * struct flow_dissector_key_pppoe: * @session_id: pppoe session id * @ppp_proto: ppp protocol * @type: pppoe eth type */ struct flow_dissector_key_pppoe { __be16 session_id; __be16 ppp_proto; __be16 type; }; /** * struct flow_dissector_key_l2tpv3: * @session_id: identifier for a l2tp session */ struct flow_dissector_key_l2tpv3 { __be32 session_id; }; /** * struct flow_dissector_key_ipsec: * @spi: identifier for a ipsec connection */ struct flow_dissector_key_ipsec { __be32 spi; }; /** * struct flow_dissector_key_cfm * @mdl_ver: maintenance domain level (mdl) and cfm protocol version * @opcode: code specifying a type of cfm protocol packet * * See 802.1ag, ITU-T G.8013/Y.1731 * 1 2 * |7 6 5 4 3 2 1 0|7 6 5 4 3 2 1 0| * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | mdl | version | opcode | * +-----+---------+-+-+-+-+-+-+-+-+ */ struct flow_dissector_key_cfm { u8 mdl_ver; u8 opcode; }; #define FLOW_DIS_CFM_MDL_MASK GENMASK(7, 5) #define FLOW_DIS_CFM_MDL_MAX 7 enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_PORTS_RANGE, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */ FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */ FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_vlan */ FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_tags */ FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */ FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */ FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_vlan */ FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */ FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */ FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */ FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */ FLOW_DISSECTOR_KEY_NUM_OF_VLANS, /* struct flow_dissector_key_num_of_vlans */ FLOW_DISSECTOR_KEY_PPPOE, /* struct flow_dissector_key_pppoe */ FLOW_DISSECTOR_KEY_L2TPV3, /* struct flow_dissector_key_l2tpv3 */ FLOW_DISSECTOR_KEY_CFM, /* struct flow_dissector_key_cfm */ FLOW_DISSECTOR_KEY_IPSEC, /* struct flow_dissector_key_ipsec */ FLOW_DISSECTOR_KEY_MAX, }; #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) #define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(1) #define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(2) #define FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP BIT(3) struct flow_dissector_key { enum flow_dissector_key_id key_id; size_t offset; /* offset of struct flow_dissector_key_* in target the struct */ }; struct flow_dissector { unsigned long long used_keys; /* each bit represents presence of one key id */ unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; struct flow_keys_basic { struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; }; struct flow_keys { struct flow_dissector_key_control control; #define FLOW_KEYS_HASH_START_FIELD basic struct flow_dissector_key_basic basic __aligned(SIPHASH_ALIGNMENT); struct flow_dissector_key_tags tags; struct flow_dissector_key_vlan vlan; struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; struct flow_dissector_key_icmp icmp; /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; #define FLOW_KEYS_HASH_OFFSET \ offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD) __be32 flow_get_u32_src(const struct flow_keys *flow); __be32 flow_get_u32_dst(const struct flow_keys *flow); extern struct flow_dissector flow_keys_dissector; extern struct flow_dissector flow_keys_basic_dissector; /* struct flow_keys_digest: * * This structure is used to hold a digest of the full flow keys. This is a * larger "hash" of a flow to allow definitively matching specific flows where * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so * that it can be used in CB of skb (see sch_choke for an example). */ #define FLOW_KEYS_DIGEST_LEN 16 struct flow_keys_digest { u8 data[FLOW_KEYS_DIGEST_LEN]; }; void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow); static inline bool flow_keys_have_l4(const struct flow_keys *keys) { return (keys->ports.ports || keys->tags.flow_label); } u32 flow_hash_from_keys(struct flow_keys *keys); void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, const void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { return flow_dissector->used_keys & (1ULL << key_id); } static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id, void *target_container) { return ((char *)target_container) + flow_dissector->offset[key_id]; } struct bpf_flow_dissector { struct bpf_flow_keys *flow_keys; const struct sk_buff *skb; const void *data; const void *data_end; }; static inline void flow_dissector_init_keys(struct flow_dissector_key_control *key_control, struct flow_dissector_key_basic *key_basic) { memset(key_control, 0, sizeof(*key_control)); memset(key_basic, 0, sizeof(*key_basic)); } #ifdef CONFIG_BPF_SYSCALL int flow_dissector_bpf_prog_attach_check(struct net *net, struct bpf_prog *prog); #endif /* CONFIG_BPF_SYSCALL */ #endif |
4 2 2 1 1 1 4 3 5 1 5 22 18 22 16 14 5 9 2 2 9 1 1 1 3 3 2 5 4 2 2 5 3 2 3 20 20 20 14 1 14 14 4 13 3 1 1 9 9 2 1 1 1 1 1 2 2 2 1 2 1 1 1 1 4 4 1 4 4 4 3 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2023 Isovalent */ #include <linux/bpf.h> #include <linux/bpf_mprog.h> static int bpf_mprog_link(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { struct bpf_link *link = ERR_PTR(-EINVAL); bool id = flags & BPF_F_ID; if (id) link = bpf_link_by_id(id_or_fd); else if (id_or_fd) link = bpf_link_get_from_fd(id_or_fd); if (IS_ERR(link)) return PTR_ERR(link); if (type && link->prog->type != type) { bpf_link_put(link); return -EINVAL; } tuple->link = link; tuple->prog = link->prog; return 0; } static int bpf_mprog_prog(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { struct bpf_prog *prog = ERR_PTR(-EINVAL); bool id = flags & BPF_F_ID; if (id) prog = bpf_prog_by_id(id_or_fd); else if (id_or_fd) prog = bpf_prog_get(id_or_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if (type && prog->type != type) { bpf_prog_put(prog); return -EINVAL; } tuple->link = NULL; tuple->prog = prog; return 0; } static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { bool link = flags & BPF_F_LINK; bool id = flags & BPF_F_ID; memset(tuple, 0, sizeof(*tuple)); if (link) return bpf_mprog_link(tuple, id_or_fd, flags, type); /* If no relevant flag is set and no id_or_fd was passed, then * tuple link/prog is just NULLed. This is the case when before/ * after selects first/last position without passing fd. */ if (!id && !id_or_fd) return 0; return bpf_mprog_prog(tuple, id_or_fd, flags, type); } static void bpf_mprog_tuple_put(struct bpf_tuple *tuple) { if (tuple->link) bpf_link_put(tuple->link); else if (tuple->prog) bpf_prog_put(tuple->prog); } /* The bpf_mprog_{replace,delete}() operate on exact idx position with the * one exception that for deletion we support delete from front/back. In * case of front idx is -1, in case of back idx is bpf_mprog_total(entry). * Adjustment to first and last entry is trivial. The bpf_mprog_insert() * we have to deal with the following cases: * * idx + before: * * Insert P4 before P3: idx for old array is 1, idx for new array is 2, * hence we adjust target idx for the new array, so that memmove copies * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting * before P1 would have old idx -1 and new idx 0. * * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * * idx + after: * * Insert P4 after P2: idx for old array is 2, idx for new array is 2. * Again, memmove copies P1 and P2 to the new entry, and we insert P4 * into idx 2. Inserting after P3 would have both old/new idx at 4 aka * bpf_mprog_total(entry). * * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| * +--+--+--+ +--+--+--+--+ +--+--+--+--+ */ static int bpf_mprog_replace(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *ntuple, int idx) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; struct bpf_prog *oprog; bpf_mprog_read(entry, idx, &fp, &cp); oprog = READ_ONCE(fp->prog); bpf_mprog_write(fp, cp, ntuple); if (!ntuple->link) { WARN_ON_ONCE(cp->link); bpf_prog_put(oprog); } *entry_new = entry; return 0; } static int bpf_mprog_insert(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *ntuple, int idx, u32 flags) { int total = bpf_mprog_total(entry); struct bpf_mprog_entry *peer; struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; peer = bpf_mprog_peer(entry); bpf_mprog_entry_copy(peer, entry); if (idx == total) goto insert; else if (flags & BPF_F_BEFORE) idx += 1; bpf_mprog_entry_grow(peer, idx); insert: bpf_mprog_read(peer, idx, &fp, &cp); bpf_mprog_write(fp, cp, ntuple); bpf_mprog_inc(peer); *entry_new = peer; return 0; } static int bpf_mprog_delete(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *dtuple, int idx) { int total = bpf_mprog_total(entry); struct bpf_mprog_entry *peer; peer = bpf_mprog_peer(entry); bpf_mprog_entry_copy(peer, entry); if (idx == -1) idx = 0; else if (idx == total) idx = total - 1; bpf_mprog_entry_shrink(peer, idx); bpf_mprog_dec(peer); bpf_mprog_mark_for_release(peer, dtuple); *entry_new = peer; return 0; } /* In bpf_mprog_pos_*() we evaluate the target position for the BPF * program/link that needs to be replaced, inserted or deleted for * each "rule" independently. If all rules agree on that position * or existing element, then enact replacement, addition or deletion. * If this is not the case, then the request cannot be satisfied and * we bail out with an error. */ static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog)) return tuple->link == cp->link ? i : -EBUSY; } return -ENOENT; } static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog) && (!tuple->link || tuple->link == cp->link)) return i - 1; } return tuple->prog ? -ENOENT : -1; } static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog) && (!tuple->link || tuple->link == cp->link)) return i + 1; } return tuple->prog ? -ENOENT : bpf_mprog_total(entry); } int bpf_mprog_attach(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_prog *prog_new, struct bpf_link *link, struct bpf_prog *prog_old, u32 flags, u32 id_or_fd, u64 revision) { struct bpf_tuple rtuple, ntuple = { .prog = prog_new, .link = link, }, otuple = { .prog = prog_old, .link = link, }; int ret, idx = -ERANGE, tidx; if (revision && revision != bpf_mprog_revision(entry)) return -ESTALE; if (bpf_mprog_exists(entry, prog_new)) return -EEXIST; ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags & ~BPF_F_REPLACE, prog_new->type); if (ret) return ret; if (flags & BPF_F_REPLACE) { tidx = bpf_mprog_pos_exact(entry, &otuple); if (tidx < 0) { ret = tidx; goto out; } idx = tidx; } else if (bpf_mprog_total(entry) == bpf_mprog_max()) { ret = -ERANGE; goto out; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < -1 ? tidx : -ERANGE; goto out; } idx = tidx; } if (flags & BPF_F_AFTER) { tidx = bpf_mprog_pos_after(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < 0 ? tidx : -ERANGE; goto out; } idx = tidx; } if (idx < -1) { if (rtuple.prog || flags) { ret = -EINVAL; goto out; } idx = bpf_mprog_total(entry); flags = BPF_F_AFTER; } if (idx >= bpf_mprog_max()) { ret = -ERANGE; goto out; } if (flags & BPF_F_REPLACE) ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx); else ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags); out: bpf_mprog_tuple_put(&rtuple); return ret; } static int bpf_mprog_fetch(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple, int idx) { int total = bpf_mprog_total(entry); struct bpf_mprog_cp *cp; struct bpf_mprog_fp *fp; struct bpf_prog *prog; struct bpf_link *link; if (idx == -1) idx = 0; else if (idx == total) idx = total - 1; bpf_mprog_read(entry, idx, &fp, &cp); prog = READ_ONCE(fp->prog); link = cp->link; /* The deletion request can either be without filled tuple in which * case it gets populated here based on idx, or with filled tuple * where the only thing we end up doing is the WARN_ON_ONCE() assert. * If we hit a BPF link at the given index, it must not be removed * from opts path. */ if (link && !tuple->link) return -EBUSY; WARN_ON_ONCE(tuple->prog && tuple->prog != prog); WARN_ON_ONCE(tuple->link && tuple->link != link); tuple->prog = prog; tuple->link = link; return 0; } int bpf_mprog_detach(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_prog *prog, struct bpf_link *link, u32 flags, u32 id_or_fd, u64 revision) { struct bpf_tuple rtuple, dtuple = { .prog = prog, .link = link, }; int ret, idx = -ERANGE, tidx; if (flags & BPF_F_REPLACE) return -EINVAL; if (revision && revision != bpf_mprog_revision(entry)) return -ESTALE; if (!bpf_mprog_total(entry)) return -ENOENT; ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags, prog ? prog->type : BPF_PROG_TYPE_UNSPEC); if (ret) return ret; if (dtuple.prog) { tidx = bpf_mprog_pos_exact(entry, &dtuple); if (tidx < 0) { ret = tidx; goto out; } idx = tidx; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < -1 ? tidx : -ERANGE; goto out; } idx = tidx; } if (flags & BPF_F_AFTER) { tidx = bpf_mprog_pos_after(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < 0 ? tidx : -ERANGE; goto out; } idx = tidx; } if (idx < -1) { if (rtuple.prog || flags) { ret = -EINVAL; goto out; } idx = bpf_mprog_total(entry); flags = BPF_F_AFTER; } if (idx >= bpf_mprog_max()) { ret = -ERANGE; goto out; } ret = bpf_mprog_fetch(entry, &dtuple, idx); if (ret) goto out; ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx); out: bpf_mprog_tuple_put(&rtuple); return ret; } int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct bpf_mprog_entry *entry) { u32 __user *uprog_flags, *ulink_flags; u32 __user *uprog_id, *ulink_id; struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; struct bpf_prog *prog; const u32 flags = 0; u32 id, count = 0; u64 revision = 1; int i, ret = 0; if (attr->query.query_flags || attr->query.attach_flags) return -EINVAL; if (entry) { revision = bpf_mprog_revision(entry); count = bpf_mprog_total(entry); } if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) return -EFAULT; if (copy_to_user(&uattr->query.count, &count, sizeof(count))) return -EFAULT; uprog_id = u64_to_user_ptr(attr->query.prog_ids); uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags); ulink_id = u64_to_user_ptr(attr->query.link_ids); ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags); if (attr->query.count == 0 || !uprog_id || !count) return 0; if (attr->query.count < count) { count = attr->query.count; ret = -ENOSPC; } for (i = 0; i < bpf_mprog_max(); i++) { bpf_mprog_read(entry, i, &fp, &cp); prog = READ_ONCE(fp->prog); if (!prog) break; id = prog->aux->id; if (copy_to_user(uprog_id + i, &id, sizeof(id))) return -EFAULT; if (uprog_flags && copy_to_user(uprog_flags + i, &flags, sizeof(flags))) return -EFAULT; id = cp->link ? cp->link->id : 0; if (ulink_id && copy_to_user(ulink_id + i, &id, sizeof(id))) return -EFAULT; if (ulink_flags && copy_to_user(ulink_flags + i, &flags, sizeof(flags))) return -EFAULT; if (i + 1 == count) break; } return ret; } |
635 101 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SOCKET_H #define _LINUX_SOCKET_H #include <asm/socket.h> /* arch-dependent defines */ #include <linux/sockios.h> /* the SIOCxxx I/O controls */ #include <linux/uio.h> /* iovec support */ #include <linux/types.h> /* pid_t */ #include <linux/compiler.h> /* __user */ #include <uapi/linux/socket.h> struct file; struct pid; struct cred; struct socket; struct sock; struct sk_buff; #define __sockaddr_check_size(size) \ BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage))) #ifdef CONFIG_PROC_FS struct seq_file; extern void socket_seq_show(struct seq_file *seq); #endif typedef __kernel_sa_family_t sa_family_t; /* * 1003.1g requires sa_family_t and that sa_data is char. */ struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ union { char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ DECLARE_FLEX_ARRAY(char, sa_data); }; }; struct linger { int l_onoff; /* Linger active */ int l_linger; /* How long to linger for */ }; #define sockaddr_storage __kernel_sockaddr_storage /* * As we do 4.4BSD message passing we use a 4.4BSD message passing * system, not 4.3. Thus msg_accrights(len) are now missing. They * belong in an obscure libc emulation or the bin. */ struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ int msg_inq; /* output, data left in socket */ struct iov_iter msg_iter; /* data */ /* * Ancillary data. msg_control_user is the user buffer used for the * recv* side when msg_control_is_user is set, msg_control is the kernel * buffer used for all other cases. */ union { void *msg_control; void __user *msg_control_user; }; bool msg_control_is_user : 1; bool msg_get_inq : 1;/* return INQ after receive */ unsigned int msg_flags; /* flags on received message */ __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ struct ubuf_info *msg_ubuf; int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); }; struct user_msghdr { void __user *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ struct iovec __user *msg_iov; /* scatter/gather array */ __kernel_size_t msg_iovlen; /* # elements in msg_iov */ void __user *msg_control; /* ancillary data */ __kernel_size_t msg_controllen; /* ancillary data buffer length */ unsigned int msg_flags; /* flags on received message */ }; /* For recvmmsg/sendmmsg */ struct mmsghdr { struct user_msghdr msg_hdr; unsigned int msg_len; }; /* * POSIX 1003.1g - ancillary data object information * Ancillary data consists of a sequence of pairs of * (cmsghdr, cmsg_data[]) */ struct cmsghdr { __kernel_size_t cmsg_len; /* data byte count, including hdr */ int cmsg_level; /* originating protocol */ int cmsg_type; /* protocol-specific type */ }; /* * Ancillary data object information MACROS * Table 5-14 of POSIX 1003.1g */ #define __CMSG_NXTHDR(ctl, len, cmsg) __cmsg_nxthdr((ctl),(len),(cmsg)) #define CMSG_NXTHDR(mhdr, cmsg) cmsg_nxthdr((mhdr), (cmsg)) #define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) ) #define CMSG_DATA(cmsg) \ ((void *)(cmsg) + sizeof(struct cmsghdr)) #define CMSG_USER_DATA(cmsg) \ ((void __user *)(cmsg) + sizeof(struct cmsghdr)) #define CMSG_SPACE(len) (sizeof(struct cmsghdr) + CMSG_ALIGN(len)) #define CMSG_LEN(len) (sizeof(struct cmsghdr) + (len)) #define __CMSG_FIRSTHDR(ctl,len) ((len) >= sizeof(struct cmsghdr) ? \ (struct cmsghdr *)(ctl) : \ (struct cmsghdr *)NULL) #define CMSG_FIRSTHDR(msg) __CMSG_FIRSTHDR((msg)->msg_control, (msg)->msg_controllen) #define CMSG_OK(mhdr, cmsg) ((cmsg)->cmsg_len >= sizeof(struct cmsghdr) && \ (cmsg)->cmsg_len <= (unsigned long) \ ((mhdr)->msg_controllen - \ ((char *)(cmsg) - (char *)(mhdr)->msg_control))) #define for_each_cmsghdr(cmsg, msg) \ for (cmsg = CMSG_FIRSTHDR(msg); \ cmsg; \ cmsg = CMSG_NXTHDR(msg, cmsg)) /* * Get the next cmsg header * * PLEASE, do not touch this function. If you think, that it is * incorrect, grep kernel sources and think about consequences * before trying to improve it. * * Now it always returns valid, not truncated ancillary object * HEADER. But caller still MUST check, that cmsg->cmsg_len is * inside range, given by msg->msg_controllen before using * ancillary object DATA. --ANK (980731) */ static inline struct cmsghdr * __cmsg_nxthdr(void *__ctl, __kernel_size_t __size, struct cmsghdr *__cmsg) { struct cmsghdr * __ptr; __ptr = (struct cmsghdr*)(((unsigned char *) __cmsg) + CMSG_ALIGN(__cmsg->cmsg_len)); if ((unsigned long)((char*)(__ptr+1) - (char *) __ctl) > __size) return (struct cmsghdr *)0; return __ptr; } static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr *__cmsg) { return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } static inline size_t msg_data_left(struct msghdr *msg) { return iov_iter_count(&msg->msg_iter); } /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* rw: access rights (array of int) */ #define SCM_CREDENTIALS 0x02 /* rw: struct ucred */ #define SCM_SECURITY 0x03 /* rw: security label */ #define SCM_PIDFD 0x04 /* ro: pidfd (int) */ struct ucred { __u32 pid; __u32 uid; __u32 gid; }; /* Supported address families. */ #define AF_UNSPEC 0 #define AF_UNIX 1 /* Unix domain sockets */ #define AF_LOCAL 1 /* POSIX name for AF_UNIX */ #define AF_INET 2 /* Internet IP Protocol */ #define AF_AX25 3 /* Amateur Radio AX.25 */ #define AF_IPX 4 /* Novell IPX */ #define AF_APPLETALK 5 /* AppleTalk DDP */ #define AF_NETROM 6 /* Amateur Radio NET/ROM */ #define AF_BRIDGE 7 /* Multiprotocol bridge */ #define AF_ATMPVC 8 /* ATM PVCs */ #define AF_X25 9 /* Reserved for X.25 project */ #define AF_INET6 10 /* IP version 6 */ #define AF_ROSE 11 /* Amateur Radio X.25 PLP */ #define AF_DECnet 12 /* Reserved for DECnet project */ #define AF_NETBEUI 13 /* Reserved for 802.2LLC project*/ #define AF_SECURITY 14 /* Security callback pseudo AF */ #define AF_KEY 15 /* PF_KEY key management API */ #define AF_NETLINK 16 #define AF_ROUTE AF_NETLINK /* Alias to emulate 4.4BSD */ #define AF_PACKET 17 /* Packet family */ #define AF_ASH 18 /* Ash */ #define AF_ECONET 19 /* Acorn Econet */ #define AF_ATMSVC 20 /* ATM SVCs */ #define AF_RDS 21 /* RDS sockets */ #define AF_SNA 22 /* Linux SNA Project (nutters!) */ #define AF_IRDA 23 /* IRDA sockets */ #define AF_PPPOX 24 /* PPPoX sockets */ #define AF_WANPIPE 25 /* Wanpipe API Sockets */ #define AF_LLC 26 /* Linux LLC */ #define AF_IB 27 /* Native InfiniBand address */ #define AF_MPLS 28 /* MPLS */ #define AF_CAN 29 /* Controller Area Network */ #define AF_TIPC 30 /* TIPC sockets */ #define AF_BLUETOOTH 31 /* Bluetooth sockets */ #define AF_IUCV 32 /* IUCV sockets */ #define AF_RXRPC 33 /* RxRPC sockets */ #define AF_ISDN 34 /* mISDN sockets */ #define AF_PHONET 35 /* Phonet sockets */ #define AF_IEEE802154 36 /* IEEE802154 sockets */ #define AF_CAIF 37 /* CAIF sockets */ #define AF_ALG 38 /* Algorithm sockets */ #define AF_NFC 39 /* NFC sockets */ #define AF_VSOCK 40 /* vSockets */ #define AF_KCM 41 /* Kernel Connection Multiplexor*/ #define AF_QIPCRTR 42 /* Qualcomm IPC Router */ #define AF_SMC 43 /* smc sockets: reserve number for * PF_SMC protocol family that * reuses AF_INET address family */ #define AF_XDP 44 /* XDP sockets */ #define AF_MCTP 45 /* Management component * transport protocol */ #define AF_MAX 46 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC #define PF_UNIX AF_UNIX #define PF_LOCAL AF_LOCAL #define PF_INET AF_INET #define PF_AX25 AF_AX25 #define PF_IPX AF_IPX #define PF_APPLETALK AF_APPLETALK #define PF_NETROM AF_NETROM #define PF_BRIDGE AF_BRIDGE #define PF_ATMPVC AF_ATMPVC #define PF_X25 AF_X25 #define PF_INET6 AF_INET6 #define PF_ROSE AF_ROSE #define PF_DECnet AF_DECnet #define PF_NETBEUI AF_NETBEUI #define PF_SECURITY AF_SECURITY #define PF_KEY AF_KEY #define PF_NETLINK AF_NETLINK #define PF_ROUTE AF_ROUTE #define PF_PACKET AF_PACKET #define PF_ASH AF_ASH #define PF_ECONET AF_ECONET #define PF_ATMSVC AF_ATMSVC #define PF_RDS AF_RDS #define PF_SNA AF_SNA #define PF_IRDA AF_IRDA #define PF_PPPOX AF_PPPOX #define PF_WANPIPE AF_WANPIPE #define PF_LLC AF_LLC #define PF_IB AF_IB #define PF_MPLS AF_MPLS #define PF_CAN AF_CAN #define PF_TIPC AF_TIPC #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IUCV AF_IUCV #define PF_RXRPC AF_RXRPC #define PF_ISDN AF_ISDN #define PF_PHONET AF_PHONET #define PF_IEEE802154 AF_IEEE802154 #define PF_CAIF AF_CAIF #define PF_ALG AF_ALG #define PF_NFC AF_NFC #define PF_VSOCK AF_VSOCK #define PF_KCM AF_KCM #define PF_QIPCRTR AF_QIPCRTR #define PF_SMC AF_SMC #define PF_XDP AF_XDP #define PF_MCTP AF_MCTP #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. */ #define SOMAXCONN 4096 /* Flags we can use with send/ and recv. Added those for 1003.1g not all are supported yet */ #define MSG_OOB 1 #define MSG_PEEK 2 #define MSG_DONTROUTE 4 #define MSG_TRYHARD 4 /* Synonym for MSG_DONTROUTE for DECnet */ #define MSG_CTRUNC 8 #define MSG_PROBE 0x10 /* Do not send. Only probe path f.e. for MTU */ #define MSG_TRUNC 0x20 #define MSG_DONTWAIT 0x40 /* Nonblocking io */ #define MSG_EOR 0x80 /* End of record */ #define MSG_WAITALL 0x100 /* Wait for a full request */ #define MSG_FIN 0x200 #define MSG_SYN 0x400 #define MSG_CONFIRM 0x800 /* Confirm path validity */ #define MSG_RST 0x1000 #define MSG_ERRQUEUE 0x2000 /* Fetch message from error queue */ #define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ #define MSG_MORE 0x8000 /* Sender will send more */ #define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ #define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */ #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN #define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ #define MSG_SENDPAGE_DECRYPTED 0x100000 /* sendpage() internal : page may carry * plain text and require encryption */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through SCM_RIGHTS */ #if defined(CONFIG_COMPAT) #define MSG_CMSG_COMPAT 0x80000000 /* This message needs 32 bit fixups */ #else #define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ #endif /* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ #define MSG_INTERNAL_SENDMSG_FLAGS \ (MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_DECRYPTED) /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 /* #define SOL_ICMP 1 No-no-no! Due to Linux :-) we cannot use SOL_ICMP=1 */ #define SOL_TCP 6 #define SOL_UDP 17 #define SOL_IPV6 41 #define SOL_ICMPV6 58 #define SOL_SCTP 132 #define SOL_UDPLITE 136 /* UDP-Lite (RFC 3828) */ #define SOL_RAW 255 #define SOL_IPX 256 #define SOL_AX25 257 #define SOL_ATALK 258 #define SOL_NETROM 259 #define SOL_ROSE 260 #define SOL_DECNET 261 #define SOL_X25 262 #define SOL_PACKET 263 #define SOL_ATM 264 /* ATM layer (cell level) */ #define SOL_AAL 265 /* ATM Adaption Layer (packet level) */ #define SOL_IRDA 266 #define SOL_NETBEUI 267 #define SOL_LLC 268 #define SOL_DCCP 269 #define SOL_NETLINK 270 #define SOL_TIPC 271 #define SOL_RXRPC 272 #define SOL_PPPOL2TP 273 #define SOL_BLUETOOTH 274 #define SOL_PNPIPE 275 #define SOL_RDS 276 #define SOL_IUCV 277 #define SOL_CAIF 278 #define SOL_ALG 279 #define SOL_NFC 280 #define SOL_KCM 281 #define SOL_TLS 282 #define SOL_XDP 283 #define SOL_MPTCP 284 #define SOL_MCTP 285 #define SOL_SMC 286 #define SOL_VSOCK 287 /* IPX options */ #define IPX_TYPE 1 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); struct timespec64; struct __kernel_timespec; struct old_timespec32; struct scm_timestamping_internal { struct timespec64 ts[3]; }; extern void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss); extern void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss); /* The __sys_...msg variants allow MSG_CMSG_COMPAT iff * forbid_cmsg_compat==false */ extern long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat); extern long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat); extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct __kernel_timespec __user *timeout, struct old_timespec32 __user *timeout32); extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, bool forbid_cmsg_compat); extern long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, unsigned int flags); extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags); extern int sendmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct iovec **iov); extern int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov); extern int __copy_msghdr(struct msghdr *kmsg, struct user_msghdr *umsg, struct sockaddr __user **save_addr); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, struct sockaddr __user *addr, int __user *addr_len); extern int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len); extern struct file *do_accept(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); extern int __sys_shutdown_sock(struct socket *sock, int how); extern int __sys_shutdown(int fd, int how); #endif /* _LINUX_SOCKET_H */ |
3125 3121 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 | /* SPDX-License-Identifier: GPL-2.0 */ /* * memory buffer pool support */ #ifndef _LINUX_MEMPOOL_H #define _LINUX_MEMPOOL_H #include <linux/wait.h> #include <linux/compiler.h> struct kmem_cache; typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data); typedef void (mempool_free_t)(void *element, void *pool_data); typedef struct mempool_s { spinlock_t lock; int min_nr; /* nr of elements at *elements */ int curr_nr; /* Current nr of elements at *elements */ void **elements; void *pool_data; mempool_alloc_t *alloc; mempool_free_t *free; wait_queue_head_t wait; } mempool_t; static inline bool mempool_initialized(mempool_t *pool) { return pool->elements != NULL; } static inline bool mempool_is_saturated(mempool_t *pool) { return READ_ONCE(pool->curr_nr) >= pool->min_nr; } void mempool_exit(mempool_t *pool); int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int node_id); int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data); extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data); extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int nid); extern int mempool_resize(mempool_t *pool, int new_min_nr); extern void mempool_destroy(mempool_t *pool); extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc; extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; extern void mempool_free(void *element, mempool_t *pool); /* * A mempool_alloc_t and mempool_free_t that get the memory from * a slab cache that is passed in through pool_data. * Note: the slab cache may not have a ctor function. */ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data); void mempool_free_slab(void *element, void *pool_data); static inline int mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc) { return mempool_init(pool, min_nr, mempool_alloc_slab, mempool_free_slab, (void *) kc); } static inline mempool_t * mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) { return mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab, (void *) kc); } /* * a mempool_alloc_t and a mempool_free_t to kmalloc and kfree the * amount of memory specified by pool_data */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data); void mempool_kfree(void *element, void *pool_data); static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size) { return mempool_init(pool, min_nr, mempool_kmalloc, mempool_kfree, (void *) size); } static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) { return mempool_create(min_nr, mempool_kmalloc, mempool_kfree, (void *) size); } /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that * allocates pages of the order specified by pool_data */ void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data); void mempool_free_pages(void *element, void *pool_data); static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order) { return mempool_init(pool, min_nr, mempool_alloc_pages, mempool_free_pages, (void *)(long)order); } static inline mempool_t *mempool_create_page_pool(int min_nr, int order) { return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages, (void *)(long)order); } #endif /* _LINUX_MEMPOOL_H */ |
1 2308 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/bad_inode.c * * Copyright (C) 1997, Stephen Tweedie * * Provide stub functions for unreadable inodes * * Fabian Frederick : August 2003 - All file operations assigned to EIO */ #include <linux/fs.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/fiemap.h> static int bad_file_open(struct inode *inode, struct file *filp) { return -EIO; } static const struct file_operations bad_file_ops = { .open = bad_file_open, }; static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return -EIO; } static struct dentry *bad_inode_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { return -EIO; } static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return -EIO; } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } static int bad_inode_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EIO; } static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, int buflen) { return -EIO; } static int bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EIO; } static int bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } static int bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { return -EIO; } static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { return -EIO; } static const char *bad_inode_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return ERR_PTR(-EIO); } static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu) { return ERR_PTR(-EIO); } static int bad_inode_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { return -EIO; } static int bad_inode_update_time(struct inode *inode, int flags) { return -EIO; } static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, struct file *file, unsigned int open_flag, umode_t create_mode) { return -EIO; } static int bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return -EIO; } static int bad_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; } static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, .lookup = bad_inode_lookup, .link = bad_inode_link, .unlink = bad_inode_unlink, .symlink = bad_inode_symlink, .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, .rename = bad_inode_rename2, .readlink = bad_inode_readlink, .permission = bad_inode_permission, .getattr = bad_inode_getattr, .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, .tmpfile = bad_inode_tmpfile, .set_acl = bad_inode_set_acl, }; /* * When a filesystem is unable to read an inode due to an I/O error in * its read_inode() function, it can call make_bad_inode() to return a * set of stubs which will return EIO errors as required. * * We only need to do limited initialisation: all other fields are * preinitialised to zero automatically. */ /** * make_bad_inode - mark an inode bad due to an I/O error * @inode: Inode to mark bad * * When an inode cannot be read due to a media or remote network * failure this function makes the inode "bad" and causes I/O operations * on it to fail from this point on. */ void make_bad_inode(struct inode *inode) { remove_inode_hash(inode); inode->i_mode = S_IFREG; simple_inode_init_ts(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; } EXPORT_SYMBOL(make_bad_inode); /* * This tests whether an inode has been flagged as bad. The test uses * &bad_inode_ops to cover the case of invalidated inodes as well as * those created by make_bad_inode() above. */ /** * is_bad_inode - is an inode errored * @inode: inode to test * * Returns true if the inode in question has been marked as bad. */ bool is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } EXPORT_SYMBOL(is_bad_inode); /** * iget_failed - Mark an under-construction inode as dead and release it * @inode: The inode to discard * * Mark an under-construction inode as dead and release it. */ void iget_failed(struct inode *inode) { make_bad_inode(inode); unlock_new_inode(inode); iput(inode); } EXPORT_SYMBOL(iget_failed); |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 | /* * net/tipc/monitor.c * * Copyright (c) 2016, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/genetlink.h> #include "core.h" #include "addr.h" #include "monitor.h" #include "bearer.h" #define MAX_MON_DOMAIN 64 #define MON_TIMEOUT 120000 #define MAX_PEER_DOWN_EVENTS 4 /* struct tipc_mon_domain: domain record to be transferred between peers * @len: actual size of domain record * @gen: current generation of sender's domain * @ack_gen: most recent generation of self's domain acked by peer * @member_cnt: number of domain member nodes described in this record * @up_map: bit map indicating which of the members the sender considers up * @members: identity of the domain members */ struct tipc_mon_domain { u16 len; u16 gen; u16 ack_gen; u16 member_cnt; u64 up_map; u32 members[MAX_MON_DOMAIN]; }; /* struct tipc_peer: state of a peer node and its domain * @addr: tipc node identity of peer * @head_map: shows which other nodes currently consider peer 'up' * @domain: most recent domain record from peer * @hash: position in hashed lookup list * @list: position in linked list, in circular ascending order by 'addr' * @applied: number of reported domain members applied on this monitor list * @is_up: peer is up as seen from this node * @is_head: peer is assigned domain head as seen from this node * @is_local: peer is in local domain and should be continuously monitored * @down_cnt: - numbers of other peers which have reported this on lost */ struct tipc_peer { u32 addr; struct tipc_mon_domain *domain; struct hlist_node hash; struct list_head list; u8 applied; u8 down_cnt; bool is_up; bool is_head; bool is_local; }; struct tipc_monitor { struct hlist_head peers[NODE_HTABLE_SIZE]; int peer_cnt; struct tipc_peer *self; rwlock_t lock; struct tipc_mon_domain cache; u16 list_gen; u16 dom_gen; struct net *net; struct timer_list timer; unsigned long timer_intv; }; static struct tipc_monitor *tipc_monitor(struct net *net, int bearer_id) { return tipc_net(net)->monitors[bearer_id]; } const int tipc_max_domain_size = sizeof(struct tipc_mon_domain); static inline u16 mon_cpu_to_le16(u16 val) { return (__force __u16)htons(val); } static inline u32 mon_cpu_to_le32(u32 val) { return (__force __u32)htonl(val); } static inline u64 mon_cpu_to_le64(u64 val) { return (__force __u64)cpu_to_be64(val); } static inline u16 mon_le16_to_cpu(u16 val) { return ntohs((__force __be16)val); } static inline u32 mon_le32_to_cpu(u32 val) { return ntohl((__force __be32)val); } static inline u64 mon_le64_to_cpu(u64 val) { return be64_to_cpu((__force __be64)val); } /* dom_rec_len(): actual length of domain record for transport */ static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt) { return (offsetof(struct tipc_mon_domain, members)) + (mcnt * sizeof(u32)); } /* dom_size() : calculate size of own domain based on number of peers */ static int dom_size(int peers) { int i = 0; while ((i * i) < peers) i++; return i < MAX_MON_DOMAIN ? i : MAX_MON_DOMAIN; } static void map_set(u64 *up_map, int i, unsigned int v) { *up_map &= ~(1ULL << i); *up_map |= ((u64)v << i); } static int map_get(u64 up_map, int i) { return (up_map & (1ULL << i)) >> i; } static struct tipc_peer *peer_prev(struct tipc_peer *peer) { return list_last_entry(&peer->list, struct tipc_peer, list); } static struct tipc_peer *peer_nxt(struct tipc_peer *peer) { return list_first_entry(&peer->list, struct tipc_peer, list); } static struct tipc_peer *peer_head(struct tipc_peer *peer) { while (!peer->is_head) peer = peer_prev(peer); return peer; } static struct tipc_peer *get_peer(struct tipc_monitor *mon, u32 addr) { struct tipc_peer *peer; unsigned int thash = tipc_hashfn(addr); hlist_for_each_entry(peer, &mon->peers[thash], hash) { if (peer->addr == addr) return peer; } return NULL; } static struct tipc_peer *get_self(struct net *net, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); return mon->self; } static inline bool tipc_mon_is_active(struct net *net, struct tipc_monitor *mon) { struct tipc_net *tn = tipc_net(net); return mon->peer_cnt > tn->mon_threshold; } /* mon_identify_lost_members() : - identify amd mark potentially lost members */ static void mon_identify_lost_members(struct tipc_peer *peer, struct tipc_mon_domain *dom_bef, int applied_bef) { struct tipc_peer *member = peer; struct tipc_mon_domain *dom_aft = peer->domain; int applied_aft = peer->applied; int i; for (i = 0; i < applied_bef; i++) { member = peer_nxt(member); /* Do nothing if self or peer already see member as down */ if (!member->is_up || !map_get(dom_bef->up_map, i)) continue; /* Loss of local node must be detected by active probing */ if (member->is_local) continue; /* Start probing if member was removed from applied domain */ if (!applied_aft || (applied_aft < i)) { member->down_cnt = 1; continue; } /* Member loss is confirmed if it is still in applied domain */ if (!map_get(dom_aft->up_map, i)) member->down_cnt++; } } /* mon_apply_domain() : match a peer's domain record against monitor list */ static void mon_apply_domain(struct tipc_monitor *mon, struct tipc_peer *peer) { struct tipc_mon_domain *dom = peer->domain; struct tipc_peer *member; u32 addr; int i; if (!dom || !peer->is_up) return; /* Scan across domain members and match against monitor list */ peer->applied = 0; member = peer_nxt(peer); for (i = 0; i < dom->member_cnt; i++) { addr = dom->members[i]; if (addr != member->addr) return; peer->applied++; member = peer_nxt(member); } } /* mon_update_local_domain() : update after peer addition/removal/up/down */ static void mon_update_local_domain(struct tipc_monitor *mon) { struct tipc_peer *self = mon->self; struct tipc_mon_domain *cache = &mon->cache; struct tipc_mon_domain *dom = self->domain; struct tipc_peer *peer = self; u64 prev_up_map = dom->up_map; u16 member_cnt, i; bool diff; /* Update local domain size based on current size of cluster */ member_cnt = dom_size(mon->peer_cnt) - 1; self->applied = member_cnt; /* Update native and cached outgoing local domain records */ dom->len = dom_rec_len(dom, member_cnt); diff = dom->member_cnt != member_cnt; dom->member_cnt = member_cnt; for (i = 0; i < member_cnt; i++) { peer = peer_nxt(peer); diff |= dom->members[i] != peer->addr; dom->members[i] = peer->addr; map_set(&dom->up_map, i, peer->is_up); cache->members[i] = mon_cpu_to_le32(peer->addr); } diff |= dom->up_map != prev_up_map; if (!diff) return; dom->gen = ++mon->dom_gen; cache->len = mon_cpu_to_le16(dom->len); cache->gen = mon_cpu_to_le16(dom->gen); cache->member_cnt = mon_cpu_to_le16(member_cnt); cache->up_map = mon_cpu_to_le64(dom->up_map); mon_apply_domain(mon, self); } /* mon_update_neighbors() : update preceding neighbors of added/removed peer */ static void mon_update_neighbors(struct tipc_monitor *mon, struct tipc_peer *peer) { int dz, i; dz = dom_size(mon->peer_cnt); for (i = 0; i < dz; i++) { mon_apply_domain(mon, peer); peer = peer_prev(peer); } } /* mon_assign_roles() : reassign peer roles after a network change * The monitor list is consistent at this stage; i.e., each peer is monitoring * a set of domain members as matched between domain record and the monitor list */ static void mon_assign_roles(struct tipc_monitor *mon, struct tipc_peer *head) { struct tipc_peer *peer = peer_nxt(head); struct tipc_peer *self = mon->self; int i = 0; for (; peer != self; peer = peer_nxt(peer)) { peer->is_local = false; /* Update domain member */ if (i++ < head->applied) { peer->is_head = false; if (head == self) peer->is_local = true; continue; } /* Assign next domain head */ if (!peer->is_up) continue; if (peer->is_head) break; head = peer; head->is_head = true; i = 0; } mon->list_gen++; } void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *prev, *head; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer) goto exit; prev = peer_prev(peer); list_del(&peer->list); hlist_del(&peer->hash); kfree(peer->domain); kfree(peer); mon->peer_cnt--; head = peer_head(prev); if (head == self) mon_update_local_domain(mon); mon_update_neighbors(mon, prev); /* Revert to full-mesh monitoring if we reach threshold */ if (!tipc_mon_is_active(net, mon)) { list_for_each_entry(peer, &self->list, list) { kfree(peer->domain); peer->domain = NULL; peer->applied = 0; } } mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } static bool tipc_mon_add_peer(struct tipc_monitor *mon, u32 addr, struct tipc_peer **peer) { struct tipc_peer *self = mon->self; struct tipc_peer *cur, *prev, *p; p = kzalloc(sizeof(*p), GFP_ATOMIC); *peer = p; if (!p) return false; p->addr = addr; /* Add new peer to lookup list */ INIT_LIST_HEAD(&p->list); hlist_add_head(&p->hash, &mon->peers[tipc_hashfn(addr)]); /* Sort new peer into iterator list, in ascending circular order */ prev = self; list_for_each_entry(cur, &self->list, list) { if ((addr > prev->addr) && (addr < cur->addr)) break; if (((addr < cur->addr) || (addr > prev->addr)) && (prev->addr > cur->addr)) break; prev = cur; } list_add_tail(&p->list, &cur->list); mon->peer_cnt++; mon_update_neighbors(mon, p); return true; } void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self = get_self(net, bearer_id); struct tipc_peer *peer, *head; write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer && !tipc_mon_add_peer(mon, addr, &peer)) goto exit; peer->is_up = true; head = peer_head(peer); if (head == self) mon_update_local_domain(mon); mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *head; struct tipc_mon_domain *dom; int applied; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer) { pr_warn("Mon: unknown link %x/%u DOWN\n", addr, bearer_id); goto exit; } applied = peer->applied; peer->applied = 0; dom = peer->domain; peer->domain = NULL; if (peer->is_head) mon_identify_lost_members(peer, dom, applied); kfree(dom); peer->is_up = false; peer->is_head = false; peer->is_local = false; peer->down_cnt = 0; head = peer_head(peer); if (head == self) mon_update_local_domain(mon); mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } /* tipc_mon_rcv - process monitor domain event message */ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_mon_domain *arrv_dom = data; struct tipc_mon_domain dom_bef; struct tipc_mon_domain *dom; struct tipc_peer *peer; u16 new_member_cnt = mon_le16_to_cpu(arrv_dom->member_cnt); int new_dlen = dom_rec_len(arrv_dom, new_member_cnt); u16 new_gen = mon_le16_to_cpu(arrv_dom->gen); u16 acked_gen = mon_le16_to_cpu(arrv_dom->ack_gen); u16 arrv_dlen = mon_le16_to_cpu(arrv_dom->len); bool probing = state->probing; int i, applied_bef; state->probing = false; /* Sanity check received domain record */ if (new_member_cnt > MAX_MON_DOMAIN) return; if (dlen < dom_rec_len(arrv_dom, 0)) return; if (dlen != dom_rec_len(arrv_dom, new_member_cnt)) return; if (dlen < new_dlen || arrv_dlen != new_dlen) return; /* Synch generation numbers with peer if link just came up */ if (!state->synched) { state->peer_gen = new_gen - 1; state->acked_gen = acked_gen; state->synched = true; } if (more(acked_gen, state->acked_gen)) state->acked_gen = acked_gen; /* Drop duplicate unless we are waiting for a probe response */ if (!more(new_gen, state->peer_gen) && !probing) return; write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer || !peer->is_up) goto exit; /* Peer is confirmed, stop any ongoing probing */ peer->down_cnt = 0; /* Task is done for duplicate record */ if (!more(new_gen, state->peer_gen)) goto exit; state->peer_gen = new_gen; /* Cache current domain record for later use */ dom_bef.member_cnt = 0; dom = peer->domain; if (dom) memcpy(&dom_bef, dom, dom->len); /* Transform and store received domain record */ if (!dom || (dom->len < new_dlen)) { kfree(dom); dom = kmalloc(new_dlen, GFP_ATOMIC); peer->domain = dom; if (!dom) goto exit; } dom->len = new_dlen; dom->gen = new_gen; dom->member_cnt = new_member_cnt; dom->up_map = mon_le64_to_cpu(arrv_dom->up_map); for (i = 0; i < new_member_cnt; i++) dom->members[i] = mon_le32_to_cpu(arrv_dom->members[i]); /* Update peers affected by this domain record */ applied_bef = peer->applied; mon_apply_domain(mon, peer); mon_identify_lost_members(peer, &dom_bef, applied_bef); mon_assign_roles(mon, peer_head(peer)); exit: write_unlock_bh(&mon->lock); } void tipc_mon_prep(struct net *net, void *data, int *dlen, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_mon_domain *dom = data; u16 gen = mon->dom_gen; u16 len; /* Send invalid record if not active */ if (!tipc_mon_is_active(net, mon)) { dom->len = 0; return; } /* Send only a dummy record with ack if peer has acked our last sent */ if (likely(state->acked_gen == gen)) { len = dom_rec_len(dom, 0); *dlen = len; dom->len = mon_cpu_to_le16(len); dom->gen = mon_cpu_to_le16(gen); dom->ack_gen = mon_cpu_to_le16(state->peer_gen); dom->member_cnt = 0; return; } /* Send the full record */ read_lock_bh(&mon->lock); len = mon_le16_to_cpu(mon->cache.len); *dlen = len; memcpy(data, &mon->cache, len); read_unlock_bh(&mon->lock); dom->ack_gen = mon_cpu_to_le16(state->peer_gen); } void tipc_mon_get_state(struct net *net, u32 addr, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *peer; if (!tipc_mon_is_active(net, mon)) { state->probing = false; state->monitoring = true; return; } /* Used cached state if table has not changed */ if (!state->probing && (state->list_gen == mon->list_gen) && (state->acked_gen == mon->dom_gen)) return; read_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (peer) { state->probing = state->acked_gen != mon->dom_gen; state->probing |= peer->down_cnt; state->reset |= peer->down_cnt >= MAX_PEER_DOWN_EVENTS; state->monitoring = peer->is_local; state->monitoring |= peer->is_head; state->list_gen = mon->list_gen; } read_unlock_bh(&mon->lock); } static void mon_timeout(struct timer_list *t) { struct tipc_monitor *mon = from_timer(mon, t, timer); struct tipc_peer *self; int best_member_cnt = dom_size(mon->peer_cnt) - 1; write_lock_bh(&mon->lock); self = mon->self; if (self && (best_member_cnt != self->applied)) { mon_update_local_domain(mon); mon_assign_roles(mon, self); } write_unlock_bh(&mon->lock); mod_timer(&mon->timer, jiffies + mon->timer_intv); } int tipc_mon_create(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); struct tipc_monitor *mon; struct tipc_peer *self; struct tipc_mon_domain *dom; if (tn->monitors[bearer_id]) return 0; mon = kzalloc(sizeof(*mon), GFP_ATOMIC); self = kzalloc(sizeof(*self), GFP_ATOMIC); dom = kzalloc(sizeof(*dom), GFP_ATOMIC); if (!mon || !self || !dom) { kfree(mon); kfree(self); kfree(dom); return -ENOMEM; } tn->monitors[bearer_id] = mon; rwlock_init(&mon->lock); mon->net = net; mon->peer_cnt = 1; mon->self = self; self->domain = dom; self->addr = tipc_own_addr(net); self->is_up = true; self->is_head = true; INIT_LIST_HEAD(&self->list); timer_setup(&mon->timer, mon_timeout, 0); mon->timer_intv = msecs_to_jiffies(MON_TIMEOUT + (tn->random & 0xffff)); mod_timer(&mon->timer, jiffies + mon->timer_intv); return 0; } void tipc_mon_delete(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *tmp; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); tn->monitors[bearer_id] = NULL; list_for_each_entry_safe(peer, tmp, &self->list, list) { list_del(&peer->list); hlist_del(&peer->hash); kfree(peer->domain); kfree(peer); } mon->self = NULL; write_unlock_bh(&mon->lock); timer_shutdown_sync(&mon->timer); kfree(self->domain); kfree(self); kfree(mon); } void tipc_mon_reinit_self(struct net *net) { struct tipc_monitor *mon; int bearer_id; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { mon = tipc_monitor(net, bearer_id); if (!mon) continue; write_lock_bh(&mon->lock); mon->self->addr = tipc_own_addr(net); write_unlock_bh(&mon->lock); } } int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size) { struct tipc_net *tn = tipc_net(net); if (cluster_size > TIPC_CLUSTER_SIZE) return -EINVAL; tn->mon_threshold = cluster_size; return 0; } int tipc_nl_monitor_get_threshold(struct net *net) { struct tipc_net *tn = tipc_net(net); return tn->mon_threshold; } static int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, struct tipc_nl_msg *msg) { struct tipc_mon_domain *dom = peer->domain; struct nlattr *attrs; void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_MON_PEER_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON_PEER); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_ADDR, peer->addr)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_APPLIED, peer->applied)) goto attr_msg_full; if (peer->is_up) if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_UP)) goto attr_msg_full; if (peer->is_local) if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_LOCAL)) goto attr_msg_full; if (peer->is_head) if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_HEAD)) goto attr_msg_full; if (dom) { if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_DOMGEN, dom->gen)) goto attr_msg_full; if (nla_put_u64_64bit(msg->skb, TIPC_NLA_MON_PEER_UPMAP, dom->up_map, TIPC_NLA_MON_PEER_PAD)) goto attr_msg_full; if (nla_put(msg->skb, TIPC_NLA_MON_PEER_MEMBERS, dom->member_cnt * sizeof(u32), &dom->members)) goto attr_msg_full; } nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id, u32 *prev_node) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *peer; if (!mon) return -EINVAL; read_lock_bh(&mon->lock); peer = mon->self; do { if (*prev_node) { if (peer->addr == *prev_node) *prev_node = 0; else continue; } if (__tipc_nl_add_monitor_peer(peer, msg)) { *prev_node = peer->addr; read_unlock_bh(&mon->lock); return -EMSGSIZE; } } while ((peer = peer_nxt(peer)) != mon->self); read_unlock_bh(&mon->lock); return 0; } int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); char bearer_name[TIPC_MAX_BEARER_NAME]; struct nlattr *attrs; void *hdr; int ret; ret = tipc_bearer_get_name(net, bearer_name, bearer_id); if (ret || !mon) return 0; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_MON_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; read_lock_bh(&mon->lock); if (nla_put_u32(msg->skb, TIPC_NLA_MON_REF, bearer_id)) goto attr_msg_full; if (tipc_mon_is_active(net, mon)) if (nla_put_flag(msg->skb, TIPC_NLA_MON_ACTIVE)) goto attr_msg_full; if (nla_put_string(msg->skb, TIPC_NLA_MON_BEARER_NAME, bearer_name)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEERCNT, mon->peer_cnt)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_LISTGEN, mon->list_gen)) goto attr_msg_full; read_unlock_bh(&mon->lock); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: read_unlock_bh(&mon->lock); nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } |
8 8 8 8 2 2 2 2 5 5 5 5 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 | // SPDX-License-Identifier: GPL-2.0-or-later /* xfrm6_protocol.c - Generic xfrm protocol multiplexer for ipv6. * * Copyright (C) 2013 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> * * Based on: * net/ipv4/xfrm4_protocol.c */ #include <linux/init.h> #include <linux/mutex.h> #include <linux/skbuff.h> #include <linux/icmpv6.h> #include <net/ip6_route.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm6_protocol __rcu *esp6_handlers __read_mostly; static struct xfrm6_protocol __rcu *ah6_handlers __read_mostly; static struct xfrm6_protocol __rcu *ipcomp6_handlers __read_mostly; static DEFINE_MUTEX(xfrm6_protocol_mutex); static inline struct xfrm6_protocol __rcu **proto_handlers(u8 protocol) { switch (protocol) { case IPPROTO_ESP: return &esp6_handlers; case IPPROTO_AH: return &ah6_handlers; case IPPROTO_COMP: return &ipcomp6_handlers; } return NULL; } #define for_each_protocol_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) { int ret; struct xfrm6_protocol *handler; struct xfrm6_protocol __rcu **head = proto_handlers(protocol); if (!head) return 0; for_each_protocol_rcu(*proto_handlers(protocol), handler) if ((ret = handler->cb_handler(skb, err)) <= 0) return ret; return 0; } int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { int ret; struct xfrm6_protocol *handler; struct xfrm6_protocol __rcu **head = proto_handlers(nexthdr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); if (!head) goto out; if (!skb_dst(skb)) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); int flags = RT6_LOOKUP_F_HAS_SADDR; struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .daddr = ip6h->daddr, .saddr = ip6h->saddr, .flowlabel = ip6_flowinfo(ip6h), .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6, skb, flags); if (dst->error) goto drop; skb_dst_set(skb, dst); } for_each_protocol_rcu(*head, handler) if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) return ret; out: icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } EXPORT_SYMBOL(xfrm6_rcv_encap); static int xfrm6_esp_rcv(struct sk_buff *skb) { int ret; struct xfrm6_protocol *handler; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; for_each_protocol_rcu(esp6_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) return ret; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); kfree_skb(skb); return 0; } static int xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(esp6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static int xfrm6_ah_rcv(struct sk_buff *skb) { int ret; struct xfrm6_protocol *handler; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; for_each_protocol_rcu(ah6_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) return ret; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); kfree_skb(skb); return 0; } static int xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(ah6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static int xfrm6_ipcomp_rcv(struct sk_buff *skb) { int ret; struct xfrm6_protocol *handler; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; for_each_protocol_rcu(ipcomp6_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) return ret; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); kfree_skb(skb); return 0; } static int xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(ipcomp6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static const struct inet6_protocol esp6_protocol = { .handler = xfrm6_esp_rcv, .err_handler = xfrm6_esp_err, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol ah6_protocol = { .handler = xfrm6_ah_rcv, .err_handler = xfrm6_ah_err, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol ipcomp6_protocol = { .handler = xfrm6_ipcomp_rcv, .err_handler = xfrm6_ipcomp_err, .flags = INET6_PROTO_NOPOLICY, }; static const struct xfrm_input_afinfo xfrm6_input_afinfo = { .family = AF_INET6, .callback = xfrm6_rcv_cb, }; static inline const struct inet6_protocol *netproto(unsigned char protocol) { switch (protocol) { case IPPROTO_ESP: return &esp6_protocol; case IPPROTO_AH: return &ah6_protocol; case IPPROTO_COMP: return &ipcomp6_protocol; } return NULL; } int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol) { struct xfrm6_protocol __rcu **pprev; struct xfrm6_protocol *t; bool add_netproto = false; int ret = -EEXIST; int priority = handler->priority; if (!proto_handlers(protocol) || !netproto(protocol)) return -EINVAL; mutex_lock(&xfrm6_protocol_mutex); if (!rcu_dereference_protected(*proto_handlers(protocol), lockdep_is_held(&xfrm6_protocol_mutex))) add_netproto = true; for (pprev = proto_handlers(protocol); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&xfrm6_protocol_mutex))) != NULL; pprev = &t->next) { if (t->priority < priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&xfrm6_protocol_mutex); if (add_netproto) { if (inet6_add_protocol(netproto(protocol), protocol)) { pr_err("%s: can't add protocol\n", __func__); ret = -EAGAIN; } } return ret; } EXPORT_SYMBOL(xfrm6_protocol_register); int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol) { struct xfrm6_protocol __rcu **pprev; struct xfrm6_protocol *t; int ret = -ENOENT; if (!proto_handlers(protocol) || !netproto(protocol)) return -EINVAL; mutex_lock(&xfrm6_protocol_mutex); for (pprev = proto_handlers(protocol); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&xfrm6_protocol_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } if (!rcu_dereference_protected(*proto_handlers(protocol), lockdep_is_held(&xfrm6_protocol_mutex))) { if (inet6_del_protocol(netproto(protocol), protocol) < 0) { pr_err("%s: can't remove protocol\n", __func__); ret = -EAGAIN; } } mutex_unlock(&xfrm6_protocol_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm6_protocol_deregister); int __init xfrm6_protocol_init(void) { return xfrm_input_register_afinfo(&xfrm6_input_afinfo); } void xfrm6_protocol_fini(void) { xfrm_input_unregister_afinfo(&xfrm6_input_afinfo); } |
5927 14 1487 1428 2669 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VMSTAT_H #define _LINUX_VMSTAT_H #include <linux/types.h> #include <linux/percpu.h> #include <linux/mmzone.h> #include <linux/vm_event_item.h> #include <linux/atomic.h> #include <linux/static_key.h> #include <linux/mmdebug.h> extern int sysctl_stat_interval; #ifdef CONFIG_NUMA #define ENABLE_NUMA_STAT 1 #define DISABLE_NUMA_STAT 0 extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); #endif struct reclaim_stat { unsigned nr_dirty; unsigned nr_unqueued_dirty; unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; unsigned nr_pageout; unsigned nr_activate[ANON_AND_FILE]; unsigned nr_ref_keep; unsigned nr_unmap_fail; unsigned nr_lazyfree_fail; }; enum writeback_stat_item { NR_DIRTY_THRESHOLD, NR_DIRTY_BG_THRESHOLD, NR_VM_WRITEBACK_STAT_ITEMS, }; #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. * * Counters should only be incremented and no critical kernel component * should rely on the counter values. * * Counters are handled completely inline. On many platforms the code * generated will simply be the increment of a global address. */ struct vm_event_state { unsigned long event[NR_VM_EVENT_ITEMS]; }; DECLARE_PER_CPU(struct vm_event_state, vm_event_states); /* * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the * local_irq_disable overhead. */ static inline void __count_vm_event(enum vm_event_item item) { raw_cpu_inc(vm_event_states.event[item]); } static inline void count_vm_event(enum vm_event_item item) { this_cpu_inc(vm_event_states.event[item]); } static inline void __count_vm_events(enum vm_event_item item, long delta) { raw_cpu_add(vm_event_states.event[item], delta); } static inline void count_vm_events(enum vm_event_item item, long delta) { this_cpu_add(vm_event_states.event[item], delta); } extern void all_vm_events(unsigned long *); extern void vm_events_fold_cpu(int cpu); #else /* Disable counters */ static inline void count_vm_event(enum vm_event_item item) { } static inline void count_vm_events(enum vm_event_item item, long delta) { } static inline void __count_vm_event(enum vm_event_item item) { } static inline void __count_vm_events(enum vm_event_item item, long delta) { } static inline void all_vm_events(unsigned long *ret) { } static inline void vm_events_fold_cpu(int cpu) { } #endif /* CONFIG_VM_EVENT_COUNTERS */ #ifdef CONFIG_NUMA_BALANCING #define count_vm_numa_event(x) count_vm_event(x) #define count_vm_numa_events(x, y) count_vm_events(x, y) #else #define count_vm_numa_event(x) do {} while (0) #define count_vm_numa_events(x, y) do { (void)(y); } while (0) #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_DEBUG_TLBFLUSH #define count_vm_tlb_event(x) count_vm_event(x) #define count_vm_tlb_events(x, y) count_vm_events(x, y) #else #define count_vm_tlb_event(x) do {} while (0) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif #ifdef CONFIG_PER_VMA_LOCK_STATS #define count_vm_vma_lock_event(x) count_vm_event(x) #else #define count_vm_vma_lock_event(x) do {} while (0) #endif #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) /* * Zone and node-based page accounting with per cpu differentials. */ extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #ifdef CONFIG_NUMA static inline void zone_numa_event_add(long x, struct zone *zone, enum numa_stat_item item) { atomic_long_add(x, &zone->vm_numa_event[item]); atomic_long_add(x, &vm_numa_event[item]); } static inline unsigned long zone_numa_event_state(struct zone *zone, enum numa_stat_item item) { return atomic_long_read(&zone->vm_numa_event[item]); } static inline unsigned long global_numa_event_state(enum numa_stat_item item) { return atomic_long_read(&vm_numa_event[item]); } #endif /* CONFIG_NUMA */ static inline void zone_page_state_add(long x, struct zone *zone, enum zone_stat_item item) { atomic_long_add(x, &zone->vm_stat[item]); atomic_long_add(x, &vm_zone_stat[item]); } static inline void node_page_state_add(long x, struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_add(x, &pgdat->vm_stat[item]); atomic_long_add(x, &vm_node_stat[item]); } static inline unsigned long global_zone_page_state(enum zone_stat_item item) { long x = atomic_long_read(&vm_zone_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long global_node_page_state_pages(enum node_stat_item item) { long x = atomic_long_read(&vm_node_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long global_node_page_state(enum node_stat_item item) { VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); return global_node_page_state_pages(item); } static inline unsigned long zone_page_state(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } /* * More accurate version that also considers the currently pending * deltas. For that we need to loop over all cpus to find the current * deltas. There is no synchronization so the result cannot be * exactly accurate either. */ static inline unsigned long zone_page_state_snapshot(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); #ifdef CONFIG_SMP int cpu; for_each_online_cpu(cpu) x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item]; if (x < 0) x = 0; #endif return x; } #ifdef CONFIG_NUMA /* See __count_vm_event comment on why raw_cpu_inc is used. */ static inline void __count_numa_event(struct zone *zone, enum numa_stat_item item) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; raw_cpu_inc(pzstats->vm_numa_event[item]); } static inline void __count_numa_events(struct zone *zone, enum numa_stat_item item, long delta) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; raw_cpu_add(pzstats->vm_numa_event[item], delta); } extern unsigned long sum_zone_node_page_state(int node, enum zone_stat_item item); extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item); extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); extern unsigned long node_page_state_pages(struct pglist_data *pgdat, enum node_stat_item item); extern void fold_vm_numa_events(void); #else #define sum_zone_node_page_state(node, item) global_zone_page_state(item) #define node_page_state(node, item) global_node_page_state(item) #define node_page_state_pages(node, item) global_node_page_state_pages(item) static inline void fold_vm_numa_events(void) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_SMP void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long); void __inc_zone_page_state(struct page *, enum zone_stat_item); void __dec_zone_page_state(struct page *, enum zone_stat_item); void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long); void __inc_node_page_state(struct page *, enum node_stat_item); void __dec_node_page_state(struct page *, enum node_stat_item); void mod_zone_page_state(struct zone *, enum zone_stat_item, long); void inc_zone_page_state(struct page *, enum zone_stat_item); void dec_zone_page_state(struct page *, enum zone_stat_item); void mod_node_page_state(struct pglist_data *, enum node_stat_item, long); void inc_node_page_state(struct page *, enum node_stat_item); void dec_node_page_state(struct page *, enum node_stat_item); extern void inc_node_state(struct pglist_data *, enum node_stat_item); extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void __inc_node_state(struct pglist_data *, enum node_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_node_state(struct pglist_data *, enum node_stat_item); void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); struct ctl_table; int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, loff_t *ppos); void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *); int calculate_pressure_threshold(struct zone *zone); int calculate_normal_threshold(struct zone *zone); void set_pgdat_percpu_threshold(pg_data_t *pgdat, int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* * We do not maintain differentials in a single processor configuration. * The functions directly modify the zone and global counters. */ static inline void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long delta) { zone_page_state_add(delta, zone, item); } static inline void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, int delta) { if (vmstat_item_in_bytes(item)) { /* * Only cgroups use subpage accounting right now; at * the global level, these items still change in * multiples of whole pages. Store them as pages * internally to keep the per-cpu counters compact. */ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } node_page_state_add(delta, pgdat, item); } static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_inc(&zone->vm_stat[item]); atomic_long_inc(&vm_zone_stat[item]); } static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_inc(&pgdat->vm_stat[item]); atomic_long_inc(&vm_node_stat[item]); } static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_dec(&zone->vm_stat[item]); atomic_long_dec(&vm_zone_stat[item]); } static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_dec(&pgdat->vm_stat[item]); atomic_long_dec(&vm_node_stat[item]); } static inline void __inc_zone_page_state(struct page *page, enum zone_stat_item item) { __inc_zone_state(page_zone(page), item); } static inline void __inc_node_page_state(struct page *page, enum node_stat_item item) { __inc_node_state(page_pgdat(page), item); } static inline void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { __dec_zone_state(page_zone(page), item); } static inline void __dec_node_page_state(struct page *page, enum node_stat_item item) { __dec_node_state(page_pgdat(page), item); } /* * We only use atomic operations to update counters. So there is no need to * disable interrupts. */ #define inc_zone_page_state __inc_zone_page_state #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state #define inc_node_page_state __inc_node_page_state #define dec_node_page_state __dec_node_page_state #define mod_node_page_state __mod_node_page_state #define inc_zone_state __inc_zone_state #define inc_node_state __inc_node_state #define dec_zone_state __dec_zone_state #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) { } #endif /* CONFIG_SMP */ static inline void __zone_stat_mod_folio(struct folio *folio, enum zone_stat_item item, long nr) { __mod_zone_page_state(folio_zone(folio), item, nr); } static inline void __zone_stat_add_folio(struct folio *folio, enum zone_stat_item item) { __mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); } static inline void __zone_stat_sub_folio(struct folio *folio, enum zone_stat_item item) { __mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); } static inline void zone_stat_mod_folio(struct folio *folio, enum zone_stat_item item, long nr) { mod_zone_page_state(folio_zone(folio), item, nr); } static inline void zone_stat_add_folio(struct folio *folio, enum zone_stat_item item) { mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); } static inline void zone_stat_sub_folio(struct folio *folio, enum zone_stat_item item) { mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); } static inline void __node_stat_mod_folio(struct folio *folio, enum node_stat_item item, long nr) { __mod_node_page_state(folio_pgdat(folio), item, nr); } static inline void __node_stat_add_folio(struct folio *folio, enum node_stat_item item) { __mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); } static inline void __node_stat_sub_folio(struct folio *folio, enum node_stat_item item) { __mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); } static inline void node_stat_mod_folio(struct folio *folio, enum node_stat_item item, long nr) { mod_node_page_state(folio_pgdat(folio), item, nr); } static inline void node_stat_add_folio(struct folio *folio, enum node_stat_item item) { mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); } static inline void node_stat_sub_folio(struct folio *folio, enum node_stat_item item) { mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); } static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, int migratetype) { __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); if (is_migrate_cma(migratetype)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); } extern const char * const vmstat_text[]; static inline const char *zone_stat_name(enum zone_stat_item item) { return vmstat_text[item]; } #ifdef CONFIG_NUMA static inline const char *numa_stat_name(enum numa_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + item]; } #endif /* CONFIG_NUMA */ static inline const char *node_stat_name(enum node_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + item]; } static inline const char *lru_list_name(enum lru_list lru) { return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" } static inline const char *writeback_stat_name(enum writeback_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + item]; } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) static inline const char *vm_event_name(enum vm_event_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + NR_VM_WRITEBACK_STAT_ITEMS + item]; } #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ #ifdef CONFIG_MEMCG void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_lruvec_state(lruvec, idx, val); local_irq_restore(flags); } void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __lruvec_stat_mod_folio(folio, idx, val); local_irq_restore(flags); } static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { lruvec_stat_mod_folio(page_folio(page), idx, val); } #else static inline void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } static inline void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { __mod_node_page_state(folio_pgdat(folio), idx, val); } static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { mod_node_page_state(folio_pgdat(folio), idx, val); } static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { mod_node_page_state(page_pgdat(page), idx, val); } #endif /* CONFIG_MEMCG */ static inline void __lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); } static inline void __lruvec_stat_sub_folio(struct folio *folio, enum node_stat_item idx) { __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); } static inline void lruvec_stat_sub_folio(struct folio *folio, enum node_stat_item idx) { lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } #endif /* _LINUX_VMSTAT_H */ |
7 7 7 7 7 33 7 7 33 33 29 27 27 7 29 29 29 29 66 67 67 67 66 67 2 27 67 7 7 7 7 21 21 21 21 21 1 2 1 1 6 1 3 3 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 37 37 36 35 37 2 3 4 4 3 3 3 6 6 5 1 6 5 2 2 2 2 2 2 1 1 1 1 1 2 2 2 1 66 1 65 64 32 69 70 70 69 68 15 19 19 4 3 2 1 72 73 72 67 67 41 6 5 4 2 3 4 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/key/af_key.c An implementation of PF_KEYv2 sockets. * * Authors: Maxim Giryaev <gem@asplinux.ru> * David S. Miller <davem@redhat.com> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * Kazunori MIYAZAWA / USAGI Project <miyazawa@linux-ipv6.org> * Derek Atkins <derek@ihtfp.com> */ #include <linux/capability.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/socket.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/xfrm.h> #include <net/sock.h> #define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x)) #define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x)) static unsigned int pfkey_net_id __read_mostly; struct netns_pfkey { /* List of all pfkey sockets. */ struct hlist_head table; atomic_t socks_nr; }; static DEFINE_MUTEX(pfkey_mutex); #define DUMMY_MARK 0 static const struct xfrm_mark dummy_mark = {0, 0}; struct pfkey_sock { /* struct sock must be the first member of struct pfkey_sock */ struct sock sk; int registered; int promisc; struct { uint8_t msg_version; uint32_t msg_portid; int (*dump)(struct pfkey_sock *sk); void (*done)(struct pfkey_sock *sk); union { struct xfrm_policy_walk policy; struct xfrm_state_walk state; } u; struct sk_buff *skb; } dump; struct mutex dump_lock; }; static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, xfrm_address_t *saddr, xfrm_address_t *daddr, u16 *family); static inline struct pfkey_sock *pfkey_sk(struct sock *sk) { return (struct pfkey_sock *)sk; } static int pfkey_can_dump(const struct sock *sk) { if (3 * atomic_read(&sk->sk_rmem_alloc) <= 2 * sk->sk_rcvbuf) return 1; return 0; } static void pfkey_terminate_dump(struct pfkey_sock *pfk) { if (pfk->dump.dump) { if (pfk->dump.skb) { kfree_skb(pfk->dump.skb); pfk->dump.skb = NULL; } pfk->dump.done(pfk); pfk->dump.dump = NULL; pfk->dump.done = NULL; } } static void pfkey_sock_destruct(struct sock *sk) { struct net *net = sock_net(sk); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); pfkey_terminate_dump(pfkey_sk(sk)); skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive pfkey socket: %p\n", sk); return; } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); atomic_dec(&net_pfkey->socks_nr); } static const struct proto_ops pfkey_ops; static void pfkey_insert(struct sock *sk) { struct net *net = sock_net(sk); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); mutex_lock(&pfkey_mutex); sk_add_node_rcu(sk, &net_pfkey->table); mutex_unlock(&pfkey_mutex); } static void pfkey_remove(struct sock *sk) { mutex_lock(&pfkey_mutex); sk_del_node_init_rcu(sk); mutex_unlock(&pfkey_mutex); } static struct proto key_proto = { .name = "KEY", .owner = THIS_MODULE, .obj_size = sizeof(struct pfkey_sock), }; static int pfkey_create(struct net *net, struct socket *sock, int protocol, int kern) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; struct pfkey_sock *pfk; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (protocol != PF_KEY_V2) return -EPROTONOSUPPORT; sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern); if (sk == NULL) return -ENOMEM; pfk = pfkey_sk(sk); mutex_init(&pfk->dump_lock); sock->ops = &pfkey_ops; sock_init_data(sock, sk); sk->sk_family = PF_KEY; sk->sk_destruct = pfkey_sock_destruct; atomic_inc(&net_pfkey->socks_nr); pfkey_insert(sk); return 0; } static int pfkey_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return 0; pfkey_remove(sk); sock_orphan(sk); sock->sk = NULL; skb_queue_purge(&sk->sk_write_queue); synchronize_rcu(); sock_put(sk); return 0; } static int pfkey_broadcast_one(struct sk_buff *skb, gfp_t allocation, struct sock *sk) { int err = -ENOBUFS; if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) return err; skb = skb_clone(skb, allocation); if (skb) { skb_set_owner_r(skb, sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); err = 0; } return err; } /* Send SKB to all pfkey sockets matching selected criteria. */ #define BROADCAST_ALL 0 #define BROADCAST_ONE 1 #define BROADCAST_REGISTERED 2 #define BROADCAST_PROMISC_ONLY 4 static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, int broadcast_flags, struct sock *one_sk, struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; int err = -ESRCH; /* XXX Do we need something like netlink_overrun? I think * XXX PF_KEY socket apps will not mind current behavior. */ if (!skb) return -ENOMEM; rcu_read_lock(); sk_for_each_rcu(sk, &net_pfkey->table) { struct pfkey_sock *pfk = pfkey_sk(sk); int err2; /* Yes, it means that if you are meant to receive this * pfkey message you receive it twice as promiscuous * socket. */ if (pfk->promisc) pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* the exact target will be processed later */ if (sk == one_sk) continue; if (broadcast_flags != BROADCAST_ALL) { if (broadcast_flags & BROADCAST_PROMISC_ONLY) continue; if ((broadcast_flags & BROADCAST_REGISTERED) && !pfk->registered) continue; if (broadcast_flags & BROADCAST_ONE) continue; } err2 = pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* Error is cleared after successful sending to at least one * registered KM */ if ((broadcast_flags & BROADCAST_REGISTERED) && err) err = err2; } rcu_read_unlock(); if (one_sk != NULL) err = pfkey_broadcast_one(skb, allocation, one_sk); kfree_skb(skb); return err; } static int pfkey_do_dump(struct pfkey_sock *pfk) { struct sadb_msg *hdr; int rc; mutex_lock(&pfk->dump_lock); if (!pfk->dump.dump) { rc = 0; goto out; } rc = pfk->dump.dump(pfk); if (rc == -ENOBUFS) { rc = 0; goto out; } if (pfk->dump.skb) { if (!pfkey_can_dump(&pfk->sk)) { rc = 0; goto out; } hdr = (struct sadb_msg *) pfk->dump.skb->data; hdr->sadb_msg_seq = 0; hdr->sadb_msg_errno = rc; pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = NULL; } pfkey_terminate_dump(pfk); out: mutex_unlock(&pfk->dump_lock); return rc; } static inline void pfkey_hdr_dup(struct sadb_msg *new, const struct sadb_msg *orig) { *new = *orig; } static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) { struct sk_buff *skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL); struct sadb_msg *hdr; if (!skb) return -ENOBUFS; /* Woe be to the platform trying to support PFKEY yet * having normal errnos outside the 1-255 range, inclusive. */ err = -err; if (err == ERESTARTSYS || err == ERESTARTNOHAND || err == ERESTARTNOINTR) err = EINTR; if (err >= 512) err = EINVAL; BUG_ON(err <= 0 || err >= 256); hdr = skb_put(skb, sizeof(struct sadb_msg)); pfkey_hdr_dup(hdr, orig); hdr->sadb_msg_errno = (uint8_t) err; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk)); return 0; } static const u8 sadb_ext_min_len[] = { [SADB_EXT_RESERVED] = (u8) 0, [SADB_EXT_SA] = (u8) sizeof(struct sadb_sa), [SADB_EXT_LIFETIME_CURRENT] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_LIFETIME_HARD] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_LIFETIME_SOFT] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_ADDRESS_SRC] = (u8) sizeof(struct sadb_address), [SADB_EXT_ADDRESS_DST] = (u8) sizeof(struct sadb_address), [SADB_EXT_ADDRESS_PROXY] = (u8) sizeof(struct sadb_address), [SADB_EXT_KEY_AUTH] = (u8) sizeof(struct sadb_key), [SADB_EXT_KEY_ENCRYPT] = (u8) sizeof(struct sadb_key), [SADB_EXT_IDENTITY_SRC] = (u8) sizeof(struct sadb_ident), [SADB_EXT_IDENTITY_DST] = (u8) sizeof(struct sadb_ident), [SADB_EXT_SENSITIVITY] = (u8) sizeof(struct sadb_sens), [SADB_EXT_PROPOSAL] = (u8) sizeof(struct sadb_prop), [SADB_EXT_SUPPORTED_AUTH] = (u8) sizeof(struct sadb_supported), [SADB_EXT_SUPPORTED_ENCRYPT] = (u8) sizeof(struct sadb_supported), [SADB_EXT_SPIRANGE] = (u8) sizeof(struct sadb_spirange), [SADB_X_EXT_KMPRIVATE] = (u8) sizeof(struct sadb_x_kmprivate), [SADB_X_EXT_POLICY] = (u8) sizeof(struct sadb_x_policy), [SADB_X_EXT_SA2] = (u8) sizeof(struct sadb_x_sa2), [SADB_X_EXT_NAT_T_TYPE] = (u8) sizeof(struct sadb_x_nat_t_type), [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx), [SADB_X_EXT_KMADDRESS] = (u8) sizeof(struct sadb_x_kmaddress), [SADB_X_EXT_FILTER] = (u8) sizeof(struct sadb_x_filter), }; /* Verify sadb_address_{len,prefixlen} against sa_family. */ static int verify_address_len(const void *p) { const struct sadb_address *sp = p; const struct sockaddr *addr = (const struct sockaddr *)(sp + 1); const struct sockaddr_in *sin; #if IS_ENABLED(CONFIG_IPV6) const struct sockaddr_in6 *sin6; #endif int len; if (sp->sadb_address_len < DIV_ROUND_UP(sizeof(*sp) + offsetofend(typeof(*addr), sa_family), sizeof(uint64_t))) return -EINVAL; switch (addr->sa_family) { case AF_INET: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t)); if (sp->sadb_address_len != len || sp->sadb_address_prefixlen > 32) return -EINVAL; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin6), sizeof(uint64_t)); if (sp->sadb_address_len != len || sp->sadb_address_prefixlen > 128) return -EINVAL; break; #endif default: /* It is user using kernel to keep track of security * associations for another protocol, such as * OSPF/RSVP/RIPV2/MIP. It is user's job to verify * lengths. * * XXX Actually, association/policy database is not yet * XXX able to cope with arbitrary sockaddr families. * XXX When it can, remove this -EINVAL. -DaveM */ return -EINVAL; } return 0; } static inline int sadb_key_len(const struct sadb_key *key) { int key_bytes = DIV_ROUND_UP(key->sadb_key_bits, 8); return DIV_ROUND_UP(sizeof(struct sadb_key) + key_bytes, sizeof(uint64_t)); } static int verify_key_len(const void *p) { const struct sadb_key *key = p; if (sadb_key_len(key) > key->sadb_key_len) return -EINVAL; return 0; } static inline int pfkey_sec_ctx_len(const struct sadb_x_sec_ctx *sec_ctx) { return DIV_ROUND_UP(sizeof(struct sadb_x_sec_ctx) + sec_ctx->sadb_x_ctx_len, sizeof(uint64_t)); } static inline int verify_sec_ctx_len(const void *p) { const struct sadb_x_sec_ctx *sec_ctx = p; int len = sec_ctx->sadb_x_ctx_len; if (len > PAGE_SIZE) return -EINVAL; len = pfkey_sec_ctx_len(sec_ctx); if (sec_ctx->sadb_x_sec_len != len) return -EINVAL; return 0; } static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(const struct sadb_x_sec_ctx *sec_ctx, gfp_t gfp) { struct xfrm_user_sec_ctx *uctx = NULL; int ctx_size = sec_ctx->sadb_x_ctx_len; uctx = kmalloc((sizeof(*uctx)+ctx_size), gfp); if (!uctx) return NULL; uctx->len = pfkey_sec_ctx_len(sec_ctx); uctx->exttype = sec_ctx->sadb_x_sec_exttype; uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi; uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg; uctx->ctx_len = sec_ctx->sadb_x_ctx_len; memcpy(uctx + 1, sec_ctx + 1, uctx->ctx_len); return uctx; } static int present_and_same_family(const struct sadb_address *src, const struct sadb_address *dst) { const struct sockaddr *s_addr, *d_addr; if (!src || !dst) return 0; s_addr = (const struct sockaddr *)(src + 1); d_addr = (const struct sockaddr *)(dst + 1); if (s_addr->sa_family != d_addr->sa_family) return 0; if (s_addr->sa_family != AF_INET #if IS_ENABLED(CONFIG_IPV6) && s_addr->sa_family != AF_INET6 #endif ) return 0; return 1; } static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void **ext_hdrs) { const char *p = (char *) hdr; int len = skb->len; len -= sizeof(*hdr); p += sizeof(*hdr); while (len > 0) { const struct sadb_ext *ehdr = (const struct sadb_ext *) p; uint16_t ext_type; int ext_len; if (len < sizeof(*ehdr)) return -EINVAL; ext_len = ehdr->sadb_ext_len; ext_len *= sizeof(uint64_t); ext_type = ehdr->sadb_ext_type; if (ext_len < sizeof(uint64_t) || ext_len > len || ext_type == SADB_EXT_RESERVED) return -EINVAL; if (ext_type <= SADB_EXT_MAX) { int min = (int) sadb_ext_min_len[ext_type]; if (ext_len < min) return -EINVAL; if (ext_hdrs[ext_type-1] != NULL) return -EINVAL; switch (ext_type) { case SADB_EXT_ADDRESS_SRC: case SADB_EXT_ADDRESS_DST: case SADB_EXT_ADDRESS_PROXY: case SADB_X_EXT_NAT_T_OA: if (verify_address_len(p)) return -EINVAL; break; case SADB_X_EXT_SEC_CTX: if (verify_sec_ctx_len(p)) return -EINVAL; break; case SADB_EXT_KEY_AUTH: case SADB_EXT_KEY_ENCRYPT: if (verify_key_len(p)) return -EINVAL; break; default: break; } ext_hdrs[ext_type-1] = (void *) p; } p += ext_len; len -= ext_len; } return 0; } static uint16_t pfkey_satype2proto(uint8_t satype) { switch (satype) { case SADB_SATYPE_UNSPEC: return IPSEC_PROTO_ANY; case SADB_SATYPE_AH: return IPPROTO_AH; case SADB_SATYPE_ESP: return IPPROTO_ESP; case SADB_X_SATYPE_IPCOMP: return IPPROTO_COMP; default: return 0; } /* NOTREACHED */ } static uint8_t pfkey_proto2satype(uint16_t proto) { switch (proto) { case IPPROTO_AH: return SADB_SATYPE_AH; case IPPROTO_ESP: return SADB_SATYPE_ESP; case IPPROTO_COMP: return SADB_X_SATYPE_IPCOMP; default: return 0; } /* NOTREACHED */ } /* BTW, this scheme means that there is no way with PFKEY2 sockets to * say specifically 'just raw sockets' as we encode them as 255. */ static uint8_t pfkey_proto_to_xfrm(uint8_t proto) { return proto == IPSEC_PROTO_ANY ? 0 : proto; } static uint8_t pfkey_proto_from_xfrm(uint8_t proto) { return proto ? proto : IPSEC_PROTO_ANY; } static inline int pfkey_sockaddr_len(sa_family_t family) { switch (family) { case AF_INET: return sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return sizeof(struct sockaddr_in6); #endif } return 0; } static int pfkey_sockaddr_extract(const struct sockaddr *sa, xfrm_address_t *xaddr) { switch (sa->sa_family) { case AF_INET: xaddr->a4 = ((struct sockaddr_in *)sa)->sin_addr.s_addr; return AF_INET; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: memcpy(xaddr->a6, &((struct sockaddr_in6 *)sa)->sin6_addr, sizeof(struct in6_addr)); return AF_INET6; #endif } return 0; } static int pfkey_sadb_addr2xfrm_addr(const struct sadb_address *addr, xfrm_address_t *xaddr) { return pfkey_sockaddr_extract((struct sockaddr *)(addr + 1), xaddr); } static struct xfrm_state *pfkey_xfrm_state_lookup(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs) { const struct sadb_sa *sa; const struct sadb_address *addr; uint16_t proto; unsigned short family; xfrm_address_t *xaddr; sa = ext_hdrs[SADB_EXT_SA - 1]; if (sa == NULL) return NULL; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return NULL; /* sadb_address_len should be checked by caller */ addr = ext_hdrs[SADB_EXT_ADDRESS_DST - 1]; if (addr == NULL) return NULL; family = ((const struct sockaddr *)(addr + 1))->sa_family; switch (family) { case AF_INET: xaddr = (xfrm_address_t *)&((const struct sockaddr_in *)(addr + 1))->sin_addr; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: xaddr = (xfrm_address_t *)&((const struct sockaddr_in6 *)(addr + 1))->sin6_addr; break; #endif default: xaddr = NULL; } if (!xaddr) return NULL; return xfrm_state_lookup(net, DUMMY_MARK, xaddr, sa->sadb_sa_spi, proto, family); } #define PFKEY_ALIGN8(a) (1 + (((a) - 1) | (8 - 1))) static int pfkey_sockaddr_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family)); } static inline int pfkey_mode_from_xfrm(int mode) { switch(mode) { case XFRM_MODE_TRANSPORT: return IPSEC_MODE_TRANSPORT; case XFRM_MODE_TUNNEL: return IPSEC_MODE_TUNNEL; case XFRM_MODE_BEET: return IPSEC_MODE_BEET; default: return -1; } } static inline int pfkey_mode_to_xfrm(int mode) { switch(mode) { case IPSEC_MODE_ANY: /*XXX*/ case IPSEC_MODE_TRANSPORT: return XFRM_MODE_TRANSPORT; case IPSEC_MODE_TUNNEL: return XFRM_MODE_TUNNEL; case IPSEC_MODE_BEET: return XFRM_MODE_BEET; default: return -1; } } static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port, struct sockaddr *sa, unsigned short family) { switch (family) { case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sa; sin->sin_family = AF_INET; sin->sin_port = port; sin->sin_addr.s_addr = xaddr->a4; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return 32; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; sin6->sin6_family = AF_INET6; sin6->sin6_port = port; sin6->sin6_flowinfo = 0; sin6->sin6_addr = xaddr->in6; sin6->sin6_scope_id = 0; return 128; } #endif } return 0; } static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, int add_keys, int hsc) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_sa *sa; struct sadb_lifetime *lifetime; struct sadb_address *addr; struct sadb_key *key; struct sadb_x_sa2 *sa2; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; int size; int auth_key_size = 0; int encrypt_key_size = 0; int sockaddr_size; struct xfrm_encap_tmpl *natt = NULL; int mode; /* address family check */ sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return ERR_PTR(-EINVAL); /* base, SA, (lifetime (HSC),) address(SD), (address(P),) key(AE), (identity(SD),) (sensitivity)> */ size = sizeof(struct sadb_msg) +sizeof(struct sadb_sa) + sizeof(struct sadb_lifetime) + ((hsc & 1) ? sizeof(struct sadb_lifetime) : 0) + ((hsc & 2) ? sizeof(struct sadb_lifetime) : 0) + sizeof(struct sadb_address)*2 + sockaddr_size*2 + sizeof(struct sadb_x_sa2); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } /* identity & sensitivity */ if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) size += sizeof(struct sadb_address) + sockaddr_size; if (add_keys) { if (x->aalg && x->aalg->alg_key_len) { auth_key_size = PFKEY_ALIGN8((x->aalg->alg_key_len + 7) / 8); size += sizeof(struct sadb_key) + auth_key_size; } if (x->ealg && x->ealg->alg_key_len) { encrypt_key_size = PFKEY_ALIGN8((x->ealg->alg_key_len+7) / 8); size += sizeof(struct sadb_key) + encrypt_key_size; } } if (x->encap) natt = x->encap; if (natt && natt->encap_type) { size += sizeof(struct sadb_x_nat_t_type); size += sizeof(struct sadb_x_nat_t_port); size += sizeof(struct sadb_x_nat_t_port); } skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return ERR_PTR(-ENOBUFS); /* call should fill header later */ hdr = skb_put(skb, sizeof(struct sadb_msg)); memset(hdr, 0, size); /* XXX do we need this ? */ hdr->sadb_msg_len = size / sizeof(uint64_t); /* sa */ sa = skb_put(skb, sizeof(struct sadb_sa)); sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); sa->sadb_sa_exttype = SADB_EXT_SA; sa->sadb_sa_spi = x->id.spi; sa->sadb_sa_replay = x->props.replay_window; switch (x->km.state) { case XFRM_STATE_VALID: sa->sadb_sa_state = x->km.dying ? SADB_SASTATE_DYING : SADB_SASTATE_MATURE; break; case XFRM_STATE_ACQ: sa->sadb_sa_state = SADB_SASTATE_LARVAL; break; default: sa->sadb_sa_state = SADB_SASTATE_DEAD; break; } sa->sadb_sa_auth = 0; if (x->aalg) { struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0); sa->sadb_sa_auth = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } sa->sadb_sa_encrypt = 0; BUG_ON(x->ealg && x->calg); if (x->ealg) { struct xfrm_algo_desc *a = xfrm_ealg_get_byname(x->ealg->alg_name, 0); sa->sadb_sa_encrypt = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } /* KAME compatible: sadb_sa_encrypt is overloaded with calg id */ if (x->calg) { struct xfrm_algo_desc *a = xfrm_calg_get_byname(x->calg->alg_name, 0); sa->sadb_sa_encrypt = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } sa->sadb_sa_flags = 0; if (x->props.flags & XFRM_STATE_NOECN) sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN; if (x->props.flags & XFRM_STATE_DECAP_DSCP) sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP; if (x->props.flags & XFRM_STATE_NOPMTUDISC) sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC; /* hard time */ if (hsc & 2) { lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.hard_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.hard_byte_limit); lifetime->sadb_lifetime_addtime = x->lft.hard_add_expires_seconds; lifetime->sadb_lifetime_usetime = x->lft.hard_use_expires_seconds; } /* soft time */ if (hsc & 1) { lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.soft_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.soft_byte_limit); lifetime->sadb_lifetime_addtime = x->lft.soft_add_expires_seconds; lifetime->sadb_lifetime_usetime = x->lft.soft_use_expires_seconds; } /* current time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lifetime->sadb_lifetime_allocations = x->curlft.packets; lifetime->sadb_lifetime_bytes = x->curlft.bytes; lifetime->sadb_lifetime_addtime = x->curlft.add_time; lifetime->sadb_lifetime_usetime = x->curlft.use_time; /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; /* "if the ports are non-zero, then the sadb_address_proto field, normally zero, MUST be filled in with the transport protocol's number." - RFC2367 */ addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); BUG_ON(!addr->sadb_address_prefixlen); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); BUG_ON(!addr->sadb_address_prefixlen); if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) { addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY; addr->sadb_address_proto = pfkey_proto_from_xfrm(x->sel.proto); addr->sadb_address_prefixlen = x->sel.prefixlen_s; addr->sadb_address_reserved = 0; pfkey_sockaddr_fill(&x->sel.saddr, x->sel.sport, (struct sockaddr *) (addr + 1), x->props.family); } /* auth key */ if (add_keys && auth_key_size) { key = skb_put(skb, sizeof(struct sadb_key) + auth_key_size); key->sadb_key_len = (sizeof(struct sadb_key) + auth_key_size) / sizeof(uint64_t); key->sadb_key_exttype = SADB_EXT_KEY_AUTH; key->sadb_key_bits = x->aalg->alg_key_len; key->sadb_key_reserved = 0; memcpy(key + 1, x->aalg->alg_key, (x->aalg->alg_key_len+7)/8); } /* encrypt key */ if (add_keys && encrypt_key_size) { key = skb_put(skb, sizeof(struct sadb_key) + encrypt_key_size); key->sadb_key_len = (sizeof(struct sadb_key) + encrypt_key_size) / sizeof(uint64_t); key->sadb_key_exttype = SADB_EXT_KEY_ENCRYPT; key->sadb_key_bits = x->ealg->alg_key_len; key->sadb_key_reserved = 0; memcpy(key + 1, x->ealg->alg_key, (x->ealg->alg_key_len+7)/8); } /* sa */ sa2 = skb_put(skb, sizeof(struct sadb_x_sa2)); sa2->sadb_x_sa2_len = sizeof(struct sadb_x_sa2)/sizeof(uint64_t); sa2->sadb_x_sa2_exttype = SADB_X_EXT_SA2; if ((mode = pfkey_mode_from_xfrm(x->props.mode)) < 0) { kfree_skb(skb); return ERR_PTR(-EINVAL); } sa2->sadb_x_sa2_mode = mode; sa2->sadb_x_sa2_reserved1 = 0; sa2->sadb_x_sa2_reserved2 = 0; sa2->sadb_x_sa2_sequence = 0; sa2->sadb_x_sa2_reqid = x->props.reqid; if (natt && natt->encap_type) { struct sadb_x_nat_t_type *n_type; struct sadb_x_nat_t_port *n_port; /* type */ n_type = skb_put(skb, sizeof(*n_type)); n_type->sadb_x_nat_t_type_len = sizeof(*n_type)/sizeof(uint64_t); n_type->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE; n_type->sadb_x_nat_t_type_type = natt->encap_type; n_type->sadb_x_nat_t_type_reserved[0] = 0; n_type->sadb_x_nat_t_type_reserved[1] = 0; n_type->sadb_x_nat_t_type_reserved[2] = 0; /* source port */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; n_port->sadb_x_nat_t_port_port = natt->encap_sport; n_port->sadb_x_nat_t_port_reserved = 0; /* dest port */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; n_port->sadb_x_nat_t_port_port = natt->encap_dport; n_port->sadb_x_nat_t_port_reserved = 0; } /* security context */ if (xfrm_ctx) { sec_ctx = skb_put(skb, sizeof(struct sadb_x_sec_ctx) + ctx_size); sec_ctx->sadb_x_sec_len = (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } return skb; } static inline struct sk_buff *pfkey_xfrm_state2msg(const struct xfrm_state *x) { struct sk_buff *skb; skb = __pfkey_xfrm_state2msg(x, 1, 3); return skb; } static inline struct sk_buff *pfkey_xfrm_state2msg_expire(const struct xfrm_state *x, int hsc) { return __pfkey_xfrm_state2msg(x, 0, hsc); } static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct xfrm_state *x; const struct sadb_lifetime *lifetime; const struct sadb_sa *sa; const struct sadb_key *key; const struct sadb_x_sec_ctx *sec_ctx; uint16_t proto; int err; sa = ext_hdrs[SADB_EXT_SA - 1]; if (!sa || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return ERR_PTR(-EINVAL); if (hdr->sadb_msg_satype == SADB_SATYPE_ESP && !ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]) return ERR_PTR(-EINVAL); if (hdr->sadb_msg_satype == SADB_SATYPE_AH && !ext_hdrs[SADB_EXT_KEY_AUTH-1]) return ERR_PTR(-EINVAL); if (!!ext_hdrs[SADB_EXT_LIFETIME_HARD-1] != !!ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) return ERR_PTR(-EINVAL); proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return ERR_PTR(-EINVAL); /* default error is no buffer space */ err = -ENOBUFS; /* RFC2367: Only SADB_SASTATE_MATURE SAs may be submitted in an SADB_ADD message. SADB_SASTATE_LARVAL SAs are created by SADB_GETSPI and it is not sensible to add a new SA in the DYING or SADB_SASTATE_DEAD state. Therefore, the sadb_sa_state field of all submitted SAs MUST be SADB_SASTATE_MATURE and the kernel MUST return an error if this is not true. However, KAME setkey always uses SADB_SASTATE_LARVAL. Hence, we have to _ignore_ sadb_sa_state, which is also reasonable. */ if (sa->sadb_sa_auth > SADB_AALG_MAX || (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP && sa->sadb_sa_encrypt > SADB_X_CALG_MAX) || sa->sadb_sa_encrypt > SADB_EALG_MAX) return ERR_PTR(-EINVAL); key = ext_hdrs[SADB_EXT_KEY_AUTH - 1]; if (key != NULL && sa->sadb_sa_auth != SADB_X_AALG_NULL && key->sadb_key_bits == 0) return ERR_PTR(-EINVAL); key = ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; if (key != NULL && sa->sadb_sa_encrypt != SADB_EALG_NULL && key->sadb_key_bits == 0) return ERR_PTR(-EINVAL); x = xfrm_state_alloc(net); if (x == NULL) return ERR_PTR(-ENOBUFS); x->id.proto = proto; x->id.spi = sa->sadb_sa_spi; x->props.replay_window = min_t(unsigned int, sa->sadb_sa_replay, (sizeof(x->replay.bitmap) * 8)); if (sa->sadb_sa_flags & SADB_SAFLAGS_NOECN) x->props.flags |= XFRM_STATE_NOECN; if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP) x->props.flags |= XFRM_STATE_DECAP_DSCP; if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC) x->props.flags |= XFRM_STATE_NOPMTUDISC; lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD - 1]; if (lifetime != NULL) { x->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); x->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); x->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; } lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT - 1]; if (lifetime != NULL) { x->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); x->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) goto out; err = security_xfrm_state_alloc(x, uctx); kfree(uctx); if (err) goto out; } err = -ENOBUFS; key = ext_hdrs[SADB_EXT_KEY_AUTH - 1]; if (sa->sadb_sa_auth) { int keysize = 0; struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } if (key) keysize = (key->sadb_key_bits + 7) / 8; x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL); if (!x->aalg) { err = -ENOMEM; goto out; } strcpy(x->aalg->alg_name, a->name); x->aalg->alg_key_len = 0; if (key) { x->aalg->alg_key_len = key->sadb_key_bits; memcpy(x->aalg->alg_key, key+1, keysize); } x->aalg->alg_trunc_len = a->uinfo.auth.icv_truncbits; x->props.aalgo = sa->sadb_sa_auth; /* x->algo.flags = sa->sadb_sa_flags; */ } if (sa->sadb_sa_encrypt) { if (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP) { struct xfrm_algo_desc *a = xfrm_calg_get_byid(sa->sadb_sa_encrypt); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL); if (!x->calg) { err = -ENOMEM; goto out; } strcpy(x->calg->alg_name, a->name); x->props.calgo = sa->sadb_sa_encrypt; } else { int keysize = 0; struct xfrm_algo_desc *a = xfrm_ealg_get_byid(sa->sadb_sa_encrypt); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; if (key) keysize = (key->sadb_key_bits + 7) / 8; x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL); if (!x->ealg) { err = -ENOMEM; goto out; } strcpy(x->ealg->alg_name, a->name); x->ealg->alg_key_len = 0; if (key) { x->ealg->alg_key_len = key->sadb_key_bits; memcpy(x->ealg->alg_key, key+1, keysize); } x->props.ealgo = sa->sadb_sa_encrypt; x->geniv = a->uinfo.encr.geniv; } } /* x->algo.flags = sa->sadb_sa_flags; */ x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1], &x->props.saddr); pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1], &x->id.daddr); if (ext_hdrs[SADB_X_EXT_SA2-1]) { const struct sadb_x_sa2 *sa2 = ext_hdrs[SADB_X_EXT_SA2-1]; int mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode); if (mode < 0) { err = -EINVAL; goto out; } x->props.mode = mode; x->props.reqid = sa2->sadb_x_sa2_reqid; } if (ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]) { const struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]; /* Nobody uses this, but we try. */ x->sel.family = pfkey_sadb_addr2xfrm_addr(addr, &x->sel.saddr); x->sel.prefixlen_s = addr->sadb_address_prefixlen; } if (!x->sel.family) x->sel.family = x->props.family; if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) { const struct sadb_x_nat_t_type* n_type; struct xfrm_encap_tmpl *natt; x->encap = kzalloc(sizeof(*x->encap), GFP_KERNEL); if (!x->encap) { err = -ENOMEM; goto out; } natt = x->encap; n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]; natt->encap_type = n_type->sadb_x_nat_t_type_type; if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) { const struct sadb_x_nat_t_port *n_port = ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]; natt->encap_sport = n_port->sadb_x_nat_t_port_port; } if (ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]) { const struct sadb_x_nat_t_port *n_port = ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]; natt->encap_dport = n_port->sadb_x_nat_t_port_port; } } err = xfrm_init_state(x); if (err) goto out; x->km.seq = hdr->sadb_msg_seq; return x; out: x->km.state = XFRM_STATE_DEAD; xfrm_state_put(x); return ERR_PTR(err); } static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { return -EOPNOTSUPP; } static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct sk_buff *resp_skb; struct sadb_x_sa2 *sa2; struct sadb_address *saddr, *daddr; struct sadb_msg *out_hdr; struct sadb_spirange *range; struct xfrm_state *x = NULL; int mode; int err; u32 min_spi, max_spi; u32 reqid; u8 proto; unsigned short family; xfrm_address_t *xsaddr = NULL, *xdaddr = NULL; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return -EINVAL; if ((sa2 = ext_hdrs[SADB_X_EXT_SA2-1]) != NULL) { mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode); if (mode < 0) return -EINVAL; reqid = sa2->sadb_x_sa2_reqid; } else { mode = 0; reqid = 0; } saddr = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; daddr = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; family = ((struct sockaddr *)(saddr + 1))->sa_family; switch (family) { case AF_INET: xdaddr = (xfrm_address_t *)&((struct sockaddr_in *)(daddr + 1))->sin_addr.s_addr; xsaddr = (xfrm_address_t *)&((struct sockaddr_in *)(saddr + 1))->sin_addr.s_addr; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: xdaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(daddr + 1))->sin6_addr; xsaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(saddr + 1))->sin6_addr; break; #endif } if (hdr->sadb_msg_seq) { x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) { xfrm_state_put(x); x = NULL; } } if (!x) x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family); if (x == NULL) return -ENOENT; min_spi = 0x100; max_spi = 0x0fffffff; range = ext_hdrs[SADB_EXT_SPIRANGE-1]; if (range) { min_spi = range->sadb_spirange_min; max_spi = range->sadb_spirange_max; } err = verify_spi_info(x->id.proto, min_spi, max_spi, NULL); if (err) { xfrm_state_put(x); return err; } err = xfrm_alloc_spi(x, min_spi, max_spi, NULL); resp_skb = err ? ERR_PTR(err) : pfkey_xfrm_state2msg(x); if (IS_ERR(resp_skb)) { xfrm_state_put(x); return PTR_ERR(resp_skb); } out_hdr = (struct sadb_msg *) resp_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = SADB_GETSPI; out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; xfrm_state_put(x); pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net); return 0; } static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; if (hdr->sadb_msg_len != sizeof(struct sadb_msg)/8) return -EOPNOTSUPP; if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0) return 0; x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); if (x == NULL) return 0; spin_lock_bh(&x->lock); if (x->km.state == XFRM_STATE_ACQ) x->km.state = XFRM_STATE_ERROR; spin_unlock_bh(&x->lock); xfrm_state_put(x); return 0; } static inline int event2poltype(int event) { switch (event) { case XFRM_MSG_DELPOLICY: return SADB_X_SPDDELETE; case XFRM_MSG_NEWPOLICY: return SADB_X_SPDADD; case XFRM_MSG_UPDPOLICY: return SADB_X_SPDUPDATE; case XFRM_MSG_POLEXPIRE: // return SADB_X_SPDEXPIRE; default: pr_err("pfkey: Unknown policy event %d\n", event); break; } return 0; } static inline int event2keytype(int event) { switch (event) { case XFRM_MSG_DELSA: return SADB_DELETE; case XFRM_MSG_NEWSA: return SADB_ADD; case XFRM_MSG_UPDSA: return SADB_UPDATE; case XFRM_MSG_EXPIRE: return SADB_EXPIRE; default: pr_err("pfkey: Unknown SA event %d\n", event); break; } return 0; } /* ADD/UPD/DEL */ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c) { struct sk_buff *skb; struct sadb_msg *hdr; skb = pfkey_xfrm_state2msg(x); if (IS_ERR(skb)) return PTR_ERR(skb); hdr = (struct sadb_msg *) skb->data; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = event2keytype(c->event); hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x)); return 0; } static int pfkey_add(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; int err; struct km_event c; x = pfkey_msg2xfrm_state(net, hdr, ext_hdrs); if (IS_ERR(x)) return PTR_ERR(x); xfrm_state_hold(x); if (hdr->sadb_msg_type == SADB_ADD) err = xfrm_state_add(x); else err = xfrm_state_update(x); xfrm_audit_state_add(x, err ? 0 : 1, true); if (err < 0) { x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); goto out; } if (hdr->sadb_msg_type == SADB_ADD) c.event = XFRM_MSG_NEWSA; else c.event = XFRM_MSG_UPDSA; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; km_state_notify(x, &c); out: xfrm_state_put(x); return err; } static int pfkey_delete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; struct km_event c; int err; if (!ext_hdrs[SADB_EXT_SA-1] || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs); if (x == NULL) return -ESRCH; if ((err = security_xfrm_state_delete(x))) goto out; if (xfrm_state_kern(x)) { err = -EPERM; goto out; } err = xfrm_state_delete(x); if (err < 0) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.event = XFRM_MSG_DELSA; km_state_notify(x, &c); out: xfrm_audit_state_delete(x, err ? 0 : 1, true); xfrm_state_put(x); return err; } static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); __u8 proto; struct sk_buff *out_skb; struct sadb_msg *out_hdr; struct xfrm_state *x; if (!ext_hdrs[SADB_EXT_SA-1] || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs); if (x == NULL) return -ESRCH; out_skb = pfkey_xfrm_state2msg(x); proto = x->id.proto; xfrm_state_put(x); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = SADB_GET; out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); return 0; } static struct sk_buff *compose_sadb_supported(const struct sadb_msg *orig, gfp_t allocation) { struct sk_buff *skb; struct sadb_msg *hdr; int len, auth_len, enc_len, i; auth_len = xfrm_count_pfkey_auth_supported(); if (auth_len) { auth_len *= sizeof(struct sadb_alg); auth_len += sizeof(struct sadb_supported); } enc_len = xfrm_count_pfkey_enc_supported(); if (enc_len) { enc_len *= sizeof(struct sadb_alg); enc_len += sizeof(struct sadb_supported); } len = enc_len + auth_len + sizeof(struct sadb_msg); skb = alloc_skb(len + 16, allocation); if (!skb) goto out_put_algs; hdr = skb_put(skb, sizeof(*hdr)); pfkey_hdr_dup(hdr, orig); hdr->sadb_msg_errno = 0; hdr->sadb_msg_len = len / sizeof(uint64_t); if (auth_len) { struct sadb_supported *sp; struct sadb_alg *ap; sp = skb_put(skb, auth_len); ap = (struct sadb_alg *) (sp + 1); sp->sadb_supported_len = auth_len / sizeof(uint64_t); sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH; for (i = 0; ; i++) { struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg->available) *ap++ = aalg->desc; } } if (enc_len) { struct sadb_supported *sp; struct sadb_alg *ap; sp = skb_put(skb, enc_len); ap = (struct sadb_alg *) (sp + 1); sp->sadb_supported_len = enc_len / sizeof(uint64_t); sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT; for (i = 0; ; i++) { struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (ealg->available) *ap++ = ealg->desc; } } out_put_algs: return skb; } static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); struct sk_buff *supp_skb; if (hdr->sadb_msg_satype > SADB_SATYPE_MAX) return -EINVAL; if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) { if (pfk->registered&(1<<hdr->sadb_msg_satype)) return -EEXIST; pfk->registered |= (1<<hdr->sadb_msg_satype); } mutex_lock(&pfkey_mutex); xfrm_probe_algs(); supp_skb = compose_sadb_supported(hdr, GFP_KERNEL | __GFP_ZERO); mutex_unlock(&pfkey_mutex); if (!supp_skb) { if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) pfk->registered &= ~(1<<hdr->sadb_msg_satype); return -ENOBUFS; } pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, sock_net(sk)); return 0; } static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr) { struct sk_buff *skb; struct sadb_msg *hdr; skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb) return -ENOBUFS; hdr = skb_put_data(skb, ihdr, sizeof(struct sadb_msg)); hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); } static int key_notify_sa_flush(const struct km_event *c) { struct sk_buff *skb; struct sadb_msg *hdr; skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb) return -ENOBUFS; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_satype = pfkey_proto2satype(c->data.proto); hdr->sadb_msg_type = SADB_FLUSH; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); unsigned int proto; struct km_event c; int err, err2; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return -EINVAL; err = xfrm_state_flush(net, proto, true, false); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - go quietly */ err = 0; return err ? err : err2; } c.data.proto = proto; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.event = XFRM_MSG_FLUSHSA; c.net = net; km_state_notify(NULL, &c); return 0; } static int dump_sa(struct xfrm_state *x, int count, void *ptr) { struct pfkey_sock *pfk = ptr; struct sk_buff *out_skb; struct sadb_msg *out_hdr; if (!pfkey_can_dump(&pfk->sk)) return -ENOBUFS; out_skb = pfkey_xfrm_state2msg(x); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = pfk->dump.msg_version; out_hdr->sadb_msg_type = SADB_DUMP; out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = count + 1; out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; return 0; } static int pfkey_dump_sa(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); return xfrm_state_walk(net, &pfk->dump.u.state, dump_sa, (void *) pfk); } static void pfkey_dump_sa_done(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); xfrm_state_walk_done(&pfk->dump.u.state, net); } static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { u8 proto; struct xfrm_address_filter *filter = NULL; struct pfkey_sock *pfk = pfkey_sk(sk); mutex_lock(&pfk->dump_lock); if (pfk->dump.dump != NULL) { mutex_unlock(&pfk->dump_lock); return -EBUSY; } proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) { mutex_unlock(&pfk->dump_lock); return -EINVAL; } if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1]; if ((xfilter->sadb_x_filter_splen > (sizeof(xfrm_address_t) << 3)) || (xfilter->sadb_x_filter_dplen > (sizeof(xfrm_address_t) << 3))) { mutex_unlock(&pfk->dump_lock); return -EINVAL; } filter = kmalloc(sizeof(*filter), GFP_KERNEL); if (filter == NULL) { mutex_unlock(&pfk->dump_lock); return -ENOMEM; } memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr, sizeof(xfrm_address_t)); memcpy(&filter->daddr, &xfilter->sadb_x_filter_daddr, sizeof(xfrm_address_t)); filter->family = xfilter->sadb_x_filter_family; filter->splen = xfilter->sadb_x_filter_splen; filter->dplen = xfilter->sadb_x_filter_dplen; } pfk->dump.msg_version = hdr->sadb_msg_version; pfk->dump.msg_portid = hdr->sadb_msg_pid; pfk->dump.dump = pfkey_dump_sa; pfk->dump.done = pfkey_dump_sa_done; xfrm_state_walk_init(&pfk->dump.u.state, proto, filter); mutex_unlock(&pfk->dump_lock); return pfkey_do_dump(pfk); } static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); int satype = hdr->sadb_msg_satype; bool reset_errno = false; if (hdr->sadb_msg_len == (sizeof(*hdr) / sizeof(uint64_t))) { reset_errno = true; if (satype != 0 && satype != 1) return -EINVAL; pfk->promisc = satype; } if (reset_errno && skb_cloned(skb)) skb = skb_copy(skb, GFP_KERNEL); else skb = skb_clone(skb, GFP_KERNEL); if (reset_errno && skb) { struct sadb_msg *new_hdr = (struct sadb_msg *) skb->data; new_hdr->sadb_msg_errno = 0; } pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk)); return 0; } static int check_reqid(struct xfrm_policy *xp, int dir, int count, void *ptr) { int i; u32 reqid = *(u32*)ptr; for (i=0; i<xp->xfrm_nr; i++) { if (xp->xfrm_vec[i].reqid == reqid) return -EEXIST; } return 0; } static u32 gen_reqid(struct net *net) { struct xfrm_policy_walk walk; u32 start; int rc; static u32 reqid = IPSEC_MANUAL_REQID_MAX; start = reqid; do { ++reqid; if (reqid == 0) reqid = IPSEC_MANUAL_REQID_MAX+1; xfrm_policy_walk_init(&walk, XFRM_POLICY_TYPE_MAIN); rc = xfrm_policy_walk(net, &walk, check_reqid, (void*)&reqid); xfrm_policy_walk_done(&walk, net); if (rc != -EEXIST) return reqid; } while (reqid != start); return 0; } static int parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_policy *pol, struct sadb_x_ipsecrequest *rq) { struct net *net = xp_net(xp); struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr; int mode; if (xp->xfrm_nr >= XFRM_MAX_DEPTH) return -ELOOP; if (rq->sadb_x_ipsecrequest_mode == 0) return -EINVAL; if (!xfrm_id_proto_valid(rq->sadb_x_ipsecrequest_proto)) return -EINVAL; t->id.proto = rq->sadb_x_ipsecrequest_proto; if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; t->mode = mode; if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE) { if ((mode == XFRM_MODE_TUNNEL || mode == XFRM_MODE_BEET) && pol->sadb_x_policy_dir == IPSEC_DIR_OUTBOUND) return -EINVAL; t->optional = 1; } else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) { t->reqid = rq->sadb_x_ipsecrequest_reqid; if (t->reqid > IPSEC_MANUAL_REQID_MAX) t->reqid = 0; if (!t->reqid && !(t->reqid = gen_reqid(net))) return -ENOBUFS; } /* addresses present only in tunnel mode */ if (t->mode == XFRM_MODE_TUNNEL) { int err; err = parse_sockaddr_pair( (struct sockaddr *)(rq + 1), rq->sadb_x_ipsecrequest_len - sizeof(*rq), &t->saddr, &t->id.daddr, &t->encap_family); if (err) return err; } else t->encap_family = xp->family; /* No way to set this via kame pfkey */ t->allalgs = 1; xp->xfrm_nr++; return 0; } static int parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) { int err; int len = pol->sadb_x_policy_len*8 - sizeof(struct sadb_x_policy); struct sadb_x_ipsecrequest *rq = (void*)(pol+1); if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy)) return -EINVAL; while (len >= sizeof(*rq)) { if (len < rq->sadb_x_ipsecrequest_len || rq->sadb_x_ipsecrequest_len < sizeof(*rq)) return -EINVAL; if ((err = parse_ipsecrequest(xp, pol, rq)) < 0) return err; len -= rq->sadb_x_ipsecrequest_len; rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len); } return 0; } static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp) { struct xfrm_sec_ctx *xfrm_ctx = xp->security; if (xfrm_ctx) { int len = sizeof(struct sadb_x_sec_ctx); len += xfrm_ctx->ctx_len; return PFKEY_ALIGN8(len); } return 0; } static int pfkey_xfrm_policy2msg_size(const struct xfrm_policy *xp) { const struct xfrm_tmpl *t; int sockaddr_size = pfkey_sockaddr_size(xp->family); int socklen = 0; int i; for (i=0; i<xp->xfrm_nr; i++) { t = xp->xfrm_vec + i; socklen += pfkey_sockaddr_len(t->encap_family); } return sizeof(struct sadb_msg) + (sizeof(struct sadb_lifetime) * 3) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + sizeof(struct sadb_x_policy) + (xp->xfrm_nr * sizeof(struct sadb_x_ipsecrequest)) + (socklen * 2) + pfkey_xfrm_policy2sec_ctx_size(xp); } static struct sk_buff * pfkey_xfrm_policy2msg_prep(const struct xfrm_policy *xp) { struct sk_buff *skb; int size; size = pfkey_xfrm_policy2msg_size(xp); skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return ERR_PTR(-ENOBUFS); return skb; } static int pfkey_xfrm_policy2msg(struct sk_buff *skb, const struct xfrm_policy *xp, int dir) { struct sadb_msg *hdr; struct sadb_address *addr; struct sadb_lifetime *lifetime; struct sadb_x_policy *pol; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int i; int size; int sockaddr_size = pfkey_sockaddr_size(xp->family); int socklen = pfkey_sockaddr_len(xp->family); size = pfkey_xfrm_policy2msg_size(xp); /* call should fill header later */ hdr = skb_put(skb, sizeof(struct sadb_msg)); memset(hdr, 0, size); /* XXX do we need this ? */ /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); addr->sadb_address_prefixlen = xp->selector.prefixlen_s; addr->sadb_address_reserved = 0; if (!pfkey_sockaddr_fill(&xp->selector.saddr, xp->selector.sport, (struct sockaddr *) (addr + 1), xp->family)) BUG(); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); addr->sadb_address_prefixlen = xp->selector.prefixlen_d; addr->sadb_address_reserved = 0; pfkey_sockaddr_fill(&xp->selector.daddr, xp->selector.dport, (struct sockaddr *) (addr + 1), xp->family); /* hard time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.hard_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.hard_byte_limit); lifetime->sadb_lifetime_addtime = xp->lft.hard_add_expires_seconds; lifetime->sadb_lifetime_usetime = xp->lft.hard_use_expires_seconds; /* soft time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.soft_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.soft_byte_limit); lifetime->sadb_lifetime_addtime = xp->lft.soft_add_expires_seconds; lifetime->sadb_lifetime_usetime = xp->lft.soft_use_expires_seconds; /* current time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lifetime->sadb_lifetime_allocations = xp->curlft.packets; lifetime->sadb_lifetime_bytes = xp->curlft.bytes; lifetime->sadb_lifetime_addtime = xp->curlft.add_time; lifetime->sadb_lifetime_usetime = xp->curlft.use_time; pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_DISCARD; if (xp->action == XFRM_POLICY_ALLOW) { if (xp->xfrm_nr) pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; else pol->sadb_x_policy_type = IPSEC_POLICY_NONE; } pol->sadb_x_policy_dir = dir+1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = xp->index; pol->sadb_x_policy_priority = xp->priority; for (i=0; i<xp->xfrm_nr; i++) { const struct xfrm_tmpl *t = xp->xfrm_vec + i; struct sadb_x_ipsecrequest *rq; int req_size; int mode; req_size = sizeof(struct sadb_x_ipsecrequest); if (t->mode == XFRM_MODE_TUNNEL) { socklen = pfkey_sockaddr_len(t->encap_family); req_size += socklen * 2; } else { size -= 2*socklen; } rq = skb_put(skb, req_size); pol->sadb_x_policy_len += req_size/8; memset(rq, 0, sizeof(*rq)); rq->sadb_x_ipsecrequest_len = req_size; rq->sadb_x_ipsecrequest_proto = t->id.proto; if ((mode = pfkey_mode_from_xfrm(t->mode)) < 0) return -EINVAL; rq->sadb_x_ipsecrequest_mode = mode; rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_REQUIRE; if (t->reqid) rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_UNIQUE; if (t->optional) rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_USE; rq->sadb_x_ipsecrequest_reqid = t->reqid; if (t->mode == XFRM_MODE_TUNNEL) { u8 *sa = (void *)(rq + 1); pfkey_sockaddr_fill(&t->saddr, 0, (struct sockaddr *)sa, t->encap_family); pfkey_sockaddr_fill(&t->id.daddr, 0, (struct sockaddr *) (sa + socklen), t->encap_family); } } /* security context */ if ((xfrm_ctx = xp->security)) { int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp); sec_ctx = skb_put(skb, ctx_size); sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_reserved = refcount_read(&xp->refcnt); return 0; } static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c) { struct sk_buff *out_skb; struct sadb_msg *out_hdr; int err; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); return err; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; if (c->data.byid && c->event == XFRM_MSG_DELPOLICY) out_hdr->sadb_msg_type = SADB_X_SPDDELETE2; else out_hdr->sadb_msg_type = event2poltype(c->event); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = c->seq; out_hdr->sadb_msg_pid = c->portid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp)); return 0; } static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); int err = 0; struct sadb_lifetime *lifetime; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; struct sadb_x_sec_ctx *sec_ctx; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || !ext_hdrs[SADB_X_EXT_POLICY-1]) return -EINVAL; pol = ext_hdrs[SADB_X_EXT_POLICY-1]; if (pol->sadb_x_policy_type > IPSEC_POLICY_IPSEC) return -EINVAL; if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) return -EINVAL; xp = xfrm_policy_alloc(net, GFP_KERNEL); if (xp == NULL) return -ENOBUFS; xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); xp->priority = pol->sadb_x_policy_priority; sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr); xp->selector.family = xp->family; xp->selector.prefixlen_s = sa->sadb_address_prefixlen; xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); xp->selector.sport = ((struct sockaddr_in *)(sa+1))->sin_port; if (xp->selector.sport) xp->selector.sport_mask = htons(0xffff); sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr); xp->selector.prefixlen_d = sa->sadb_address_prefixlen; /* Amusing, we set this twice. KAME apps appear to set same value * in both addresses. */ xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); xp->selector.dport = ((struct sockaddr_in *)(sa+1))->sin_port; if (xp->selector.dport) xp->selector.dport_mask = htons(0xffff); sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) { err = -ENOBUFS; goto out; } err = security_xfrm_policy_alloc(&xp->security, uctx, GFP_KERNEL); kfree(uctx); if (err) goto out; } xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; xp->lft.hard_packet_limit = XFRM_INF; if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD-1]) != NULL) { xp->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); xp->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); xp->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; xp->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; } if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) != NULL) { xp->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); xp->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); xp->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; xp->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } xp->xfrm_nr = 0; if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && (err = parse_ipsecrequests(xp, pol)) < 0) goto out; err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, hdr->sadb_msg_type != SADB_X_SPDUPDATE); xfrm_audit_policy_add(xp, err ? 0 : 1, true); if (err) goto out; if (hdr->sadb_msg_type == SADB_X_SPDUPDATE) c.event = XFRM_MSG_UPDPOLICY; else c.event = XFRM_MSG_NEWPOLICY; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); xfrm_pol_put(xp); return 0; out: xp->walk.dead = 1; xfrm_policy_destroy(xp); return err; } static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); int err; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct xfrm_selector sel; struct km_event c; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *pol_ctx = NULL; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || !ext_hdrs[SADB_X_EXT_POLICY-1]) return -EINVAL; pol = ext_hdrs[SADB_X_EXT_POLICY-1]; if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) return -EINVAL; memset(&sel, 0, sizeof(sel)); sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); sel.prefixlen_s = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.sport = ((struct sockaddr_in *)(sa+1))->sin_port; if (sel.sport) sel.sport_mask = htons(0xffff); sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); sel.prefixlen_d = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.dport = ((struct sockaddr_in *)(sa+1))->sin_port; if (sel.dport) sel.dport_mask = htons(0xffff); sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) return -ENOMEM; err = security_xfrm_policy_alloc(&pol_ctx, uctx, GFP_KERNEL); kfree(uctx); if (err) return err; } xp = xfrm_policy_bysel_ctx(net, &dummy_mark, 0, XFRM_POLICY_TYPE_MAIN, pol->sadb_x_policy_dir - 1, &sel, pol_ctx, 1, &err); security_xfrm_policy_free(pol_ctx); if (xp == NULL) return -ENOENT; xfrm_audit_policy_delete(xp, err ? 0 : 1, true); if (err) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.data.byid = 0; c.event = XFRM_MSG_DELPOLICY; km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); out: xfrm_pol_put(xp); return err; } static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struct sadb_msg *hdr, int dir) { int err; struct sk_buff *out_skb; struct sadb_msg *out_hdr; err = 0; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) { err = PTR_ERR(out_skb); goto out; } err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); goto out; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = hdr->sadb_msg_type; out_hdr->sadb_msg_satype = 0; out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp)); err = 0; out: return err; } static int pfkey_sockaddr_pair_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2); } static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, xfrm_address_t *saddr, xfrm_address_t *daddr, u16 *family) { int af, socklen; if (ext_len < 2 || ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) return -EINVAL; af = pfkey_sockaddr_extract(sa, saddr); if (!af) return -EINVAL; socklen = pfkey_sockaddr_len(af); if (pfkey_sockaddr_extract((struct sockaddr *) (((u8 *)sa) + socklen), daddr) != af) return -EINVAL; *family = af; return 0; } #ifdef CONFIG_NET_KEY_MIGRATE static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, struct xfrm_migrate *m) { int err; struct sadb_x_ipsecrequest *rq2; int mode; if (len < sizeof(*rq1) || len < rq1->sadb_x_ipsecrequest_len || rq1->sadb_x_ipsecrequest_len < sizeof(*rq1)) return -EINVAL; /* old endoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1), rq1->sadb_x_ipsecrequest_len - sizeof(*rq1), &m->old_saddr, &m->old_daddr, &m->old_family); if (err) return err; rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len); len -= rq1->sadb_x_ipsecrequest_len; if (len <= sizeof(*rq2) || len < rq2->sadb_x_ipsecrequest_len || rq2->sadb_x_ipsecrequest_len < sizeof(*rq2)) return -EINVAL; /* new endpoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1), rq2->sadb_x_ipsecrequest_len - sizeof(*rq2), &m->new_saddr, &m->new_daddr, &m->new_family); if (err) return err; if (rq1->sadb_x_ipsecrequest_proto != rq2->sadb_x_ipsecrequest_proto || rq1->sadb_x_ipsecrequest_mode != rq2->sadb_x_ipsecrequest_mode || rq1->sadb_x_ipsecrequest_reqid != rq2->sadb_x_ipsecrequest_reqid) return -EINVAL; m->proto = rq1->sadb_x_ipsecrequest_proto; if ((mode = pfkey_mode_to_xfrm(rq1->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; m->mode = mode; m->reqid = rq1->sadb_x_ipsecrequest_reqid; return ((int)(rq1->sadb_x_ipsecrequest_len + rq2->sadb_x_ipsecrequest_len)); } static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { int i, len, ret, err = -EINVAL; u8 dir; struct sadb_address *sa; struct sadb_x_kmaddress *kma; struct sadb_x_policy *pol; struct sadb_x_ipsecrequest *rq; struct xfrm_selector sel; struct xfrm_migrate m[XFRM_MAX_DEPTH]; struct xfrm_kmaddress k; struct net *net = sock_net(sk); if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC - 1], ext_hdrs[SADB_EXT_ADDRESS_DST - 1]) || !ext_hdrs[SADB_X_EXT_POLICY - 1]) { err = -EINVAL; goto out; } kma = ext_hdrs[SADB_X_EXT_KMADDRESS - 1]; pol = ext_hdrs[SADB_X_EXT_POLICY - 1]; if (pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) { err = -EINVAL; goto out; } if (kma) { /* convert sadb_x_kmaddress to xfrm_kmaddress */ k.reserved = kma->sadb_x_kmaddress_reserved; ret = parse_sockaddr_pair((struct sockaddr *)(kma + 1), 8*(kma->sadb_x_kmaddress_len) - sizeof(*kma), &k.local, &k.remote, &k.family); if (ret < 0) { err = ret; goto out; } } dir = pol->sadb_x_policy_dir - 1; memset(&sel, 0, sizeof(sel)); /* set source address info of selector */ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC - 1]; sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); sel.prefixlen_s = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.sport = ((struct sockaddr_in *)(sa + 1))->sin_port; if (sel.sport) sel.sport_mask = htons(0xffff); /* set destination address info of selector */ sa = ext_hdrs[SADB_EXT_ADDRESS_DST - 1]; pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); sel.prefixlen_d = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.dport = ((struct sockaddr_in *)(sa + 1))->sin_port; if (sel.dport) sel.dport_mask = htons(0xffff); rq = (struct sadb_x_ipsecrequest *)(pol + 1); /* extract ipsecrequests */ i = 0; len = pol->sadb_x_policy_len * 8 - sizeof(struct sadb_x_policy); while (len > 0 && i < XFRM_MAX_DEPTH) { ret = ipsecrequests_to_migrate(rq, len, &m[i]); if (ret < 0) { err = ret; goto out; } else { rq = (struct sadb_x_ipsecrequest *)((u8 *)rq + ret); len -= ret; i++; } } if (!i || len > 0) { err = -EINVAL; goto out; } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, kma ? &k : NULL, net, NULL, 0, NULL); out: return err; } #else static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { return -ENOPROTOOPT; } #endif static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); unsigned int dir; int err = 0, delete; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL) return -EINVAL; dir = xfrm_policy_id2dir(pol->sadb_x_policy_id); if (dir >= XFRM_POLICY_MAX) return -EINVAL; delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2); xp = xfrm_policy_byid(net, &dummy_mark, 0, XFRM_POLICY_TYPE_MAIN, dir, pol->sadb_x_policy_id, delete, &err); if (xp == NULL) return -ENOENT; if (delete) { xfrm_audit_policy_delete(xp, err ? 0 : 1, true); if (err) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.data.byid = 1; c.event = XFRM_MSG_DELPOLICY; km_policy_notify(xp, dir, &c); } else { err = key_pol_get_resp(sk, xp, hdr, dir); } out: xfrm_pol_put(xp); return err; } static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) { struct pfkey_sock *pfk = ptr; struct sk_buff *out_skb; struct sadb_msg *out_hdr; int err; if (!pfkey_can_dump(&pfk->sk)) return -ENOBUFS; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); return err; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = pfk->dump.msg_version; out_hdr->sadb_msg_type = SADB_X_SPDDUMP; out_hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = count + 1; out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; return 0; } static int pfkey_dump_sp(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); return xfrm_policy_walk(net, &pfk->dump.u.policy, dump_sp, (void *) pfk); } static void pfkey_dump_sp_done(struct pfkey_sock *pfk) { struct net *net = sock_net((struct sock *)pfk); xfrm_policy_walk_done(&pfk->dump.u.policy, net); } static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); mutex_lock(&pfk->dump_lock); if (pfk->dump.dump != NULL) { mutex_unlock(&pfk->dump_lock); return -EBUSY; } pfk->dump.msg_version = hdr->sadb_msg_version; pfk->dump.msg_portid = hdr->sadb_msg_pid; pfk->dump.dump = pfkey_dump_sp; pfk->dump.done = pfkey_dump_sp_done; xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN); mutex_unlock(&pfk->dump_lock); return pfkey_do_dump(pfk); } static int key_notify_policy_flush(const struct km_event *c) { struct sk_buff *skb_out; struct sadb_msg *hdr; skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb_out) return -ENOBUFS; hdr = skb_put(skb_out, sizeof(struct sadb_msg)); hdr->sadb_msg_type = SADB_X_SPDFLUSH; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct km_event c; int err, err2; err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - old silent behavior */ return 0; return err; } c.data.type = XFRM_POLICY_TYPE_MAIN; c.event = XFRM_MSG_FLUSHPOLICY; c.portid = hdr->sadb_msg_pid; c.seq = hdr->sadb_msg_seq; c.net = net; km_policy_notify(NULL, 0, &c); return 0; } typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs); static const pfkey_handler pfkey_funcs[SADB_MAX + 1] = { [SADB_RESERVED] = pfkey_reserved, [SADB_GETSPI] = pfkey_getspi, [SADB_UPDATE] = pfkey_add, [SADB_ADD] = pfkey_add, [SADB_DELETE] = pfkey_delete, [SADB_GET] = pfkey_get, [SADB_ACQUIRE] = pfkey_acquire, [SADB_REGISTER] = pfkey_register, [SADB_EXPIRE] = NULL, [SADB_FLUSH] = pfkey_flush, [SADB_DUMP] = pfkey_dump, [SADB_X_PROMISC] = pfkey_promisc, [SADB_X_PCHANGE] = NULL, [SADB_X_SPDUPDATE] = pfkey_spdadd, [SADB_X_SPDADD] = pfkey_spdadd, [SADB_X_SPDDELETE] = pfkey_spddelete, [SADB_X_SPDGET] = pfkey_spdget, [SADB_X_SPDACQUIRE] = NULL, [SADB_X_SPDDUMP] = pfkey_spddump, [SADB_X_SPDFLUSH] = pfkey_spdflush, [SADB_X_SPDSETIDX] = pfkey_spdadd, [SADB_X_SPDDELETE2] = pfkey_spdget, [SADB_X_MIGRATE] = pfkey_migrate, }; static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr) { void *ext_hdrs[SADB_EXT_MAX]; int err; /* Non-zero return value of pfkey_broadcast() does not always signal * an error and even on an actual error we may still want to process * the message so rather ignore the return value. */ pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); memset(ext_hdrs, 0, sizeof(ext_hdrs)); err = parse_exthdrs(skb, hdr, ext_hdrs); if (!err) { err = -EOPNOTSUPP; if (pfkey_funcs[hdr->sadb_msg_type]) err = pfkey_funcs[hdr->sadb_msg_type](sk, skb, hdr, ext_hdrs); } return err; } static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp) { struct sadb_msg *hdr = NULL; if (skb->len < sizeof(*hdr)) { *errp = -EMSGSIZE; } else { hdr = (struct sadb_msg *) skb->data; if (hdr->sadb_msg_version != PF_KEY_V2 || hdr->sadb_msg_reserved != 0 || (hdr->sadb_msg_type <= SADB_RESERVED || hdr->sadb_msg_type > SADB_MAX)) { hdr = NULL; *errp = -EINVAL; } else if (hdr->sadb_msg_len != (skb->len / sizeof(uint64_t)) || hdr->sadb_msg_len < (sizeof(struct sadb_msg) / sizeof(uint64_t))) { hdr = NULL; *errp = -EMSGSIZE; } else { *errp = 0; } } return hdr; } static inline int aalg_tmpl_set(const struct xfrm_tmpl *t, const struct xfrm_algo_desc *d) { unsigned int id = d->desc.sadb_alg_id; if (id >= sizeof(t->aalgos) * 8) return 0; return (t->aalgos >> id) & 1; } static inline int ealg_tmpl_set(const struct xfrm_tmpl *t, const struct xfrm_algo_desc *d) { unsigned int id = d->desc.sadb_alg_id; if (id >= sizeof(t->ealgos) * 8) return 0; return (t->ealgos >> id) & 1; } static int count_ah_combs(const struct xfrm_tmpl *t) { int i, sz = 0; for (i = 0; ; i++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); } static int count_esp_combs(const struct xfrm_tmpl *t) { int i, k, sz = 0; for (i = 0; ; i++) { const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (!(ealg_tmpl_set(t, ealg))) continue; for (k = 1; ; k++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } } return sz + sizeof(struct sadb_prop); } static int dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; int sz = 0; int i; p = skb_put(skb, sizeof(struct sadb_prop)); p->sadb_prop_len = sizeof(struct sadb_prop)/8; p->sadb_prop_exttype = SADB_EXT_PROPOSAL; p->sadb_prop_replay = 32; memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); for (i = 0; ; i++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg) && aalg->available) { struct sadb_comb *c; c = skb_put_zero(skb, sizeof(struct sadb_comb)); p->sadb_prop_len += sizeof(struct sadb_comb)/8; c->sadb_comb_auth = aalg->desc.sadb_alg_id; c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; c->sadb_comb_hard_addtime = 24*60*60; c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; sz += sizeof(*c); } } return sz + sizeof(*p); } static int dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; int sz = 0; int i, k; p = skb_put(skb, sizeof(struct sadb_prop)); p->sadb_prop_len = sizeof(struct sadb_prop)/8; p->sadb_prop_exttype = SADB_EXT_PROPOSAL; p->sadb_prop_replay = 32; memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); for (i=0; ; i++) { const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (!(ealg_tmpl_set(t, ealg) && ealg->available)) continue; for (k = 1; ; k++) { struct sadb_comb *c; const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (!(aalg_tmpl_set(t, aalg) && aalg->available)) continue; c = skb_put(skb, sizeof(struct sadb_comb)); memset(c, 0, sizeof(*c)); p->sadb_prop_len += sizeof(struct sadb_comb)/8; c->sadb_comb_auth = aalg->desc.sadb_alg_id; c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; c->sadb_comb_encrypt = ealg->desc.sadb_alg_id; c->sadb_comb_encrypt_minbits = ealg->desc.sadb_alg_minbits; c->sadb_comb_encrypt_maxbits = ealg->desc.sadb_alg_maxbits; c->sadb_comb_hard_addtime = 24*60*60; c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; sz += sizeof(*c); } } return sz + sizeof(*p); } static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c) { return 0; } static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c) { struct sk_buff *out_skb; struct sadb_msg *out_hdr; int hard; int hsc; hard = c->data.hard; if (hard) hsc = 2; else hsc = 1; out_skb = pfkey_xfrm_state2msg_expire(x, hsc); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; out_hdr->sadb_msg_type = SADB_EXPIRE; out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = 0; out_hdr->sadb_msg_pid = 0; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); return 0; } static int pfkey_send_notify(struct xfrm_state *x, const struct km_event *c) { struct net *net = x ? xs_net(x) : c->net; struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); if (atomic_read(&net_pfkey->socks_nr) == 0) return 0; switch (c->event) { case XFRM_MSG_EXPIRE: return key_notify_sa_expire(x, c); case XFRM_MSG_DELSA: case XFRM_MSG_NEWSA: case XFRM_MSG_UPDSA: return key_notify_sa(x, c); case XFRM_MSG_FLUSHSA: return key_notify_sa_flush(c); case XFRM_MSG_NEWAE: /* not yet supported */ break; default: pr_err("pfkey: Unknown SA event %d\n", c->event); break; } return 0; } static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { if (xp && xp->type != XFRM_POLICY_TYPE_MAIN) return 0; switch (c->event) { case XFRM_MSG_POLEXPIRE: return key_notify_policy_expire(xp, c); case XFRM_MSG_DELPOLICY: case XFRM_MSG_NEWPOLICY: case XFRM_MSG_UPDPOLICY: return key_notify_policy(xp, dir, c); case XFRM_MSG_FLUSHPOLICY: if (c->data.type != XFRM_POLICY_TYPE_MAIN) break; return key_notify_policy_flush(c); default: pr_err("pfkey: Unknown policy event %d\n", c->event); break; } return 0; } static u32 get_acqseq(void) { u32 res; static atomic_t acqseq; do { res = atomic_inc_return(&acqseq); } while (!res); return res; } static bool pfkey_is_alive(const struct km_event *c) { struct netns_pfkey *net_pfkey = net_generic(c->net, pfkey_net_id); struct sock *sk; bool is_alive = false; rcu_read_lock(); sk_for_each_rcu(sk, &net_pfkey->table) { if (pfkey_sk(sk)->registered) { is_alive = true; break; } } rcu_read_unlock(); return is_alive; } static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_address *addr; struct sadb_x_policy *pol; int sockaddr_size; int size; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; int alg_size = 0; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return -EINVAL; size = sizeof(struct sadb_msg) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + sizeof(struct sadb_x_policy); if (x->id.proto == IPPROTO_AH) alg_size = count_ah_combs(t); else if (x->id.proto == IPPROTO_ESP) alg_size = count_esp_combs(t); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } skb = alloc_skb(size + alg_size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_ACQUIRE; hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = x->km.seq = get_acqseq(); hdr->sadb_msg_pid = 0; /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; pol->sadb_x_policy_dir = XFRM_POLICY_OUT + 1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = xp->index; pol->sadb_x_policy_priority = xp->priority; /* Set sadb_comb's. */ alg_size = 0; if (x->id.proto == IPPROTO_AH) alg_size = dump_ah_combs(skb, t); else if (x->id.proto == IPPROTO_ESP) alg_size = dump_esp_combs(skb, t); hdr->sadb_msg_len += alg_size / 8; /* security context */ if (xfrm_ctx) { sec_ctx = skb_put(skb, sizeof(struct sadb_x_sec_ctx) + ctx_size); sec_ctx->sadb_x_sec_len = (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); } static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, u8 *data, int len, int *dir) { struct net *net = sock_net(sk); struct xfrm_policy *xp; struct sadb_x_policy *pol = (struct sadb_x_policy*)data; struct sadb_x_sec_ctx *sec_ctx; switch (sk->sk_family) { case AF_INET: if (opt != IP_IPSEC_POLICY) { *dir = -EOPNOTSUPP; return NULL; } break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: if (opt != IPV6_IPSEC_POLICY) { *dir = -EOPNOTSUPP; return NULL; } break; #endif default: *dir = -EINVAL; return NULL; } *dir = -EINVAL; if (len < sizeof(struct sadb_x_policy) || pol->sadb_x_policy_len*8 > len || pol->sadb_x_policy_type > IPSEC_POLICY_BYPASS || (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir > IPSEC_DIR_OUTBOUND)) return NULL; xp = xfrm_policy_alloc(net, GFP_ATOMIC); if (xp == NULL) { *dir = -ENOBUFS; return NULL; } xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; xp->lft.hard_packet_limit = XFRM_INF; xp->family = sk->sk_family; xp->xfrm_nr = 0; if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && (*dir = parse_ipsecrequests(xp, pol)) < 0) goto out; /* security context too */ if (len >= (pol->sadb_x_policy_len*8 + sizeof(struct sadb_x_sec_ctx))) { char *p = (char *)pol; struct xfrm_user_sec_ctx *uctx; p += pol->sadb_x_policy_len*8; sec_ctx = (struct sadb_x_sec_ctx *)p; if (len < pol->sadb_x_policy_len*8 + sec_ctx->sadb_x_sec_len*8) { *dir = -EINVAL; goto out; } if ((*dir = verify_sec_ctx_len(p))) goto out; uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_ATOMIC); *dir = security_xfrm_policy_alloc(&xp->security, uctx, GFP_ATOMIC); kfree(uctx); if (*dir) goto out; } *dir = pol->sadb_x_policy_dir-1; return xp; out: xp->walk.dead = 1; xfrm_policy_destroy(xp); return NULL; } static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_sa *sa; struct sadb_address *addr; struct sadb_x_nat_t_port *n_port; int sockaddr_size; int size; __u8 satype = (x->id.proto == IPPROTO_ESP ? SADB_SATYPE_ESP : 0); struct xfrm_encap_tmpl *natt = NULL; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return -EINVAL; if (!satype) return -EINVAL; if (!x->encap) return -EINVAL; natt = x->encap; /* Build an SADB_X_NAT_T_NEW_MAPPING message: * * HDR | SA | ADDRESS_SRC (old addr) | NAT_T_SPORT (old port) | * ADDRESS_DST (new addr) | NAT_T_DPORT (new port) */ size = sizeof(struct sadb_msg) + sizeof(struct sadb_sa) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + (sizeof(struct sadb_x_nat_t_port) * 2); skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_X_NAT_T_NEW_MAPPING; hdr->sadb_msg_satype = satype; hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = x->km.seq; hdr->sadb_msg_pid = 0; /* SA */ sa = skb_put(skb, sizeof(struct sadb_sa)); sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); sa->sadb_sa_exttype = SADB_EXT_SA; sa->sadb_sa_spi = x->id.spi; sa->sadb_sa_replay = 0; sa->sadb_sa_state = 0; sa->sadb_sa_auth = 0; sa->sadb_sa_encrypt = 0; sa->sadb_sa_flags = 0; /* ADDRESS_SRC (old addr) */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* NAT_T_SPORT (old port) */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; n_port->sadb_x_nat_t_port_port = natt->encap_sport; n_port->sadb_x_nat_t_port_reserved = 0; /* ADDRESS_DST (new addr) */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(ipaddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* NAT_T_DPORT (new port) */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; n_port->sadb_x_nat_t_port_port = sport; n_port->sadb_x_nat_t_port_reserved = 0; return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); } #ifdef CONFIG_NET_KEY_MIGRATE static int set_sadb_address(struct sk_buff *skb, int sasize, int type, const struct xfrm_selector *sel) { struct sadb_address *addr; addr = skb_put(skb, sizeof(struct sadb_address) + sasize); addr->sadb_address_len = (sizeof(struct sadb_address) + sasize)/8; addr->sadb_address_exttype = type; addr->sadb_address_proto = sel->proto; addr->sadb_address_reserved = 0; switch (type) { case SADB_EXT_ADDRESS_SRC: addr->sadb_address_prefixlen = sel->prefixlen_s; pfkey_sockaddr_fill(&sel->saddr, 0, (struct sockaddr *)(addr + 1), sel->family); break; case SADB_EXT_ADDRESS_DST: addr->sadb_address_prefixlen = sel->prefixlen_d; pfkey_sockaddr_fill(&sel->daddr, 0, (struct sockaddr *)(addr + 1), sel->family); break; default: return -EINVAL; } return 0; } static int set_sadb_kmaddress(struct sk_buff *skb, const struct xfrm_kmaddress *k) { struct sadb_x_kmaddress *kma; u8 *sa; int family = k->family; int socklen = pfkey_sockaddr_len(family); int size_req; size_req = (sizeof(struct sadb_x_kmaddress) + pfkey_sockaddr_pair_size(family)); kma = skb_put_zero(skb, size_req); kma->sadb_x_kmaddress_len = size_req / 8; kma->sadb_x_kmaddress_exttype = SADB_X_EXT_KMADDRESS; kma->sadb_x_kmaddress_reserved = k->reserved; sa = (u8 *)(kma + 1); if (!pfkey_sockaddr_fill(&k->local, 0, (struct sockaddr *)sa, family) || !pfkey_sockaddr_fill(&k->remote, 0, (struct sockaddr *)(sa+socklen), family)) return -EINVAL; return 0; } static int set_ipsecrequest(struct sk_buff *skb, uint8_t proto, uint8_t mode, int level, uint32_t reqid, uint8_t family, const xfrm_address_t *src, const xfrm_address_t *dst) { struct sadb_x_ipsecrequest *rq; u8 *sa; int socklen = pfkey_sockaddr_len(family); int size_req; size_req = sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(family); rq = skb_put_zero(skb, size_req); rq->sadb_x_ipsecrequest_len = size_req; rq->sadb_x_ipsecrequest_proto = proto; rq->sadb_x_ipsecrequest_mode = mode; rq->sadb_x_ipsecrequest_level = level; rq->sadb_x_ipsecrequest_reqid = reqid; sa = (u8 *) (rq + 1); if (!pfkey_sockaddr_fill(src, 0, (struct sockaddr *)sa, family) || !pfkey_sockaddr_fill(dst, 0, (struct sockaddr *)(sa + socklen), family)) return -EINVAL; return 0; } #endif #ifdef CONFIG_NET_KEY_MIGRATE static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { int i; int sasize_sel; int size = 0; int size_pol = 0; struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_x_policy *pol; const struct xfrm_migrate *mp; if (type != XFRM_POLICY_TYPE_MAIN) return 0; if (num_bundles <= 0 || num_bundles > XFRM_MAX_DEPTH) return -EINVAL; if (k != NULL) { /* addresses for KM */ size += PFKEY_ALIGN8(sizeof(struct sadb_x_kmaddress) + pfkey_sockaddr_pair_size(k->family)); } /* selector */ sasize_sel = pfkey_sockaddr_size(sel->family); if (!sasize_sel) return -EINVAL; size += (sizeof(struct sadb_address) + sasize_sel) * 2; /* policy info */ size_pol += sizeof(struct sadb_x_policy); /* ipsecrequests */ for (i = 0, mp = m; i < num_bundles; i++, mp++) { /* old locator pair */ size_pol += sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(mp->old_family); /* new locator pair */ size_pol += sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(mp->new_family); } size += sizeof(struct sadb_msg) + size_pol; /* alloc buffer */ skb = alloc_skb(size, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_X_MIGRATE; hdr->sadb_msg_satype = pfkey_proto2satype(m->proto); hdr->sadb_msg_len = size / 8; hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = 0; hdr->sadb_msg_pid = 0; /* Addresses to be used by KM for negotiation, if ext is available */ if (k != NULL && (set_sadb_kmaddress(skb, k) < 0)) goto err; /* selector src */ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_SRC, sel); /* selector dst */ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_DST, sel); /* policy information */ pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = size_pol / 8; pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; pol->sadb_x_policy_dir = dir + 1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = 0; pol->sadb_x_policy_priority = 0; for (i = 0, mp = m; i < num_bundles; i++, mp++) { /* old ipsecrequest */ int mode = pfkey_mode_from_xfrm(mp->mode); if (mode < 0) goto err; if (set_ipsecrequest(skb, mp->proto, mode, (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE), mp->reqid, mp->old_family, &mp->old_saddr, &mp->old_daddr) < 0) goto err; /* new ipsecrequest */ if (set_ipsecrequest(skb, mp->proto, mode, (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE), mp->reqid, mp->new_family, &mp->new_saddr, &mp->new_daddr) < 0) goto err; } /* broadcast migrate message to sockets */ pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); return 0; err: kfree_skb(skb); return -EINVAL; } #else static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { return -ENOPROTOOPT; } #endif static int pfkey_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb = NULL; struct sadb_msg *hdr = NULL; int err; struct net *net = sock_net(sk); err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) goto out; err = -EMSGSIZE; if ((unsigned int)len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; skb = alloc_skb(len, GFP_KERNEL); if (skb == NULL) goto out; err = -EFAULT; if (memcpy_from_msg(skb_put(skb,len), msg, len)) goto out; hdr = pfkey_get_base_msg(skb, &err); if (!hdr) goto out; mutex_lock(&net->xfrm.xfrm_cfg_mutex); err = pfkey_process(sk, skb, hdr); mutex_unlock(&net->xfrm.xfrm_cfg_mutex); out: if (err && hdr && pfkey_error(hdr, err, sk) == 0) err = 0; kfree_skb(skb); return err ? : len; } static int pfkey_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct pfkey_sock *pfk = pfkey_sk(sk); struct sk_buff *skb; int copied, err; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) goto out; skb = skb_recv_datagram(sk, flags, &err); if (skb == NULL) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } skb_reset_transport_header(skb); err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; sock_recv_cmsgs(msg, sk, skb); err = (flags & MSG_TRUNC) ? skb->len : copied; if (pfk->dump.dump != NULL && 3 * atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) pfkey_do_dump(pfk); out_free: skb_free_datagram(sk, skb); out: return err; } static const struct proto_ops pfkey_ops = { .family = PF_KEY, .owner = THIS_MODULE, /* Operations that make no sense on pfkey sockets. */ .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, /* Now the operations that really occur. */ .release = pfkey_release, .poll = datagram_poll, .sendmsg = pfkey_sendmsg, .recvmsg = pfkey_recvmsg, }; static const struct net_proto_family pfkey_family_ops = { .family = PF_KEY, .create = pfkey_create, .owner = THIS_MODULE, }; #ifdef CONFIG_PROC_FS static int pfkey_seq_show(struct seq_file *f, void *v) { struct sock *s = sk_entry(v); if (v == SEQ_START_TOKEN) seq_printf(f ,"sk RefCnt Rmem Wmem User Inode\n"); else seq_printf(f, "%pK %-6d %-6u %-6u %-6u %-6lu\n", s, refcount_read(&s->sk_refcnt), sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), from_kuid_munged(seq_user_ns(f), sock_i_uid(s)), sock_i_ino(s) ); return 0; } static void *pfkey_seq_start(struct seq_file *f, loff_t *ppos) __acquires(rcu) { struct net *net = seq_file_net(f); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); rcu_read_lock(); return seq_hlist_start_head_rcu(&net_pfkey->table, *ppos); } static void *pfkey_seq_next(struct seq_file *f, void *v, loff_t *ppos) { struct net *net = seq_file_net(f); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); return seq_hlist_next_rcu(v, &net_pfkey->table, ppos); } static void pfkey_seq_stop(struct seq_file *f, void *v) __releases(rcu) { rcu_read_unlock(); } static const struct seq_operations pfkey_seq_ops = { .start = pfkey_seq_start, .next = pfkey_seq_next, .stop = pfkey_seq_stop, .show = pfkey_seq_show, }; static int __net_init pfkey_init_proc(struct net *net) { struct proc_dir_entry *e; e = proc_create_net("pfkey", 0, net->proc_net, &pfkey_seq_ops, sizeof(struct seq_net_private)); if (e == NULL) return -ENOMEM; return 0; } static void __net_exit pfkey_exit_proc(struct net *net) { remove_proc_entry("pfkey", net->proc_net); } #else static inline int pfkey_init_proc(struct net *net) { return 0; } static inline void pfkey_exit_proc(struct net *net) { } #endif static struct xfrm_mgr pfkeyv2_mgr = { .notify = pfkey_send_notify, .acquire = pfkey_send_acquire, .compile_policy = pfkey_compile_policy, .new_mapping = pfkey_send_new_mapping, .notify_policy = pfkey_send_policy_notify, .migrate = pfkey_send_migrate, .is_alive = pfkey_is_alive, }; static int __net_init pfkey_net_init(struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); int rv; INIT_HLIST_HEAD(&net_pfkey->table); atomic_set(&net_pfkey->socks_nr, 0); rv = pfkey_init_proc(net); return rv; } static void __net_exit pfkey_net_exit(struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); pfkey_exit_proc(net); WARN_ON(!hlist_empty(&net_pfkey->table)); } static struct pernet_operations pfkey_net_ops = { .init = pfkey_net_init, .exit = pfkey_net_exit, .id = &pfkey_net_id, .size = sizeof(struct netns_pfkey), }; static void __exit ipsec_pfkey_exit(void) { xfrm_unregister_km(&pfkeyv2_mgr); sock_unregister(PF_KEY); unregister_pernet_subsys(&pfkey_net_ops); proto_unregister(&key_proto); } static int __init ipsec_pfkey_init(void) { int err = proto_register(&key_proto, 0); if (err != 0) goto out; err = register_pernet_subsys(&pfkey_net_ops); if (err != 0) goto out_unregister_key_proto; err = sock_register(&pfkey_family_ops); if (err != 0) goto out_unregister_pernet; xfrm_register_km(&pfkeyv2_mgr); out: return err; out_unregister_pernet: unregister_pernet_subsys(&pfkey_net_ops); out_unregister_key_proto: proto_unregister(&key_proto); goto out; } module_init(ipsec_pfkey_init); module_exit(ipsec_pfkey_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_KEY); |
8 8 8 13 12 11 9 8 13 8 8 8 8 3 3 8 8 8 8 8 8 3 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <linux/rculist.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> #include <linux/btf_ids.h> #include <linux/bpf_local_storage.h> #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> #include <linux/rcupdate.h> #include <linux/rcupdate_trace.h> #include <linux/rcupdate_wait.h> #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE) static struct bpf_local_storage_map_bucket * select_bucket(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; } static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { struct bpf_map *map = &smap->map; if (!map->ops->map_local_storage_charge) return 0; return map->ops->map_local_storage_charge(smap, owner, size); } static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner, u32 size) { struct bpf_map *map = &smap->map; if (map->ops->map_local_storage_uncharge) map->ops->map_local_storage_uncharge(smap, owner, size); } static struct bpf_local_storage __rcu ** owner_storage(struct bpf_local_storage_map *smap, void *owner) { struct bpf_map *map = &smap->map; return map->ops->map_owner_storage_ptr(owner); } static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed_lockless(&selem->snode); } static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->snode); } static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed_lockless(&selem->map_node); } static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->map_node); } struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, bool charge_mem, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; if (charge_mem && mem_charge(smap, owner, smap->elem_size)) return NULL; if (smap->bpf_ma) { migrate_disable(); selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags); migrate_enable(); if (selem) /* Keep the original bpf_map_kzalloc behavior * before started using the bpf_mem_cache_alloc. * * No need to use zero_map_value. The bpf_selem_free() * only does bpf_mem_cache_free when there is * no other bpf prog is using the selem. */ memset(SDATA(selem)->data, 0, smap->map.value_size); } else { selem = bpf_map_kzalloc(&smap->map, smap->elem_size, gfp_flags | __GFP_NOWARN); } if (selem) { if (value) copy_map_value(&smap->map, SDATA(selem)->data, value); /* No need to call check_and_init_map_value as memory is zero init */ return selem; } if (charge_mem) mem_uncharge(smap, owner, smap->elem_size); return NULL; } /* rcu tasks trace callback for bpf_ma == false */ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; /* If RCU Tasks Trace grace period implies RCU grace period, do * kfree(), else do kfree_rcu(). */ local_storage = container_of(rcu, struct bpf_local_storage, rcu); if (rcu_trace_implies_rcu_gp()) kfree(local_storage); else kfree_rcu(local_storage, rcu); } static void bpf_local_storage_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; local_storage = container_of(rcu, struct bpf_local_storage, rcu); bpf_mem_cache_raw_free(local_storage); } static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { if (rcu_trace_implies_rcu_gp()) bpf_local_storage_free_rcu(rcu); else call_rcu(rcu, bpf_local_storage_free_rcu); } /* Handle bpf_ma == false */ static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, bool vanilla_rcu) { if (vanilla_rcu) kfree_rcu(local_storage, rcu); else call_rcu_tasks_trace(&local_storage->rcu, __bpf_local_storage_free_trace_rcu); } static void bpf_local_storage_free(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool bpf_ma, bool reuse_now) { if (!local_storage) return; if (!bpf_ma) { __bpf_local_storage_free(local_storage, reuse_now); return; } if (!reuse_now) { call_rcu_tasks_trace(&local_storage->rcu, bpf_local_storage_free_trace_rcu); return; } if (smap) { migrate_disable(); bpf_mem_cache_free(&smap->storage_ma, local_storage); migrate_enable(); } else { /* smap could be NULL if the selem that triggered * this 'local_storage' creation had been long gone. * In this case, directly do call_rcu(). */ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); } } /* rcu tasks trace callback for bpf_ma == false */ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); if (rcu_trace_implies_rcu_gp()) kfree(selem); else kfree_rcu(selem, rcu); } /* Handle bpf_ma == false */ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, bool vanilla_rcu) { if (vanilla_rcu) kfree_rcu(selem, rcu); else call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu); } static void bpf_selem_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); bpf_mem_cache_raw_free(selem); } static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) { if (rcu_trace_implies_rcu_gp()) bpf_selem_free_rcu(rcu); else call_rcu(rcu, bpf_selem_free_rcu); } void bpf_selem_free(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, bool reuse_now) { bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); if (!smap->bpf_ma) { __bpf_selem_free(selem, reuse_now); return; } if (!reuse_now) { call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); } else { /* Instead of using the vanilla call_rcu(), * bpf_mem_cache_free will be able to reuse selem * immediately. */ migrate_disable(); bpf_mem_cache_free(&smap->selem_ma, selem); migrate_enable(); } } /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. */ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, bool uncharge_mem, bool reuse_now) { struct bpf_local_storage_map *smap; bool free_local_storage; void *owner; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); owner = local_storage->owner; /* All uncharging on the owner must be done first. * The owner may be freed once the last selem is unlinked * from local_storage. */ if (uncharge_mem) mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); if (free_local_storage) { mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); local_storage->owner = NULL; /* After this RCU_INIT, owner may be freed and cannot be used */ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); /* local_storage is not freed now. local_storage->lock is * still held and raw_spin_unlock_bh(&local_storage->lock) * will be done by the caller. * * Although the unlock will be done under * rcu_read_lock(), it is more intuitive to * read if the freeing of the storage is done * after the raw_spin_unlock_bh(&local_storage->lock). * * Hence, a "bool free_local_storage" is returned * to the caller which then calls then frees the storage after * all the RCU grace periods have expired. */ } hlist_del_init_rcu(&selem->snode); if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); bpf_selem_free(selem, smap, reuse_now); if (rcu_access_pointer(local_storage->smap) == smap) RCU_INIT_POINTER(local_storage->smap, NULL); return free_local_storage; } static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *storage_smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *selem_smap; /* local_storage->smap may be NULL. If it is, get the bpf_ma * from any selem in the local_storage->list. The bpf_ma of all * local_storage and selem should have the same value * for the same map type. * * If the local_storage->list is already empty, the caller will not * care about the bpf_ma value also because the caller is not * responsibile to free the local_storage. */ if (storage_smap) return storage_smap->bpf_ma; if (!selem) { struct hlist_node *n; n = rcu_dereference_check(hlist_first_rcu(&local_storage->list), bpf_rcu_lock_held()); if (!n) return false; selem = hlist_entry(n, struct bpf_local_storage_elem, snode); } selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); return selem_smap->bpf_ma; } static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, bool reuse_now) { struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; bool bpf_ma, free_local_storage = false; unsigned long flags; if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ return; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem); raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, true, reuse_now); raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_local_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { RCU_INIT_POINTER(selem->local_storage, local_storage); hlist_add_head_rcu(&selem->snode, &local_storage->list); } static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; if (unlikely(!selem_linked_to_map_lockless(selem))) /* selem has already be unlinked from smap */ return; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, selem); raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); unsigned long flags; raw_spin_lock_irqsave(&b->lock, flags); RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. */ bpf_selem_unlink_map(selem); bpf_selem_unlink_storage(selem, reuse_now); } /* If cacheit_lockit is false, this lookup function is lockless */ struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit) { struct bpf_local_storage_data *sdata; struct bpf_local_storage_elem *selem; /* Fast path (cache hit) */ sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx], bpf_rcu_lock_held()); if (sdata && rcu_access_pointer(sdata->smap) == smap) return sdata; /* Slow path (cache miss) */ hlist_for_each_entry_rcu(selem, &local_storage->list, snode, rcu_read_lock_trace_held()) if (rcu_access_pointer(SDATA(selem)->smap) == smap) break; if (!selem) return NULL; sdata = SDATA(selem); if (cacheit_lockit) { unsigned long flags; /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). */ raw_spin_lock_irqsave(&local_storage->lock, flags); if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], sdata); raw_spin_unlock_irqrestore(&local_storage->lock, flags); } return sdata; } static int check_flags(const struct bpf_local_storage_data *old_sdata, u64 map_flags) { if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) /* elem already exists */ return -EEXIST; if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; return 0; } int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *first_selem, gfp_t gfp_flags) { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; int err; err = mem_charge(smap, owner, sizeof(*storage)); if (err) return err; if (smap->bpf_ma) { migrate_disable(); storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags); migrate_enable(); } else { storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), gfp_flags | __GFP_NOWARN); } if (!storage) { err = -ENOMEM; goto uncharge; } RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); raw_spin_lock_init(&storage->lock); storage->owner = owner; bpf_selem_link_storage_nolock(storage, first_selem); bpf_selem_link_map(smap, first_selem); owner_storage_ptr = (struct bpf_local_storage **)owner_storage(smap, owner); /* Publish storage to the owner. * Instead of using any lock of the kernel object (i.e. owner), * cmpxchg will work with any kernel object regardless what * the running context is, bh, irq...etc. * * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage) * is protected by the storage->lock. Hence, when freeing * the owner->storage, the storage->lock must be held before * setting owner->storage ptr to NULL. */ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { bpf_selem_unlink_map(first_selem); err = -EAGAIN; goto uncharge; /* Note that even first_selem was linked to smap's * bucket->list, first_selem can be freed immediately * (instead of kfree_rcu) because * bpf_local_storage_map_free() does a * synchronize_rcu_mult (waiting for both sleepable and * normal programs) before walking the bucket->list. * Hence, no one is accessing selem from the * bucket->list under rcu_read_lock(). */ } return 0; uncharge: bpf_local_storage_free(storage, smap, smap->bpf_ma, true); mem_uncharge(smap, owner, sizeof(*storage)); return err; } /* sk cannot be going away because it is linking new elem * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). * Otherwise, it will become a leak (and other memory issues * during map destruction). */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; unsigned long flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ unlikely((map_flags & BPF_F_LOCK) && !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))) return ERR_PTR(-EINVAL); if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) return ERR_PTR(-EINVAL); local_storage = rcu_dereference_check(*owner_storage(smap, owner), bpf_rcu_lock_held()); if (!local_storage || hlist_empty(&local_storage->list)) { /* Very first elem for the owner */ err = check_flags(NULL, map_flags); if (err) return ERR_PTR(err); selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { bpf_selem_free(selem, smap, true); mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } return SDATA(selem); } if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { /* Hoping to find an old_sdata to do inline update * such that it can avoid taking the local_storage->lock * and changing the lists. */ old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) return ERR_PTR(err); if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) { copy_map_value_locked(&smap->map, old_sdata->data, value, false); return old_sdata; } } /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. */ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); if (!alloc_selem) return ERR_PTR(-ENOMEM); raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { /* A parallel del is happening and local_storage is going * away. It has just been checked before, so very * unlikely. Return instead of retry to keep things * simple. */ err = -EAGAIN; goto unlock; } old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) goto unlock; if (old_sdata && (map_flags & BPF_F_LOCK)) { copy_map_value_locked(&smap->map, old_sdata->data, value, false); selem = SELEM(old_sdata); goto unlock; } alloc_selem = NULL; /* First, link the new selem to the map */ bpf_selem_link_map(smap, selem); /* Second, link (and publish) the new selem to local_storage */ bpf_selem_link_storage_nolock(local_storage, selem); /* Third, remove old selem, SELEM(old_sdata) */ if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), true, false); } unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); bpf_selem_free(alloc_selem, smap, true); } return err ? ERR_PTR(err) : SDATA(selem); } static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) { u64 min_usage = U64_MAX; u16 i, res = 0; spin_lock(&cache->idx_lock); for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) { if (cache->idx_usage_counts[i] < min_usage) { min_usage = cache->idx_usage_counts[i]; res = i; /* Found a free cache_idx */ if (!min_usage) break; } } cache->idx_usage_counts[res]++; spin_unlock(&cache->idx_lock); return res; } static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, u16 idx) { spin_lock(&cache->idx_lock); cache->idx_usage_counts[idx]--; spin_unlock(&cache->idx_lock); } int bpf_local_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->max_entries || attr->key_size != sizeof(int) || !attr->value_size || /* Enforce BTF for userspace sk dumping */ !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE) return -E2BIG; return 0; } int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { u32 int_data; if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) return -EINVAL; return 0; } void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; bool bpf_ma, free_storage = false; struct hlist_node *n; unsigned long flags; storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL); /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. * Thus, no elem can be added to or deleted from the * local_storage->list by the bpf_prog or by the bpf_map's syscall. * * It is racing with bpf_local_storage_map_free() alone * when unlinking elem from the local_storage->list and * the map's bucket->list. */ raw_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. */ bpf_selem_unlink_map(selem); /* If local_storage list has only one element, the * bpf_selem_unlink_storage_nolock() will return true. * Otherwise, it will return false. The current loop iteration * intends to remove all local storage. So the last iteration * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, true, true); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) { struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map; u64 usage = sizeof(*smap); /* The dynamically callocated selems are not counted currently. */ usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log); return usage; } /* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory. * A deadlock free allocator is useful for storage that the bpf prog can easily * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf. * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses * memory immediately. To be reuse-immediate safe, the owner destruction * code path needs to go through a rcu grace period before calling * bpf_local_storage_destroy(). * * When bpf_ma == false, the kmalloc and kfree are used. */ struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache, bool bpf_ma) { struct bpf_local_storage_map *smap; unsigned int i; u32 nbuckets; int err; smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); nbuckets = roundup_pow_of_two(num_possible_cpus()); /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN); if (!smap->buckets) { err = -ENOMEM; goto free_smap; } for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); raw_spin_lock_init(&smap->buckets[i].lock); } smap->elem_size = offsetof(struct bpf_local_storage_elem, sdata.data[attr->value_size]); smap->bpf_ma = bpf_ma; if (bpf_ma) { err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false); if (err) goto free_smap; err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false); if (err) { bpf_mem_alloc_destroy(&smap->selem_ma); goto free_smap; } } smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; free_smap: kvfree(smap->buckets); bpf_map_area_free(smap); return ERR_PTR(err); } void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache, int __percpu *busy_counter) { struct bpf_local_storage_map_bucket *b; struct bpf_local_storage_elem *selem; struct bpf_local_storage_map *smap; unsigned int i; smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(cache, smap->cache_idx); /* Note that this map might be concurrently cloned from * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone * RCU read section to finish before proceeding. New RCU * read sections should be prevented via bpf_map_inc_not_zero. */ synchronize_rcu(); /* bpf prog and the userspace can no longer access this map * now. No new selem (of this map) can be added * to the owner->storage or to the map bucket's list. * * The elem of this map can be cleaned up here * or when the storage is freed e.g. * by bpf_sk_storage_free() during __sk_destruct(). */ for (i = 0; i < (1U << smap->bucket_log); i++) { b = &smap->buckets[i]; rcu_read_lock(); /* No one is adding to b->list now */ while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { if (busy_counter) { migrate_disable(); this_cpu_inc(*busy_counter); } bpf_selem_unlink(selem, true); if (busy_counter) { this_cpu_dec(*busy_counter); migrate_enable(); } cond_resched_rcu(); } rcu_read_unlock(); } /* While freeing the storage we may still need to access the map. * * e.g. when bpf_sk_storage_free() has unlinked selem from the map * which then made the above while((selem = ...)) loop * exit immediately. * * However, while freeing the storage one still needs to access the * smap->elem_size to do the uncharging in * bpf_selem_unlink_storage_nolock(). * * Hence, wait another rcu grace period for the storage to be freed. */ synchronize_rcu(); if (smap->bpf_ma) { bpf_mem_alloc_destroy(&smap->selem_ma); bpf_mem_alloc_destroy(&smap->storage_ma); } kvfree(smap->buckets); bpf_map_area_free(smap); } |
60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 75 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | // SPDX-License-Identifier: GPL-2.0 /* * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". * * This is used to derive keys from the fscrypt master keys. * * Copyright 2019 Google LLC */ #include <crypto/hash.h> #include <crypto/sha2.h> #include "fscrypt_private.h" /* * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses * SHA-512 because it is well-established, secure, and reasonably efficient. * * HKDF-SHA256 was also considered, as its 256-bit security strength would be * sufficient here. A 512-bit security strength is "nice to have", though. * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256. In the * common case of deriving an AES-256-XTS key (512 bits), that can result in * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two. */ #define HKDF_HMAC_ALG "hmac(sha512)" #define HKDF_HASHLEN SHA512_DIGEST_SIZE /* * HKDF consists of two steps: * * 1. HKDF-Extract: extract a pseudorandom key of length HKDF_HASHLEN bytes from * the input keying material and optional salt. * 2. HKDF-Expand: expand the pseudorandom key into output keying material of * any length, parameterized by an application-specific info string. * * HKDF-Extract can be skipped if the input is already a pseudorandom key of * length HKDF_HASHLEN bytes. However, cipher modes other than AES-256-XTS take * shorter keys, and we don't want to force users of those modes to provide * unnecessarily long master keys. Thus fscrypt still does HKDF-Extract. No * salt is used, since fscrypt master keys should already be pseudorandom and * there's no way to persist a random salt per master key from kernel mode. */ /* HKDF-Extract (RFC 5869 section 2.2), unsalted */ static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) { static const u8 default_salt[HKDF_HASHLEN]; int err; err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); if (err) return err; return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); } /* * Compute HKDF-Extract using the given master key as the input keying material, * and prepare an HMAC transform object keyed by the resulting pseudorandom key. * * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many * times without having to recompute HKDF-Extract each time. */ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size) { struct crypto_shash *hmac_tfm; u8 prk[HKDF_HASHLEN]; int err; hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0); if (IS_ERR(hmac_tfm)) { fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld", PTR_ERR(hmac_tfm)); return PTR_ERR(hmac_tfm); } if (WARN_ON_ONCE(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { err = -EINVAL; goto err_free_tfm; } err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); if (err) goto err_free_tfm; err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk)); if (err) goto err_free_tfm; hkdf->hmac_tfm = hmac_tfm; goto out; err_free_tfm: crypto_free_shash(hmac_tfm); out: memzero_explicit(prk, sizeof(prk)); return err; } /* * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen' * bytes of output keying material parameterized by the application-specific * 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context' * byte. This is thread-safe and may be called by multiple threads in parallel. * * ('context' isn't part of the HKDF specification; it's just a prefix fscrypt * adds to its application-specific info strings to guarantee that it doesn't * accidentally repeat an info string when using HKDF for different purposes.) */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen) { SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); u8 prefix[9]; unsigned int i; int err; const u8 *prev = NULL; u8 counter = 1; u8 tmp[HKDF_HASHLEN]; if (WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN)) return -EINVAL; desc->tfm = hkdf->hmac_tfm; memcpy(prefix, "fscrypt\0", 8); prefix[8] = context; for (i = 0; i < okmlen; i += HKDF_HASHLEN) { err = crypto_shash_init(desc); if (err) goto out; if (prev) { err = crypto_shash_update(desc, prev, HKDF_HASHLEN); if (err) goto out; } err = crypto_shash_update(desc, prefix, sizeof(prefix)); if (err) goto out; err = crypto_shash_update(desc, info, infolen); if (err) goto out; BUILD_BUG_ON(sizeof(counter) != 1); if (okmlen - i < HKDF_HASHLEN) { err = crypto_shash_finup(desc, &counter, 1, tmp); if (err) goto out; memcpy(&okm[i], tmp, okmlen - i); memzero_explicit(tmp, sizeof(tmp)); } else { err = crypto_shash_finup(desc, &counter, 1, &okm[i]); if (err) goto out; } counter++; prev = &okm[i]; } err = 0; out: if (unlikely(err)) memzero_explicit(okm, okmlen); /* so caller doesn't need to */ shash_desc_zero(desc); return err; } void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf) { crypto_free_shash(hkdf->hmac_tfm); } |
397 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 | // SPDX-License-Identifier: GPL-2.0 /* * This is a maximally equidistributed combined Tausworthe generator * based on code from GNU Scientific Library 1.5 (30 Jun 2004) * * lfsr113 version: * * x_n = (s1_n ^ s2_n ^ s3_n ^ s4_n) * * s1_{n+1} = (((s1_n & 4294967294) << 18) ^ (((s1_n << 6) ^ s1_n) >> 13)) * s2_{n+1} = (((s2_n & 4294967288) << 2) ^ (((s2_n << 2) ^ s2_n) >> 27)) * s3_{n+1} = (((s3_n & 4294967280) << 7) ^ (((s3_n << 13) ^ s3_n) >> 21)) * s4_{n+1} = (((s4_n & 4294967168) << 13) ^ (((s4_n << 3) ^ s4_n) >> 12)) * * The period of this generator is about 2^113 (see erratum paper). * * From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe * Generators", Mathematics of Computation, 65, 213 (1996), 203--213: * http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps * ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps * * There is an erratum in the paper "Tables of Maximally Equidistributed * Combined LFSR Generators", Mathematics of Computation, 68, 225 (1999), * 261--269: http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps * * ... the k_j most significant bits of z_j must be non-zero, * for each j. (Note: this restriction also applies to the * computer code given in [4], but was mistakenly not mentioned * in that paper.) * * This affects the seeding procedure by imposing the requirement * s1 > 1, s2 > 7, s3 > 15, s4 > 127. */ #include <linux/types.h> #include <linux/percpu.h> #include <linux/export.h> #include <linux/jiffies.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/bitops.h> #include <linux/slab.h> #include <asm/unaligned.h> /** * prandom_u32_state - seeded pseudo-random number generator. * @state: pointer to state structure holding seeded state. * * This is used for pseudo-randomness with no outside seeding. * For more random results, use get_random_u32(). */ u32 prandom_u32_state(struct rnd_state *state) { #define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b) state->s1 = TAUSWORTHE(state->s1, 6U, 13U, 4294967294U, 18U); state->s2 = TAUSWORTHE(state->s2, 2U, 27U, 4294967288U, 2U); state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U, 7U); state->s4 = TAUSWORTHE(state->s4, 3U, 12U, 4294967168U, 13U); return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4); } EXPORT_SYMBOL(prandom_u32_state); /** * prandom_bytes_state - get the requested number of pseudo-random bytes * * @state: pointer to state structure holding seeded state. * @buf: where to copy the pseudo-random bytes to * @bytes: the requested number of bytes * * This is used for pseudo-randomness with no outside seeding. * For more random results, use get_random_bytes(). */ void prandom_bytes_state(struct rnd_state *state, void *buf, size_t bytes) { u8 *ptr = buf; while (bytes >= sizeof(u32)) { put_unaligned(prandom_u32_state(state), (u32 *) ptr); ptr += sizeof(u32); bytes -= sizeof(u32); } if (bytes > 0) { u32 rem = prandom_u32_state(state); do { *ptr++ = (u8) rem; bytes--; rem >>= BITS_PER_BYTE; } while (bytes > 0); } } EXPORT_SYMBOL(prandom_bytes_state); static void prandom_warmup(struct rnd_state *state) { /* Calling RNG ten times to satisfy recurrence condition */ prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); } void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state) { int i; for_each_possible_cpu(i) { struct rnd_state *state = per_cpu_ptr(pcpu_state, i); u32 seeds[4]; get_random_bytes(&seeds, sizeof(seeds)); state->s1 = __seed(seeds[0], 2U); state->s2 = __seed(seeds[1], 8U); state->s3 = __seed(seeds[2], 16U); state->s4 = __seed(seeds[3], 128U); prandom_warmup(state); } } EXPORT_SYMBOL(prandom_seed_full_state); #ifdef CONFIG_RANDOM32_SELFTEST static struct prandom_test1 { u32 seed; u32 result; } test1[] = { { 1U, 3484351685U }, { 2U, 2623130059U }, { 3U, 3125133893U }, { 4U, 984847254U }, }; static struct prandom_test2 { u32 seed; u32 iteration; u32 result; } test2[] = { /* Test cases against taus113 from GSL library. */ { 931557656U, 959U, 2975593782U }, { 1339693295U, 876U, 3887776532U }, { 1545556285U, 961U, 1615538833U }, { 601730776U, 723U, 1776162651U }, { 1027516047U, 687U, 511983079U }, { 416526298U, 700U, 916156552U }, { 1395522032U, 652U, 2222063676U }, { 366221443U, 617U, 2992857763U }, { 1539836965U, 714U, 3783265725U }, { 556206671U, 994U, 799626459U }, { 684907218U, 799U, 367789491U }, { 2121230701U, 931U, 2115467001U }, { 1668516451U, 644U, 3620590685U }, { 768046066U, 883U, 2034077390U }, { 1989159136U, 833U, 1195767305U }, { 536585145U, 996U, 3577259204U }, { 1008129373U, 642U, 1478080776U }, { 1740775604U, 939U, 1264980372U }, { 1967883163U, 508U, 10734624U }, { 1923019697U, 730U, 3821419629U }, { 442079932U, 560U, 3440032343U }, { 1961302714U, 845U, 841962572U }, { 2030205964U, 962U, 1325144227U }, { 1160407529U, 507U, 240940858U }, { 635482502U, 779U, 4200489746U }, { 1252788931U, 699U, 867195434U }, { 1961817131U, 719U, 668237657U }, { 1071468216U, 983U, 917876630U }, { 1281848367U, 932U, 1003100039U }, { 582537119U, 780U, 1127273778U }, { 1973672777U, 853U, 1071368872U }, { 1896756996U, 762U, 1127851055U }, { 847917054U, 500U, 1717499075U }, { 1240520510U, 951U, 2849576657U }, { 1685071682U, 567U, 1961810396U }, { 1516232129U, 557U, 3173877U }, { 1208118903U, 612U, 1613145022U }, { 1817269927U, 693U, 4279122573U }, { 1510091701U, 717U, 638191229U }, { 365916850U, 807U, 600424314U }, { 399324359U, 702U, 1803598116U }, { 1318480274U, 779U, 2074237022U }, { 697758115U, 840U, 1483639402U }, { 1696507773U, 840U, 577415447U }, { 2081979121U, 981U, 3041486449U }, { 955646687U, 742U, 3846494357U }, { 1250683506U, 749U, 836419859U }, { 595003102U, 534U, 366794109U }, { 47485338U, 558U, 3521120834U }, { 619433479U, 610U, 3991783875U }, { 704096520U, 518U, 4139493852U }, { 1712224984U, 606U, 2393312003U }, { 1318233152U, 922U, 3880361134U }, { 855572992U, 761U, 1472974787U }, { 64721421U, 703U, 683860550U }, { 678931758U, 840U, 380616043U }, { 692711973U, 778U, 1382361947U }, { 677703619U, 530U, 2826914161U }, { 92393223U, 586U, 1522128471U }, { 1222592920U, 743U, 3466726667U }, { 358288986U, 695U, 1091956998U }, { 1935056945U, 958U, 514864477U }, { 735675993U, 990U, 1294239989U }, { 1560089402U, 897U, 2238551287U }, { 70616361U, 829U, 22483098U }, { 368234700U, 731U, 2913875084U }, { 20221190U, 879U, 1564152970U }, { 539444654U, 682U, 1835141259U }, { 1314987297U, 840U, 1801114136U }, { 2019295544U, 645U, 3286438930U }, { 469023838U, 716U, 1637918202U }, { 1843754496U, 653U, 2562092152U }, { 400672036U, 809U, 4264212785U }, { 404722249U, 965U, 2704116999U }, { 600702209U, 758U, 584979986U }, { 519953954U, 667U, 2574436237U }, { 1658071126U, 694U, 2214569490U }, { 420480037U, 749U, 3430010866U }, { 690103647U, 969U, 3700758083U }, { 1029424799U, 937U, 3787746841U }, { 2012608669U, 506U, 3362628973U }, { 1535432887U, 998U, 42610943U }, { 1330635533U, 857U, 3040806504U }, { 1223800550U, 539U, 3954229517U }, { 1322411537U, 680U, 3223250324U }, { 1877847898U, 945U, 2915147143U }, { 1646356099U, 874U, 965988280U }, { 805687536U, 744U, 4032277920U }, { 1948093210U, 633U, 1346597684U }, { 392609744U, 783U, 1636083295U }, { 690241304U, 770U, 1201031298U }, { 1360302965U, 696U, 1665394461U }, { 1220090946U, 780U, 1316922812U }, { 447092251U, 500U, 3438743375U }, { 1613868791U, 592U, 828546883U }, { 523430951U, 548U, 2552392304U }, { 726692899U, 810U, 1656872867U }, { 1364340021U, 836U, 3710513486U }, { 1986257729U, 931U, 935013962U }, { 407983964U, 921U, 728767059U }, }; static void prandom_state_selftest_seed(struct rnd_state *state, u32 seed) { #define LCG(x) ((x) * 69069U) /* super-duper LCG */ state->s1 = __seed(LCG(seed), 2U); state->s2 = __seed(LCG(state->s1), 8U); state->s3 = __seed(LCG(state->s2), 16U); state->s4 = __seed(LCG(state->s3), 128U); } static int __init prandom_state_selftest(void) { int i, j, errors = 0, runs = 0; bool error = false; for (i = 0; i < ARRAY_SIZE(test1); i++) { struct rnd_state state; prandom_state_selftest_seed(&state, test1[i].seed); prandom_warmup(&state); if (test1[i].result != prandom_u32_state(&state)) error = true; } if (error) pr_warn("prandom: seed boundary self test failed\n"); else pr_info("prandom: seed boundary self test passed\n"); for (i = 0; i < ARRAY_SIZE(test2); i++) { struct rnd_state state; prandom_state_selftest_seed(&state, test2[i].seed); prandom_warmup(&state); for (j = 0; j < test2[i].iteration - 1; j++) prandom_u32_state(&state); if (test2[i].result != prandom_u32_state(&state)) errors++; runs++; cond_resched(); } if (errors) pr_warn("prandom: %d/%d self tests failed\n", errors, runs); else pr_info("prandom: %d self tests passed\n", runs); return 0; } core_initcall(prandom_state_selftest); #endif |
2598 2598 2598 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 | // SPDX-License-Identifier: GPL-2.0-only /* * lib/hexdump.c */ #include <linux/types.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/minmax.h> #include <linux/export.h> #include <asm/unaligned.h> const char hex_asc[] = "0123456789abcdef"; EXPORT_SYMBOL(hex_asc); const char hex_asc_upper[] = "0123456789ABCDEF"; EXPORT_SYMBOL(hex_asc_upper); /** * hex_to_bin - convert a hex digit to its real value * @ch: ascii character represents hex digit * * hex_to_bin() converts one hex digit to its actual value or -1 in case of bad * input. * * This function is used to load cryptographic keys, so it is coded in such a * way that there are no conditions or memory accesses that depend on data. * * Explanation of the logic: * (ch - '9' - 1) is negative if ch <= '9' * ('0' - 1 - ch) is negative if ch >= '0' * we "and" these two values, so the result is negative if ch is in the range * '0' ... '9' * we are only interested in the sign, so we do a shift ">> 8"; note that right * shift of a negative value is implementation-defined, so we cast the * value to (unsigned) before the shift --- we have 0xffffff if ch is in * the range '0' ... '9', 0 otherwise * we "and" this value with (ch - '0' + 1) --- we have a value 1 ... 10 if ch is * in the range '0' ... '9', 0 otherwise * we add this value to -1 --- we have a value 0 ... 9 if ch is in the range '0' * ... '9', -1 otherwise * the next line is similar to the previous one, but we need to decode both * uppercase and lowercase letters, so we use (ch & 0xdf), which converts * lowercase to uppercase */ int hex_to_bin(unsigned char ch) { unsigned char cu = ch & 0xdf; return -1 + ((ch - '0' + 1) & (unsigned)((ch - '9' - 1) & ('0' - 1 - ch)) >> 8) + ((cu - 'A' + 11) & (unsigned)((cu - 'F' - 1) & ('A' - 1 - cu)) >> 8); } EXPORT_SYMBOL(hex_to_bin); /** * hex2bin - convert an ascii hexadecimal string to its binary representation * @dst: binary result * @src: ascii hexadecimal string * @count: result length * * Return 0 on success, -EINVAL in case of bad input. */ int hex2bin(u8 *dst, const char *src, size_t count) { while (count--) { int hi, lo; hi = hex_to_bin(*src++); if (unlikely(hi < 0)) return -EINVAL; lo = hex_to_bin(*src++); if (unlikely(lo < 0)) return -EINVAL; *dst++ = (hi << 4) | lo; } return 0; } EXPORT_SYMBOL(hex2bin); /** * bin2hex - convert binary data to an ascii hexadecimal string * @dst: ascii hexadecimal result * @src: binary data * @count: binary data length */ char *bin2hex(char *dst, const void *src, size_t count) { const unsigned char *_src = src; while (count--) dst = hex_byte_pack(dst, *_src++); return dst; } EXPORT_SYMBOL(bin2hex); /** * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory * @buf: data blob to dump * @len: number of bytes in the @buf * @rowsize: number of bytes to print per line; must be 16 or 32 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) * @linebuf: where to put the converted data * @linebuflen: total size of @linebuf, including space for terminating NUL * @ascii: include ASCII after the hex output * * hex_dump_to_buffer() works on one "line" of output at a time, i.e., * 16 or 32 bytes of input data converted to hex + ASCII output. * * Given a buffer of u8 data, hex_dump_to_buffer() converts the input data * to a hex + ASCII dump at the supplied memory location. * The converted output is always NUL-terminated. * * E.g.: * hex_dump_to_buffer(frame->data, frame->len, 16, 1, * linebuf, sizeof(linebuf), true); * * example output buffer: * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO * * Return: * The amount of bytes placed in the buffer without terminating NUL. If the * output was truncated, then the return value is the number of bytes * (excluding the terminating NUL) which would have been written to the final * string if enough space had been available. */ int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, char *linebuf, size_t linebuflen, bool ascii) { const u8 *ptr = buf; int ngroups; u8 ch; int j, lx = 0; int ascii_column; int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; if (len > rowsize) /* limit to one line at a time */ len = rowsize; if (!is_power_of_2(groupsize) || groupsize > 8) groupsize = 1; if ((len % groupsize) != 0) /* no mixed size output */ groupsize = 1; ngroups = len / groupsize; ascii_column = rowsize * 2 + rowsize / groupsize + 1; if (!linebuflen) goto overflow1; if (!len) goto nil; if (groupsize == 8) { const u64 *ptr8 = buf; for (j = 0; j < ngroups; j++) { ret = snprintf(linebuf + lx, linebuflen - lx, "%s%16.16llx", j ? " " : "", get_unaligned(ptr8 + j)); if (ret >= linebuflen - lx) goto overflow1; lx += ret; } } else if (groupsize == 4) { const u32 *ptr4 = buf; for (j = 0; j < ngroups; j++) { ret = snprintf(linebuf + lx, linebuflen - lx, "%s%8.8x", j ? " " : "", get_unaligned(ptr4 + j)); if (ret >= linebuflen - lx) goto overflow1; lx += ret; } } else if (groupsize == 2) { const u16 *ptr2 = buf; for (j = 0; j < ngroups; j++) { ret = snprintf(linebuf + lx, linebuflen - lx, "%s%4.4x", j ? " " : "", get_unaligned(ptr2 + j)); if (ret >= linebuflen - lx) goto overflow1; lx += ret; } } else { for (j = 0; j < len; j++) { if (linebuflen < lx + 2) goto overflow2; ch = ptr[j]; linebuf[lx++] = hex_asc_hi(ch); if (linebuflen < lx + 2) goto overflow2; linebuf[lx++] = hex_asc_lo(ch); if (linebuflen < lx + 2) goto overflow2; linebuf[lx++] = ' '; } if (j) lx--; } if (!ascii) goto nil; while (lx < ascii_column) { if (linebuflen < lx + 2) goto overflow2; linebuf[lx++] = ' '; } for (j = 0; j < len; j++) { if (linebuflen < lx + 2) goto overflow2; ch = ptr[j]; linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.'; } nil: linebuf[lx] = '\0'; return lx; overflow2: linebuf[lx++] = '\0'; overflow1: return ascii ? ascii_column + len : (groupsize * 2 + 1) * ngroups - 1; } EXPORT_SYMBOL(hex_dump_to_buffer); #ifdef CONFIG_PRINTK /** * print_hex_dump - print a text hex dump to syslog for a binary blob of data * @level: kernel log level (e.g. KERN_DEBUG) * @prefix_str: string to prefix each line with; * caller supplies trailing spaces for alignment if desired * @prefix_type: controls whether prefix of an offset, address, or none * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) * @rowsize: number of bytes to print per line; must be 16 or 32 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) * @buf: data blob to dump * @len: number of bytes in the @buf * @ascii: include ASCII after the hex output * * Given a buffer of u8 data, print_hex_dump() prints a hex + ASCII dump * to the kernel log at the specified kernel log level, with an optional * leading prefix. * * print_hex_dump() works on one "line" of output at a time, i.e., * 16 or 32 bytes of input data converted to hex + ASCII output. * print_hex_dump() iterates over the entire input @buf, breaking it into * "line size" chunks to format and print. * * E.g.: * print_hex_dump(KERN_DEBUG, "raw data: ", DUMP_PREFIX_ADDRESS, * 16, 1, frame->data, frame->len, true); * * Example output using %DUMP_PREFIX_OFFSET and 1-byte mode: * 0009ab42: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO * Example output using %DUMP_PREFIX_ADDRESS and 4-byte mode: * ffffffff88089af0: 73727170 77767574 7b7a7978 7f7e7d7c pqrstuvwxyz{|}~. */ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; unsigned char linebuf[32 * 3 + 2 + 32 + 1]; if (rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len; i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, linebuf, sizeof(linebuf), ascii); switch (prefix_type) { case DUMP_PREFIX_ADDRESS: printk("%s%s%p: %s\n", level, prefix_str, ptr + i, linebuf); break; case DUMP_PREFIX_OFFSET: printk("%s%s%.8x: %s\n", level, prefix_str, i, linebuf); break; default: printk("%s%s%s\n", level, prefix_str, linebuf); break; } } } EXPORT_SYMBOL(print_hex_dump); #endif /* defined(CONFIG_PRINTK) */ |
6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2017 Netronome Systems, Inc. * Copyright (C) 2019 Mellanox Technologies. All rights reserved */ #include <linux/completion.h> #include <linux/device.h> #include <linux/idr.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/sysfs.h> #include "netdevsim.h" static DEFINE_IDA(nsim_bus_dev_ids); static LIST_HEAD(nsim_bus_dev_list); static DEFINE_MUTEX(nsim_bus_dev_list_lock); static bool nsim_bus_enable; static refcount_t nsim_bus_devs; /* Including the bus itself. */ static DECLARE_COMPLETION(nsim_bus_devs_released); static struct nsim_bus_dev *to_nsim_bus_dev(struct device *dev) { return container_of(dev, struct nsim_bus_dev, dev); } static ssize_t nsim_bus_dev_numvfs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int num_vfs; int ret; ret = kstrtouint(buf, 0, &num_vfs); if (ret) return ret; device_lock(dev); ret = -ENOENT; if (dev_get_drvdata(dev)) ret = nsim_drv_configure_vfs(nsim_bus_dev, num_vfs); device_unlock(dev); return ret ? ret : count; } static ssize_t nsim_bus_dev_numvfs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return sprintf(buf, "%u\n", nsim_bus_dev->num_vfs); } static struct device_attribute nsim_bus_dev_numvfs_attr = __ATTR(sriov_numvfs, 0664, nsim_bus_dev_numvfs_show, nsim_bus_dev_numvfs_store); static ssize_t new_port_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int port_index; int ret; /* Prevent to use nsim_bus_dev before initialization. */ if (!smp_load_acquire(&nsim_bus_dev->init)) return -EBUSY; ret = kstrtouint(buf, 0, &port_index); if (ret) return ret; ret = nsim_drv_port_add(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index); return ret ? ret : count; } static struct device_attribute nsim_bus_dev_new_port_attr = __ATTR_WO(new_port); static ssize_t del_port_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int port_index; int ret; /* Prevent to use nsim_bus_dev before initialization. */ if (!smp_load_acquire(&nsim_bus_dev->init)) return -EBUSY; ret = kstrtouint(buf, 0, &port_index); if (ret) return ret; ret = nsim_drv_port_del(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index); return ret ? ret : count; } static struct device_attribute nsim_bus_dev_del_port_attr = __ATTR_WO(del_port); static struct attribute *nsim_bus_dev_attrs[] = { &nsim_bus_dev_numvfs_attr.attr, &nsim_bus_dev_new_port_attr.attr, &nsim_bus_dev_del_port_attr.attr, NULL, }; static const struct attribute_group nsim_bus_dev_attr_group = { .attrs = nsim_bus_dev_attrs, }; static const struct attribute_group *nsim_bus_dev_attr_groups[] = { &nsim_bus_dev_attr_group, NULL, }; static void nsim_bus_dev_release(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev; nsim_bus_dev = container_of(dev, struct nsim_bus_dev, dev); kfree(nsim_bus_dev); if (refcount_dec_and_test(&nsim_bus_devs)) complete(&nsim_bus_devs_released); } static struct device_type nsim_bus_dev_type = { .groups = nsim_bus_dev_attr_groups, .release = nsim_bus_dev_release, }; static struct nsim_bus_dev * nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues); static ssize_t new_device_store(const struct bus_type *bus, const char *buf, size_t count) { unsigned int id, port_count, num_queues; struct nsim_bus_dev *nsim_bus_dev; int err; err = sscanf(buf, "%u %u %u", &id, &port_count, &num_queues); switch (err) { case 1: port_count = 1; fallthrough; case 2: num_queues = 1; fallthrough; case 3: if (id > INT_MAX) { pr_err("Value of \"id\" is too big.\n"); return -EINVAL; } break; default: pr_err("Format for adding new device is \"id port_count num_queues\" (uint uint unit).\n"); return -EINVAL; } mutex_lock(&nsim_bus_dev_list_lock); /* Prevent to use resource before initialization. */ if (!smp_load_acquire(&nsim_bus_enable)) { err = -EBUSY; goto err; } nsim_bus_dev = nsim_bus_dev_new(id, port_count, num_queues); if (IS_ERR(nsim_bus_dev)) { err = PTR_ERR(nsim_bus_dev); goto err; } refcount_inc(&nsim_bus_devs); /* Allow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, true); list_add_tail(&nsim_bus_dev->list, &nsim_bus_dev_list); mutex_unlock(&nsim_bus_dev_list_lock); return count; err: mutex_unlock(&nsim_bus_dev_list_lock); return err; } static BUS_ATTR_WO(new_device); static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev); static ssize_t del_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev, *tmp; unsigned int id; int err; err = sscanf(buf, "%u", &id); switch (err) { case 1: if (id > INT_MAX) { pr_err("Value of \"id\" is too big.\n"); return -EINVAL; } break; default: pr_err("Format for deleting device is \"id\" (uint).\n"); return -EINVAL; } err = -ENOENT; mutex_lock(&nsim_bus_dev_list_lock); /* Prevent to use resource before initialization. */ if (!smp_load_acquire(&nsim_bus_enable)) { mutex_unlock(&nsim_bus_dev_list_lock); return -EBUSY; } list_for_each_entry_safe(nsim_bus_dev, tmp, &nsim_bus_dev_list, list) { if (nsim_bus_dev->dev.id != id) continue; list_del(&nsim_bus_dev->list); nsim_bus_dev_del(nsim_bus_dev); err = 0; break; } mutex_unlock(&nsim_bus_dev_list_lock); return !err ? count : err; } static BUS_ATTR_WO(del_device); static struct attribute *nsim_bus_attrs[] = { &bus_attr_new_device.attr, &bus_attr_del_device.attr, NULL }; ATTRIBUTE_GROUPS(nsim_bus); static int nsim_bus_probe(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return nsim_drv_probe(nsim_bus_dev); } static void nsim_bus_remove(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); nsim_drv_remove(nsim_bus_dev); } static int nsim_num_vf(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return nsim_bus_dev->num_vfs; } static struct bus_type nsim_bus = { .name = DRV_NAME, .dev_name = DRV_NAME, .bus_groups = nsim_bus_groups, .probe = nsim_bus_probe, .remove = nsim_bus_remove, .num_vf = nsim_num_vf, }; #define NSIM_BUS_DEV_MAX_VFS 4 static struct nsim_bus_dev * nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues) { struct nsim_bus_dev *nsim_bus_dev; int err; nsim_bus_dev = kzalloc(sizeof(*nsim_bus_dev), GFP_KERNEL); if (!nsim_bus_dev) return ERR_PTR(-ENOMEM); err = ida_alloc_range(&nsim_bus_dev_ids, id, id, GFP_KERNEL); if (err < 0) goto err_nsim_bus_dev_free; nsim_bus_dev->dev.id = err; nsim_bus_dev->dev.bus = &nsim_bus; nsim_bus_dev->dev.type = &nsim_bus_dev_type; nsim_bus_dev->port_count = port_count; nsim_bus_dev->num_queues = num_queues; nsim_bus_dev->initial_net = current->nsproxy->net_ns; nsim_bus_dev->max_vfs = NSIM_BUS_DEV_MAX_VFS; /* Disallow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, false); err = device_register(&nsim_bus_dev->dev); if (err) goto err_nsim_bus_dev_id_free; return nsim_bus_dev; err_nsim_bus_dev_id_free: ida_free(&nsim_bus_dev_ids, nsim_bus_dev->dev.id); put_device(&nsim_bus_dev->dev); nsim_bus_dev = NULL; err_nsim_bus_dev_free: kfree(nsim_bus_dev); return ERR_PTR(err); } static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev) { /* Disallow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, false); ida_free(&nsim_bus_dev_ids, nsim_bus_dev->dev.id); device_unregister(&nsim_bus_dev->dev); } static struct device_driver nsim_driver = { .name = DRV_NAME, .bus = &nsim_bus, .owner = THIS_MODULE, }; int nsim_bus_init(void) { int err; err = bus_register(&nsim_bus); if (err) return err; err = driver_register(&nsim_driver); if (err) goto err_bus_unregister; refcount_set(&nsim_bus_devs, 1); /* Allow using resources */ smp_store_release(&nsim_bus_enable, true); return 0; err_bus_unregister: bus_unregister(&nsim_bus); return err; } void nsim_bus_exit(void) { struct nsim_bus_dev *nsim_bus_dev, *tmp; /* Disallow using resources */ smp_store_release(&nsim_bus_enable, false); if (refcount_dec_and_test(&nsim_bus_devs)) complete(&nsim_bus_devs_released); mutex_lock(&nsim_bus_dev_list_lock); list_for_each_entry_safe(nsim_bus_dev, tmp, &nsim_bus_dev_list, list) { list_del(&nsim_bus_dev->list); nsim_bus_dev_del(nsim_bus_dev); } mutex_unlock(&nsim_bus_dev_list_lock); wait_for_completion(&nsim_bus_devs_released); driver_unregister(&nsim_driver); bus_unregister(&nsim_bus); } |
562 177 177 140 141 141 141 141 141 141 141 141 140 141 141 141 141 141 141 562 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2002 International Business Machines, Corp. * * This file is part of the SCTP kernel implementation * * These functions are the methods for accessing the SCTP inqueue. * * An SCTP inqueue is a queue into which you push SCTP packets * (which might be bundles or fragments of chunks) and out of which you * pop SCTP whole chunks. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <linux/interrupt.h> #include <linux/slab.h> /* Initialize an SCTP inqueue. */ void sctp_inq_init(struct sctp_inq *queue) { INIT_LIST_HEAD(&queue->in_chunk_list); queue->in_progress = NULL; /* Create a task for delivering data. */ INIT_WORK(&queue->immediate, NULL); } /* Release the memory associated with an SCTP inqueue. */ void sctp_inq_free(struct sctp_inq *queue) { struct sctp_chunk *chunk, *tmp; /* Empty the queue. */ list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } /* If there is a packet which is currently being worked on, * free it as well. */ if (queue->in_progress) { sctp_chunk_free(queue->in_progress); queue->in_progress = NULL; } } /* Put a new packet in an SCTP inqueue. * We assume that packet->sctp_hdr is set and in host byte order. */ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *chunk) { /* Directly call the packet handling routine. */ if (chunk->rcvr->dead) { sctp_chunk_free(chunk); return; } /* We are now calling this either from the soft interrupt * or from the backlog processing. * Eventually, we should clean up inqueue to not rely * on the BH related data structures. */ list_add_tail(&chunk->list, &q->in_chunk_list); if (chunk->asoc) chunk->asoc->stats.ipackets++; q->immediate.func(&q->immediate); } /* Peek at the next chunk on the inqeue. */ struct sctp_chunkhdr *sctp_inq_peek(struct sctp_inq *queue) { struct sctp_chunk *chunk; struct sctp_chunkhdr *ch = NULL; chunk = queue->in_progress; /* If there is no more chunks in this packet, say so */ if (chunk->singleton || chunk->end_of_packet || chunk->pdiscard) return NULL; ch = (struct sctp_chunkhdr *)chunk->chunk_end; return ch; } /* Extract a chunk from an SCTP inqueue. * * WARNING: If you need to put the chunk on another queue, you need to * make a shallow copy (clone) of it. */ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) { struct sctp_chunk *chunk; struct sctp_chunkhdr *ch = NULL; /* The assumption is that we are safe to process the chunks * at this time. */ chunk = queue->in_progress; if (chunk) { /* There is a packet that we have been working on. * Any post processing work to do before we move on? */ if (chunk->singleton || chunk->end_of_packet || chunk->pdiscard) { if (chunk->head_skb == chunk->skb) { chunk->skb = skb_shinfo(chunk->skb)->frag_list; goto new_skb; } if (chunk->skb->next) { chunk->skb = chunk->skb->next; goto new_skb; } if (chunk->head_skb) chunk->skb = chunk->head_skb; sctp_chunk_free(chunk); chunk = queue->in_progress = NULL; } else { /* Nothing to do. Next chunk in the packet, please. */ ch = (struct sctp_chunkhdr *)chunk->chunk_end; /* Force chunk->skb->data to chunk->chunk_end. */ skb_pull(chunk->skb, chunk->chunk_end - chunk->skb->data); /* We are guaranteed to pull a SCTP header. */ } } /* Do we need to take the next packet out of the queue to process? */ if (!chunk) { struct list_head *entry; next_chunk: /* Is the queue empty? */ entry = sctp_list_dequeue(&queue->in_chunk_list); if (!entry) return NULL; chunk = list_entry(entry, struct sctp_chunk, list); if (skb_is_gso(chunk->skb) && skb_is_gso_sctp(chunk->skb)) { /* GSO-marked skbs but without frags, handle * them normally */ if (skb_shinfo(chunk->skb)->frag_list) chunk->head_skb = chunk->skb; /* skbs with "cover letter" */ if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len) chunk->skb = skb_shinfo(chunk->skb)->frag_list; if (WARN_ON(!chunk->skb)) { __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); sctp_chunk_free(chunk); goto next_chunk; } } if (chunk->asoc) sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb); queue->in_progress = chunk; new_skb: /* This is the first chunk in the packet. */ ch = (struct sctp_chunkhdr *)chunk->skb->data; chunk->singleton = 1; chunk->data_accepted = 0; chunk->pdiscard = 0; chunk->auth = 0; chunk->has_asconf = 0; chunk->end_of_packet = 0; if (chunk->head_skb) { struct sctp_input_cb *cb = SCTP_INPUT_CB(chunk->skb), *head_cb = SCTP_INPUT_CB(chunk->head_skb); cb->chunk = head_cb->chunk; cb->af = head_cb->af; } } chunk->chunk_hdr = ch; chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); skb_pull(chunk->skb, sizeof(*ch)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ if (chunk->chunk_end + sizeof(*ch) <= skb_tail_pointer(chunk->skb)) { /* This is not a singleton */ chunk->singleton = 0; } else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) { /* Discard inside state machine. */ chunk->pdiscard = 1; chunk->chunk_end = skb_tail_pointer(chunk->skb); } else { /* We are at the end of the packet, so mark the chunk * in case we need to send a SACK. */ chunk->end_of_packet = 1; } pr_debug("+++sctp_inq_pop+++ chunk:%p[%s], length:%d, skb->len:%d\n", chunk, sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), ntohs(chunk->chunk_hdr->length), chunk->skb->len); return chunk; } /* Set a top-half handler. * * Originally, we the top-half handler was scheduled as a BH. We now * call the handler directly in sctp_inq_push() at a time that * we know we are lock safe. * The intent is that this routine will pull stuff out of the * inqueue and process it. */ void sctp_inq_set_th_handler(struct sctp_inq *q, work_func_t callback) { INIT_WORK(&q->immediate, callback); } |
8 7 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DCCP_H #define _LINUX_DCCP_H #include <linux/in.h> #include <linux/interrupt.h> #include <linux/ktime.h> #include <linux/list.h> #include <linux/uio.h> #include <linux/workqueue.h> #include <net/inet_connection_sock.h> #include <net/inet_sock.h> #include <net/inet_timewait_sock.h> #include <net/tcp_states.h> #include <uapi/linux/dccp.h> enum dccp_state { DCCP_OPEN = TCP_ESTABLISHED, DCCP_REQUESTING = TCP_SYN_SENT, DCCP_LISTEN = TCP_LISTEN, DCCP_RESPOND = TCP_SYN_RECV, /* * States involved in closing a DCCP connection: * 1) ACTIVE_CLOSEREQ is entered by a server sending a CloseReq. * * 2) CLOSING can have three different meanings (RFC 4340, 8.3): * a. Client has performed active-close, has sent a Close to the server * from state OPEN or PARTOPEN, and is waiting for the final Reset * (in this case, SOCK_DONE == 1). * b. Client is asked to perform passive-close, by receiving a CloseReq * in (PART)OPEN state. It sends a Close and waits for final Reset * (in this case, SOCK_DONE == 0). * c. Server performs an active-close as in (a), keeps TIMEWAIT state. * * 3) The following intermediate states are employed to give passively * closing nodes a chance to process their unread data: * - PASSIVE_CLOSE (from OPEN => CLOSED) and * - PASSIVE_CLOSEREQ (from (PART)OPEN to CLOSING; case (b) above). */ DCCP_ACTIVE_CLOSEREQ = TCP_FIN_WAIT1, DCCP_PASSIVE_CLOSE = TCP_CLOSE_WAIT, /* any node receiving a Close */ DCCP_CLOSING = TCP_CLOSING, DCCP_TIME_WAIT = TCP_TIME_WAIT, DCCP_CLOSED = TCP_CLOSE, DCCP_NEW_SYN_RECV = TCP_NEW_SYN_RECV, DCCP_PARTOPEN = TCP_MAX_STATES, DCCP_PASSIVE_CLOSEREQ, /* clients receiving CloseReq */ DCCP_MAX_STATES }; enum { DCCPF_OPEN = TCPF_ESTABLISHED, DCCPF_REQUESTING = TCPF_SYN_SENT, DCCPF_LISTEN = TCPF_LISTEN, DCCPF_RESPOND = TCPF_SYN_RECV, DCCPF_ACTIVE_CLOSEREQ = TCPF_FIN_WAIT1, DCCPF_CLOSING = TCPF_CLOSING, DCCPF_TIME_WAIT = TCPF_TIME_WAIT, DCCPF_CLOSED = TCPF_CLOSE, DCCPF_NEW_SYN_RECV = TCPF_NEW_SYN_RECV, DCCPF_PARTOPEN = (1 << DCCP_PARTOPEN), }; static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) { return (struct dccp_hdr *)skb_transport_header(skb); } static inline struct dccp_hdr *dccp_zeroed_hdr(struct sk_buff *skb, int headlen) { skb_push(skb, headlen); skb_reset_transport_header(skb); return memset(skb_transport_header(skb), 0, headlen); } static inline struct dccp_hdr_ext *dccp_hdrx(const struct dccp_hdr *dh) { return (struct dccp_hdr_ext *)((unsigned char *)dh + sizeof(*dh)); } static inline unsigned int __dccp_basic_hdr_len(const struct dccp_hdr *dh) { return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0); } static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); return __dccp_basic_hdr_len(dh); } static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh) { __u64 seq_nr = ntohs(dh->dccph_seq); if (dh->dccph_x != 0) seq_nr = (seq_nr << 32) + ntohl(dccp_hdrx(dh)->dccph_seq_low); else seq_nr += (u32)dh->dccph_seq2 << 16; return seq_nr; } static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb) { return (struct dccp_hdr_request *)(skb_transport_header(skb) + dccp_basic_hdr_len(skb)); } static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb) { return (struct dccp_hdr_ack_bits *)(skb_transport_header(skb) + dccp_basic_hdr_len(skb)); } static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb) { const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb); return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + ntohl(dhack->dccph_ack_nr_low); } static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb) { return (struct dccp_hdr_response *)(skb_transport_header(skb) + dccp_basic_hdr_len(skb)); } static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) { return (struct dccp_hdr_reset *)(skb_transport_header(skb) + dccp_basic_hdr_len(skb)); } static inline unsigned int __dccp_hdr_len(const struct dccp_hdr *dh) { return __dccp_basic_hdr_len(dh) + dccp_packet_hdr_len(dh->dccph_type); } static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) { return __dccp_hdr_len(dccp_hdr(skb)); } /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from * @dreq_iss: initial sequence number, sent on the first Response (RFC 4340, 7.1) * @dreq_gss: greatest sequence number sent (for retransmitted Responses) * @dreq_isr: initial sequence number received in the first Request * @dreq_gsr: greatest sequence number received (for retransmitted Request(s)) * @dreq_service: service code present on the Request (there is just one) * @dreq_featneg: feature negotiation options for this connection * The following two fields are analogous to the ones in dccp_sock: * @dreq_timestamp_echo: last received timestamp to echo (13.1) * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo */ struct dccp_request_sock { struct inet_request_sock dreq_inet_rsk; __u64 dreq_iss; __u64 dreq_gss; __u64 dreq_isr; __u64 dreq_gsr; __be32 dreq_service; spinlock_t dreq_lock; struct list_head dreq_featneg; __u32 dreq_timestamp_echo; __u32 dreq_timestamp_time; }; static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) { return (struct dccp_request_sock *)req; } extern struct inet_timewait_death_row dccp_death_row; extern int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, struct sk_buff *skb); struct dccp_options_received { u64 dccpor_ndp:48; u32 dccpor_timestamp; u32 dccpor_timestamp_echo; u32 dccpor_elapsed_time; }; struct ccid; enum dccp_role { DCCP_ROLE_UNDEFINED, DCCP_ROLE_LISTEN, DCCP_ROLE_CLIENT, DCCP_ROLE_SERVER, }; struct dccp_service_list { __u32 dccpsl_nr; __be32 dccpsl_list[]; }; #define DCCP_SERVICE_INVALID_VALUE htonl((__u32)-1) #define DCCP_SERVICE_CODE_IS_ABSENT 0 static inline bool dccp_list_has_service(const struct dccp_service_list *sl, const __be32 service) { if (likely(sl != NULL)) { u32 i = sl->dccpsl_nr; while (i--) if (sl->dccpsl_list[i] == service) return true; } return false; } struct dccp_ackvec; /** * struct dccp_sock - DCCP socket state * * @dccps_swl - sequence number window low * @dccps_swh - sequence number window high * @dccps_awl - acknowledgement number window low * @dccps_awh - acknowledgement number window high * @dccps_iss - initial sequence number sent * @dccps_isr - initial sequence number received * @dccps_osr - first OPEN sequence number received * @dccps_gss - greatest sequence number sent * @dccps_gsr - greatest valid sequence number received * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss * @dccps_service - first (passive sock) or unique (active sock) service code * @dccps_service_list - second .. last service code on passive socket * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo * @dccps_l_ack_ratio - feature-local Ack Ratio * @dccps_r_ack_ratio - feature-remote Ack Ratio * @dccps_l_seq_win - local Sequence Window (influences ack number validity) * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) * @dccps_options_received - parsed set of retrieved options * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy * @dccps_tx_qlen - maximum length of the TX queue * @dccps_role - role of this sock, one of %dccp_role * @dccps_hc_rx_insert_options - receiver wants to add options when acking * @dccps_hc_tx_insert_options - sender wants to add options when sending * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) */ struct dccp_sock { /* inet_connection_sock has to be the first member of dccp_sock */ struct inet_connection_sock dccps_inet_connection; #define dccps_syn_rtt dccps_inet_connection.icsk_ack.lrcvtime __u64 dccps_swl; __u64 dccps_swh; __u64 dccps_awl; __u64 dccps_awh; __u64 dccps_iss; __u64 dccps_isr; __u64 dccps_osr; __u64 dccps_gss; __u64 dccps_gsr; __u64 dccps_gar; __be32 dccps_service; __u32 dccps_mss_cache; struct dccp_service_list *dccps_service_list; __u32 dccps_timestamp_echo; __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; __u64 dccps_l_seq_win:48; __u64 dccps_r_seq_win:48; __u8 dccps_pcslen:4; __u8 dccps_pcrlen:4; __u8 dccps_send_ndp_count:1; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct list_head dccps_featneg; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; struct dccp_options_received dccps_options_received; __u8 dccps_qpolicy; __u32 dccps_tx_qlen; enum dccp_role dccps_role:2; __u8 dccps_hc_rx_insert_options:1; __u8 dccps_hc_tx_insert_options:1; __u8 dccps_server_timewait:1; __u8 dccps_sync_scheduled:1; struct tasklet_struct dccps_xmitlet; struct timer_list dccps_xmit_timer; }; #define dccp_sk(ptr) container_of_const(ptr, struct dccp_sock, \ dccps_inet_connection.icsk_inet.sk) static inline const char *dccp_role(const struct sock *sk) { switch (dccp_sk(sk)->dccps_role) { case DCCP_ROLE_UNDEFINED: return "undefined"; case DCCP_ROLE_LISTEN: return "listen"; case DCCP_ROLE_SERVER: return "server"; case DCCP_ROLE_CLIENT: return "client"; } return NULL; } extern void dccp_syn_ack_timeout(const struct request_sock *req); #endif /* _LINUX_DCCP_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ #ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ #define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ #include "main.h" #include <linux/kref.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> void batadv_gw_check_client_stop(struct batadv_priv *bat_priv); void batadv_gw_reselect(struct batadv_priv *bat_priv); void batadv_gw_election(struct batadv_priv *bat_priv); struct batadv_orig_node * batadv_gw_get_selected_orig(struct batadv_priv *bat_priv); void batadv_gw_check_election(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); void batadv_gw_node_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_gateway_data *gateway); void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); void batadv_gw_node_free(struct batadv_priv *bat_priv); void batadv_gw_node_release(struct kref *ref); struct batadv_gw_node * batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv); int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb); bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, u8 *chaddr); struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); /** * batadv_gw_node_put() - decrement the gw_node refcounter and possibly release * it * @gw_node: gateway node to free */ static inline void batadv_gw_node_put(struct batadv_gw_node *gw_node) { if (!gw_node) return; kref_put(&gw_node->refcount, batadv_gw_node_release); } #endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ |
60 60 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #ifndef _BR_PRIVATE_STP_H #define _BR_PRIVATE_STP_H #define BPDU_TYPE_CONFIG 0 #define BPDU_TYPE_TCN 0x80 /* IEEE 802.1D-1998 timer values */ #define BR_MIN_HELLO_TIME (1*HZ) #define BR_MAX_HELLO_TIME (10*HZ) #define BR_MIN_FORWARD_DELAY (2*HZ) #define BR_MAX_FORWARD_DELAY (30*HZ) #define BR_MIN_MAX_AGE (6*HZ) #define BR_MAX_MAX_AGE (40*HZ) #define BR_MIN_PATH_COST 1 #define BR_MAX_PATH_COST 65535 struct br_config_bpdu { unsigned int topology_change:1; unsigned int topology_change_ack:1; bridge_id root; int root_path_cost; bridge_id bridge_id; port_id port_id; int message_age; int max_age; int hello_time; int forward_delay; }; /* called under bridge lock */ static inline int br_is_designated_port(const struct net_bridge_port *p) { return !memcmp(&p->designated_bridge, &p->br->bridge_id, 8) && (p->designated_port == p->port_id); } /* br_stp.c */ void br_become_root_bridge(struct net_bridge *br); void br_config_bpdu_generation(struct net_bridge *); void br_configuration_update(struct net_bridge *); void br_port_state_selection(struct net_bridge *); void br_received_config_bpdu(struct net_bridge_port *p, const struct br_config_bpdu *bpdu); void br_received_tcn_bpdu(struct net_bridge_port *p); void br_transmit_config(struct net_bridge_port *p); void br_transmit_tcn(struct net_bridge *br); void br_topology_change_detection(struct net_bridge *br); void __br_set_topology_change(struct net_bridge *br, unsigned char val); /* br_stp_bpdu.c */ void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *); void br_send_tcn_bpdu(struct net_bridge_port *); #endif |
108 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 | // SPDX-License-Identifier: GPL-2.0-only /* * Software WEP encryption implementation * Copyright 2002, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2003, Instant802 Networks, Inc. * Copyright (C) 2023 Intel Corporation */ #include <linux/netdevice.h> #include <linux/types.h> #include <linux/random.h> #include <linux/compiler.h> #include <linux/crc32.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/mm.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <asm/unaligned.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "wep.h" void ieee80211_wep_init(struct ieee80211_local *local) { /* start WEP IV from a random value */ get_random_bytes(&local->wep_iv, IEEE80211_WEP_IV_LEN); } static inline bool ieee80211_wep_weak_iv(u32 iv, int keylen) { /* * Fluhrer, Mantin, and Shamir have reported weaknesses in the * key scheduling algorithm of RC4. At least IVs (KeyByte + 3, * 0xff, N) can be used to speedup attacks, so avoid using them. */ if ((iv & 0xff00) == 0xff00) { u8 B = (iv >> 16) & 0xff; if (B >= 3 && B < 3 + keylen) return true; } return false; } static void ieee80211_wep_get_iv(struct ieee80211_local *local, int keylen, int keyidx, u8 *iv) { local->wep_iv++; if (ieee80211_wep_weak_iv(local->wep_iv, keylen)) local->wep_iv += 0x0100; if (!iv) return; *iv++ = (local->wep_iv >> 16) & 0xff; *iv++ = (local->wep_iv >> 8) & 0xff; *iv++ = local->wep_iv & 0xff; *iv++ = keyidx << 6; } static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local, struct sk_buff *skb, int keylen, int keyidx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); unsigned int hdrlen; u8 *newhdr; hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); if (WARN_ON(skb_headroom(skb) < IEEE80211_WEP_IV_LEN)) return NULL; hdrlen = ieee80211_hdrlen(hdr->frame_control); newhdr = skb_push(skb, IEEE80211_WEP_IV_LEN); memmove(newhdr, newhdr + IEEE80211_WEP_IV_LEN, hdrlen); /* the HW only needs room for the IV, but not the actual IV */ if (info->control.hw_key && (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) return newhdr + hdrlen; ieee80211_wep_get_iv(local, keylen, keyidx, newhdr + hdrlen); return newhdr + hdrlen; } static void ieee80211_wep_remove_iv(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_key *key) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; unsigned int hdrlen; hdrlen = ieee80211_hdrlen(hdr->frame_control); memmove(skb->data + IEEE80211_WEP_IV_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_WEP_IV_LEN); } /* Perform WEP encryption using given key. data buffer must have tailroom * for 4-byte ICV. data_len must not include this ICV. Note: this function * does _not_ add IV. data = RC4(data | CRC32(data)) */ int ieee80211_wep_encrypt_data(struct arc4_ctx *ctx, u8 *rc4key, size_t klen, u8 *data, size_t data_len) { __le32 icv; icv = cpu_to_le32(~crc32_le(~0, data, data_len)); put_unaligned(icv, (__le32 *)(data + data_len)); arc4_setkey(ctx, rc4key, klen); arc4_crypt(ctx, data, data, data_len + IEEE80211_WEP_ICV_LEN); memzero_explicit(ctx, sizeof(*ctx)); return 0; } /* Perform WEP encryption on given skb. 4 bytes of extra space (IV) in the * beginning of the buffer 4 bytes of extra space (ICV) in the end of the * buffer will be added. Both IV and ICV will be transmitted, so the * payload length increases with 8 bytes. * * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data)) */ int ieee80211_wep_encrypt(struct ieee80211_local *local, struct sk_buff *skb, const u8 *key, int keylen, int keyidx) { u8 *iv; size_t len; u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN)) return -1; iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx); if (!iv) return -1; len = skb->len - (iv + IEEE80211_WEP_IV_LEN - skb->data); /* Prepend 24-bit IV to RC4 key */ memcpy(rc4key, iv, 3); /* Copy rest of the WEP key (the secret part) */ memcpy(rc4key + 3, key, keylen); /* Add room for ICV */ skb_put(skb, IEEE80211_WEP_ICV_LEN); return ieee80211_wep_encrypt_data(&local->wep_tx_ctx, rc4key, keylen + 3, iv + IEEE80211_WEP_IV_LEN, len); } /* Perform WEP decryption using given key. data buffer includes encrypted * payload, including 4-byte ICV, but _not_ IV. data_len must not include ICV. * Return 0 on success and -1 on ICV mismatch. */ int ieee80211_wep_decrypt_data(struct arc4_ctx *ctx, u8 *rc4key, size_t klen, u8 *data, size_t data_len) { __le32 crc; arc4_setkey(ctx, rc4key, klen); arc4_crypt(ctx, data, data, data_len + IEEE80211_WEP_ICV_LEN); memzero_explicit(ctx, sizeof(*ctx)); crc = cpu_to_le32(~crc32_le(~0, data, data_len)); if (memcmp(&crc, data + data_len, IEEE80211_WEP_ICV_LEN) != 0) /* ICV mismatch */ return -1; return 0; } /* Perform WEP decryption on given skb. Buffer includes whole WEP part of * the frame: IV (4 bytes), encrypted payload (including SNAP header), * ICV (4 bytes). skb->len includes both IV and ICV. * * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on * failure. If frame is OK, IV and ICV will be removed, i.e., decrypted payload * is moved to the beginning of the skb and skb length will be reduced. */ static int ieee80211_wep_decrypt(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_key *key) { u32 klen; u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; u8 keyidx; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; unsigned int hdrlen; size_t len; int ret = 0; if (!ieee80211_has_protected(hdr->frame_control)) return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (skb->len < hdrlen + IEEE80211_WEP_IV_LEN + IEEE80211_WEP_ICV_LEN) return -1; len = skb->len - hdrlen - IEEE80211_WEP_IV_LEN - IEEE80211_WEP_ICV_LEN; keyidx = skb->data[hdrlen + 3] >> 6; if (!key || keyidx != key->conf.keyidx) return -1; klen = 3 + key->conf.keylen; /* Prepend 24-bit IV to RC4 key */ memcpy(rc4key, skb->data + hdrlen, 3); /* Copy rest of the WEP key (the secret part) */ memcpy(rc4key + 3, key->conf.key, key->conf.keylen); if (ieee80211_wep_decrypt_data(&local->wep_rx_ctx, rc4key, klen, skb->data + hdrlen + IEEE80211_WEP_IV_LEN, len)) ret = -1; /* Trim ICV */ skb_trim(skb, skb->len - IEEE80211_WEP_ICV_LEN); /* Remove IV */ memmove(skb->data + IEEE80211_WEP_IV_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_WEP_IV_LEN); return ret; } ieee80211_rx_result ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx) { struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; __le16 fc = hdr->frame_control; if (!ieee80211_is_data(fc) && !ieee80211_is_auth(fc)) return RX_CONTINUE; if (!(status->flag & RX_FLAG_DECRYPTED)) { if (skb_linearize(rx->skb)) return RX_DROP_U_OOM; if (ieee80211_wep_decrypt(rx->local, rx->skb, rx->key)) return RX_DROP_U_WEP_DEC_FAIL; } else if (!(status->flag & RX_FLAG_IV_STRIPPED)) { if (!pskb_may_pull(rx->skb, ieee80211_hdrlen(fc) + IEEE80211_WEP_IV_LEN)) return RX_DROP_U_NO_IV; ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key); /* remove ICV */ if (!(status->flag & RX_FLAG_ICV_STRIPPED) && pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN)) return RX_DROP_U_NO_ICV; } return RX_CONTINUE; } static int wep_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_key_conf *hw_key = info->control.hw_key; if (!hw_key) { if (ieee80211_wep_encrypt(tx->local, skb, tx->key->conf.key, tx->key->conf.keylen, tx->key->conf.keyidx)) return -1; } else if ((hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) || (hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) { if (!ieee80211_wep_add_iv(tx->local, skb, tx->key->conf.keylen, tx->key->conf.keyidx)) return -1; } return 0; } ieee80211_tx_result ieee80211_crypto_wep_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; ieee80211_tx_set_protected(tx); skb_queue_walk(&tx->skbs, skb) { if (wep_encrypt_skb(tx, skb) < 0) { I802_DEBUG_INC(tx->local->tx_handlers_drop_wep); return TX_DROP; } } return TX_CONTINUE; } |
544 2911 3029 3027 3027 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef RQ_QOS_H #define RQ_QOS_H #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/blk_types.h> #include <linux/atomic.h> #include <linux/wait.h> #include <linux/blk-mq.h> #include "blk-mq-debugfs.h" struct blk_mq_debugfs_attr; enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, }; struct rq_wait { wait_queue_head_t wait; atomic_t inflight; }; struct rq_qos { const struct rq_qos_ops *ops; struct gendisk *disk; enum rq_qos_id id; struct rq_qos *next; #ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; #endif }; struct rq_qos_ops { void (*throttle)(struct rq_qos *, struct bio *); void (*track)(struct rq_qos *, struct request *, struct bio *); void (*merge)(struct rq_qos *, struct request *, struct bio *); void (*issue)(struct rq_qos *, struct request *); void (*requeue)(struct rq_qos *, struct request *); void (*done)(struct rq_qos *, struct request *); void (*done_bio)(struct rq_qos *, struct bio *); void (*cleanup)(struct rq_qos *, struct bio *); void (*queue_depth_changed)(struct rq_qos *); void (*exit)(struct rq_qos *); const struct blk_mq_debugfs_attr *debugfs_attrs; }; struct rq_depth { unsigned int max_depth; int scale_step; bool scaled_max; unsigned int queue_depth; unsigned int default_depth; }; static inline struct rq_qos *rq_qos_id(struct request_queue *q, enum rq_qos_id id) { struct rq_qos *rqos; for (rqos = q->rq_qos; rqos; rqos = rqos->next) { if (rqos->id == id) break; } return rqos; } static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_WBT); } static inline struct rq_qos *iolat_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_LATENCY); } static inline void rq_wait_init(struct rq_wait *rq_wait) { atomic_set(&rq_wait->inflight, 0); init_waitqueue_head(&rq_wait->wait); } int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, const struct rq_qos_ops *ops); void rq_qos_del(struct rq_qos *rqos); typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); void rq_qos_wait(struct rq_wait *rqw, void *private_data, acquire_inflight_cb_t *acquire_inflight_cb, cleanup_cb_t *cleanup_cb); bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); bool rq_depth_scale_up(struct rq_depth *rqd); bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio); void __rq_qos_done(struct rq_qos *rqos, struct request *rq); void __rq_qos_issue(struct rq_qos *rqos, struct request *rq); void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq); void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio); void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio); void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio); void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio); void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { if (q->rq_qos) __rq_qos_cleanup(q->rq_qos, bio); } static inline void rq_qos_done(struct request_queue *q, struct request *rq) { if (q->rq_qos && !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } static inline void rq_qos_issue(struct request_queue *q, struct request *rq) { if (q->rq_qos) __rq_qos_issue(q->rq_qos, rq); } static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) { if (q->rq_qos) __rq_qos_requeue(q->rq_qos, rq); } static inline void rq_qos_done_bio(struct bio *bio) { if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || bio_flagged(bio, BIO_QOS_MERGED))) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (q->rq_qos) __rq_qos_done_bio(q->rq_qos, bio); } } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { if (q->rq_qos) { bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } } static inline void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) { if (q->rq_qos) __rq_qos_track(q->rq_qos, rq, bio); } static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { if (q->rq_qos) { bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); } } static inline void rq_qos_queue_depth_changed(struct request_queue *q) { if (q->rq_qos) __rq_qos_queue_depth_changed(q->rq_qos); } void rq_qos_exit(struct request_queue *); #endif |
17418 17418 17412 11265 11268 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 | // SPDX-License-Identifier: GPL-2.0 /* * trace context switch * * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> * */ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> #include <trace/events/sched.h> #include "trace.h" #define RECORD_CMDLINE 1 #define RECORD_TGID 2 static int sched_cmdline_ref; static int sched_tgid_ref; static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state) { int flags; flags = (RECORD_TGID * !!sched_tgid_ref) + (RECORD_CMDLINE * !!sched_cmdline_ref); if (!flags) return; tracing_record_taskinfo_sched_switch(prev, next, flags); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee) { int flags; flags = (RECORD_TGID * !!sched_tgid_ref) + (RECORD_CMDLINE * !!sched_cmdline_ref); if (!flags) return; tracing_record_taskinfo_sched_switch(current, wakee, flags); } static int tracing_sched_register(void) { int ret; ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } ret = register_trace_sched_switch(probe_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } return ret; fail_deprobe_wake_new: unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); fail_deprobe: unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); return ret; } static void tracing_sched_unregister(void) { unregister_trace_sched_switch(probe_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } static void tracing_start_sched_switch(int ops) { bool sched_register; mutex_lock(&sched_register_mutex); sched_register = (!sched_cmdline_ref && !sched_tgid_ref); switch (ops) { case RECORD_CMDLINE: sched_cmdline_ref++; break; case RECORD_TGID: sched_tgid_ref++; break; } if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) tracing_sched_register(); mutex_unlock(&sched_register_mutex); } static void tracing_stop_sched_switch(int ops) { mutex_lock(&sched_register_mutex); switch (ops) { case RECORD_CMDLINE: sched_cmdline_ref--; break; case RECORD_TGID: sched_tgid_ref--; break; } if (!sched_cmdline_ref && !sched_tgid_ref) tracing_sched_unregister(); mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) { tracing_start_sched_switch(RECORD_CMDLINE); } void tracing_stop_cmdline_record(void) { tracing_stop_sched_switch(RECORD_CMDLINE); } void tracing_start_tgid_record(void) { tracing_start_sched_switch(RECORD_TGID); } void tracing_stop_tgid_record(void) { tracing_stop_sched_switch(RECORD_TGID); } |
6 8 8 157 157 156 156 157 152 17 5 116 1 1 11 11 11 11 11 11 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/switchdev.h> #include "br_private.h" static struct static_key_false br_switchdev_tx_fwd_offload; static bool nbp_switchdev_can_offload_tx_fwd(const struct net_bridge_port *p, const struct sk_buff *skb) { if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) return false; return (p->flags & BR_TX_FWD_OFFLOAD) && (p->hwdom != BR_INPUT_SKB_CB(skb)->src_hwdom); } bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) { if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) return false; return BR_INPUT_SKB_CB(skb)->tx_fwd_offload; } void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb) { skb->offload_fwd_mark = br_switchdev_frame_uses_tx_fwd_offload(skb); } /* Mark the frame for TX forwarding offload if this egress port supports it */ void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, struct sk_buff *skb) { if (nbp_switchdev_can_offload_tx_fwd(p, skb)) BR_INPUT_SKB_CB(skb)->tx_fwd_offload = true; } /* Lazily adds the hwdom of the egress bridge port to the bit mask of hwdoms * that the skb has been already forwarded to, to avoid further cloning to * other ports in the same hwdom by making nbp_switchdev_allowed_egress() * return false. */ void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, struct sk_buff *skb) { if (nbp_switchdev_can_offload_tx_fwd(p, skb)) set_bit(p->hwdom, &BR_INPUT_SKB_CB(skb)->fwd_hwdoms); } void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb) { if (p->hwdom) BR_INPUT_SKB_CB(skb)->src_hwdom = p->hwdom; } bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, const struct sk_buff *skb) { struct br_input_skb_cb *cb = BR_INPUT_SKB_CB(skb); return !test_bit(p->hwdom, &cb->fwd_hwdoms) && (!skb->offload_fwd_mark || cb->src_hwdom != p->hwdom); } /* Flags that can be offloaded to hardware */ #define BR_PORT_FLAGS_HW_OFFLOAD (BR_LEARNING | BR_FLOOD | BR_PORT_MAB | \ BR_MCAST_FLOOD | BR_BCAST_FLOOD | BR_PORT_LOCKED | \ BR_HAIRPIN_MODE | BR_ISOLATED | BR_MULTICAST_TO_UNICAST) int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, unsigned long mask, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = p->dev, }; struct switchdev_notifier_port_attr_info info = { .attr = &attr, }; int err; mask &= BR_PORT_FLAGS_HW_OFFLOAD; if (!mask) return 0; attr.id = SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS; attr.u.brport_flags.val = flags; attr.u.brport_flags.mask = mask; /* We run from atomic context here */ err = call_switchdev_notifiers(SWITCHDEV_PORT_ATTR_SET, p->dev, &info.info, extack); err = notifier_to_errno(err); if (err == -EOPNOTSUPP) return 0; if (err) { NL_SET_ERR_MSG_WEAK_MOD(extack, "bridge flag offload is not supported"); return -EOPNOTSUPP; } attr.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS; attr.flags = SWITCHDEV_F_DEFER; err = switchdev_port_attr_set(p->dev, &attr, extack); if (err) { NL_SET_ERR_MSG_WEAK_MOD(extack, "error setting offload flag on port"); return err; } return 0; } static void br_switchdev_fdb_populate(struct net_bridge *br, struct switchdev_notifier_fdb_info *item, const struct net_bridge_fdb_entry *fdb, const void *ctx) { const struct net_bridge_port *p = READ_ONCE(fdb->dst); item->addr = fdb->key.addr.addr; item->vid = fdb->key.vlan_id; item->added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); item->offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); item->is_local = test_bit(BR_FDB_LOCAL, &fdb->flags); item->locked = false; item->info.dev = (!p || item->is_local) ? br->dev : p->dev; item->info.ctx = ctx; } void br_switchdev_fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type) { struct switchdev_notifier_fdb_info item; if (test_bit(BR_FDB_LOCKED, &fdb->flags)) return; /* Entries with these flags were created using ndm_state == NUD_REACHABLE, * ndm_flags == NTF_MASTER( | NTF_STICKY), ext_flags == 0 by something * equivalent to 'bridge fdb add ... master dynamic (sticky)'. * Drivers don't know how to deal with these, so don't notify them to * avoid confusing them. */ if (test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags) && !test_bit(BR_FDB_STATIC, &fdb->flags) && !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) return; br_switchdev_fdb_populate(br, &item, fdb, NULL); switch (type) { case RTM_DELNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_DEVICE, item.info.dev, &item.info, NULL); break; case RTM_NEWNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_DEVICE, item.info.dev, &item.info, NULL); break; } } int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, bool changed, struct netlink_ext_ack *extack) { struct switchdev_obj_port_vlan v = { .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .flags = flags, .vid = vid, .changed = changed, }; return switchdev_port_obj_add(dev, &v.obj, extack); } int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) { struct switchdev_obj_port_vlan v = { .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .vid = vid, }; return switchdev_port_obj_del(dev, &v.obj); } static int nbp_switchdev_hwdom_set(struct net_bridge_port *joining) { struct net_bridge *br = joining->br; struct net_bridge_port *p; int hwdom; /* joining is yet to be added to the port list. */ list_for_each_entry(p, &br->port_list, list) { if (netdev_phys_item_id_same(&joining->ppid, &p->ppid)) { joining->hwdom = p->hwdom; return 0; } } hwdom = find_next_zero_bit(&br->busy_hwdoms, BR_HWDOM_MAX, 1); if (hwdom >= BR_HWDOM_MAX) return -EBUSY; set_bit(hwdom, &br->busy_hwdoms); joining->hwdom = hwdom; return 0; } static void nbp_switchdev_hwdom_put(struct net_bridge_port *leaving) { struct net_bridge *br = leaving->br; struct net_bridge_port *p; /* leaving is no longer in the port list. */ list_for_each_entry(p, &br->port_list, list) { if (p->hwdom == leaving->hwdom) return; } clear_bit(leaving->hwdom, &br->busy_hwdoms); } static int nbp_switchdev_add(struct net_bridge_port *p, struct netdev_phys_item_id ppid, bool tx_fwd_offload, struct netlink_ext_ack *extack) { int err; if (p->offload_count) { /* Prevent unsupported configurations such as a bridge port * which is a bonding interface, and the member ports are from * different hardware switches. */ if (!netdev_phys_item_id_same(&p->ppid, &ppid)) { NL_SET_ERR_MSG_MOD(extack, "Same bridge port cannot be offloaded by two physical switches"); return -EBUSY; } /* Tolerate drivers that call switchdev_bridge_port_offload() * more than once for the same bridge port, such as when the * bridge port is an offloaded bonding/team interface. */ p->offload_count++; return 0; } p->ppid = ppid; p->offload_count = 1; err = nbp_switchdev_hwdom_set(p); if (err) return err; if (tx_fwd_offload) { p->flags |= BR_TX_FWD_OFFLOAD; static_branch_inc(&br_switchdev_tx_fwd_offload); } return 0; } static void nbp_switchdev_del(struct net_bridge_port *p) { if (WARN_ON(!p->offload_count)) return; p->offload_count--; if (p->offload_count) return; if (p->hwdom) nbp_switchdev_hwdom_put(p); if (p->flags & BR_TX_FWD_OFFLOAD) { p->flags &= ~BR_TX_FWD_OFFLOAD; static_branch_dec(&br_switchdev_tx_fwd_offload); } } static int br_switchdev_fdb_replay_one(struct net_bridge *br, struct notifier_block *nb, const struct net_bridge_fdb_entry *fdb, unsigned long action, const void *ctx) { struct switchdev_notifier_fdb_info item; int err; br_switchdev_fdb_populate(br, &item, fdb, ctx); err = nb->notifier_call(nb, action, &item); return notifier_to_errno(err); } static int br_switchdev_fdb_replay(const struct net_device *br_dev, const void *ctx, bool adding, struct notifier_block *nb) { struct net_bridge_fdb_entry *fdb; struct net_bridge *br; unsigned long action; int err = 0; if (!nb) return 0; if (!netif_is_bridge_master(br_dev)) return -EINVAL; br = netdev_priv(br_dev); if (adding) action = SWITCHDEV_FDB_ADD_TO_DEVICE; else action = SWITCHDEV_FDB_DEL_TO_DEVICE; rcu_read_lock(); hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) { err = br_switchdev_fdb_replay_one(br, nb, fdb, action, ctx); if (err) break; } rcu_read_unlock(); return err; } static int br_switchdev_vlan_attr_replay(struct net_device *br_dev, const void *ctx, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_attr_info attr_info = { .info = { .dev = br_dev, .extack = extack, .ctx = ctx, }, }; struct net_bridge *br = netdev_priv(br_dev); struct net_bridge_vlan_group *vg; struct switchdev_attr attr; struct net_bridge_vlan *v; int err; attr_info.attr = &attr; attr.orig_dev = br_dev; vg = br_vlan_group(br); if (!vg) return 0; list_for_each_entry(v, &vg->vlan_list, vlist) { if (v->msti) { attr.id = SWITCHDEV_ATTR_ID_VLAN_MSTI; attr.u.vlan_msti.vid = v->vid; attr.u.vlan_msti.msti = v->msti; err = nb->notifier_call(nb, SWITCHDEV_PORT_ATTR_SET, &attr_info); err = notifier_to_errno(err); if (err) return err; } } return 0; } static int br_switchdev_vlan_replay_one(struct notifier_block *nb, struct net_device *dev, struct switchdev_obj_port_vlan *vlan, const void *ctx, unsigned long action, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { .dev = dev, .extack = extack, .ctx = ctx, }, .obj = &vlan->obj, }; int err; err = nb->notifier_call(nb, action, &obj_info); return notifier_to_errno(err); } static int br_switchdev_vlan_replay_group(struct notifier_block *nb, struct net_device *dev, struct net_bridge_vlan_group *vg, const void *ctx, unsigned long action, struct netlink_ext_ack *extack) { struct net_bridge_vlan *v; int err = 0; u16 pvid; if (!vg) return 0; pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { struct switchdev_obj_port_vlan vlan = { .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .flags = br_vlan_flags(v, pvid), .vid = v->vid, }; if (!br_vlan_should_use(v)) continue; err = br_switchdev_vlan_replay_one(nb, dev, &vlan, ctx, action, extack); if (err) return err; } return 0; } static int br_switchdev_vlan_replay(struct net_device *br_dev, const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(br_dev); struct net_bridge_port *p; unsigned long action; int err; ASSERT_RTNL(); if (!nb) return 0; if (!netif_is_bridge_master(br_dev)) return -EINVAL; if (adding) action = SWITCHDEV_PORT_OBJ_ADD; else action = SWITCHDEV_PORT_OBJ_DEL; err = br_switchdev_vlan_replay_group(nb, br_dev, br_vlan_group(br), ctx, action, extack); if (err) return err; list_for_each_entry(p, &br->port_list, list) { struct net_device *dev = p->dev; err = br_switchdev_vlan_replay_group(nb, dev, nbp_vlan_group(p), ctx, action, extack); if (err) return err; } if (adding) { err = br_switchdev_vlan_attr_replay(br_dev, ctx, nb, extack); if (err) return err; } return 0; } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct br_switchdev_mdb_complete_info { struct net_bridge_port *port; struct br_ip ip; }; static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *priv) { struct br_switchdev_mdb_complete_info *data = priv; struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; struct net_bridge_mdb_entry *mp; struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; if (err) goto err; spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &data->ip); if (!mp) goto out; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (p->key.port != port) continue; p->flags |= MDB_PG_FLAGS_OFFLOAD; } out: spin_unlock_bh(&br->multicast_lock); err: kfree(priv); } static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb, const struct net_bridge_mdb_entry *mp) { if (mp->addr.proto == htons(ETH_P_IP)) ip_eth_mc_map(mp->addr.dst.ip4, mdb->addr); #if IS_ENABLED(CONFIG_IPV6) else if (mp->addr.proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb->addr); #endif else ether_addr_copy(mdb->addr, mp->addr.dst.mac_addr); mdb->vid = mp->addr.vid; } static void br_switchdev_host_mdb_one(struct net_device *dev, struct net_device *lower_dev, struct net_bridge_mdb_entry *mp, int type) { struct switchdev_obj_port_mdb mdb = { .obj = { .id = SWITCHDEV_OBJ_ID_HOST_MDB, .flags = SWITCHDEV_F_DEFER, .orig_dev = dev, }, }; br_switchdev_mdb_populate(&mdb, mp); switch (type) { case RTM_NEWMDB: switchdev_port_obj_add(lower_dev, &mdb.obj, NULL); break; case RTM_DELMDB: switchdev_port_obj_del(lower_dev, &mdb.obj); break; } } static void br_switchdev_host_mdb(struct net_device *dev, struct net_bridge_mdb_entry *mp, int type) { struct net_device *lower_dev; struct list_head *iter; netdev_for_each_lower_dev(dev, lower_dev, iter) br_switchdev_host_mdb_one(dev, lower_dev, mp, type); } static int br_switchdev_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, const struct switchdev_obj_port_mdb *mdb, unsigned long action, const void *ctx, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { .dev = dev, .extack = extack, .ctx = ctx, }, .obj = &mdb->obj, }; int err; err = nb->notifier_call(nb, action, &obj_info); return notifier_to_errno(err); } static int br_switchdev_mdb_queue_one(struct list_head *mdb_list, enum switchdev_obj_id id, const struct net_bridge_mdb_entry *mp, struct net_device *orig_dev) { struct switchdev_obj_port_mdb *mdb; mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC); if (!mdb) return -ENOMEM; mdb->obj.id = id; mdb->obj.orig_dev = orig_dev; br_switchdev_mdb_populate(mdb, mp); list_add_tail(&mdb->obj.list, mdb_list); return 0; } void br_switchdev_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { struct br_switchdev_mdb_complete_info *complete_info; struct switchdev_obj_port_mdb mdb = { .obj = { .id = SWITCHDEV_OBJ_ID_PORT_MDB, .flags = SWITCHDEV_F_DEFER, }, }; if (!pg) return br_switchdev_host_mdb(dev, mp, type); br_switchdev_mdb_populate(&mdb, mp); mdb.obj.orig_dev = pg->key.port->dev; switch (type) { case RTM_NEWMDB: complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); if (!complete_info) break; complete_info->port = pg->key.port; complete_info->ip = mp->addr; mdb.obj.complete_priv = complete_info; mdb.obj.complete = br_switchdev_mdb_complete; if (switchdev_port_obj_add(pg->key.port->dev, &mdb.obj, NULL)) kfree(complete_info); break; case RTM_DELMDB: switchdev_port_obj_del(pg->key.port->dev, &mdb.obj); break; } } #endif static int br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev, const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack) { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING const struct net_bridge_mdb_entry *mp; struct switchdev_obj *obj, *tmp; struct net_bridge *br; unsigned long action; LIST_HEAD(mdb_list); int err = 0; ASSERT_RTNL(); if (!nb) return 0; if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) return -EINVAL; br = netdev_priv(br_dev); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return 0; /* We cannot walk over br->mdb_list protected just by the rtnl_mutex, * because the write-side protection is br->multicast_lock. But we * need to emulate the [ blocking ] calling context of a regular * switchdev event, so since both br->multicast_lock and RCU read side * critical sections are atomic, we have no choice but to pick the RCU * read side lock, queue up all our events, leave the critical section * and notify switchdev from blocking context. */ rcu_read_lock(); hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group __rcu * const *pp; const struct net_bridge_port_group *p; if (mp->host_joined) { err = br_switchdev_mdb_queue_one(&mdb_list, SWITCHDEV_OBJ_ID_HOST_MDB, mp, br_dev); if (err) { rcu_read_unlock(); goto out_free_mdb; } } for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { if (p->key.port->dev != dev) continue; err = br_switchdev_mdb_queue_one(&mdb_list, SWITCHDEV_OBJ_ID_PORT_MDB, mp, dev); if (err) { rcu_read_unlock(); goto out_free_mdb; } } } rcu_read_unlock(); if (adding) action = SWITCHDEV_PORT_OBJ_ADD; else action = SWITCHDEV_PORT_OBJ_DEL; list_for_each_entry(obj, &mdb_list, list) { err = br_switchdev_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), action, ctx, extack); if (err == -EOPNOTSUPP) err = 0; if (err) goto out_free_mdb; } out_free_mdb: list_for_each_entry_safe(obj, tmp, &mdb_list, list) { list_del(&obj->list); kfree(SWITCHDEV_OBJ_PORT_MDB(obj)); } if (err) return err; #endif return 0; } static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, struct netlink_ext_ack *extack) { struct net_device *br_dev = p->br->dev; struct net_device *dev = p->dev; int err; err = br_switchdev_vlan_replay(br_dev, ctx, true, blocking_nb, extack); if (err && err != -EOPNOTSUPP) return err; err = br_switchdev_mdb_replay(br_dev, dev, ctx, true, blocking_nb, extack); if (err) { /* -EOPNOTSUPP not propagated from MDB replay. */ return err; } err = br_switchdev_fdb_replay(br_dev, ctx, true, atomic_nb); if (err && err != -EOPNOTSUPP) return err; return 0; } static void nbp_switchdev_unsync_objs(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb) { struct net_device *br_dev = p->br->dev; struct net_device *dev = p->dev; br_switchdev_fdb_replay(br_dev, ctx, false, atomic_nb); br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL); br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL); } /* Let the bridge know that this port is offloaded, so that it can assign a * switchdev hardware domain to it. */ int br_switchdev_port_offload(struct net_bridge_port *p, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, bool tx_fwd_offload, struct netlink_ext_ack *extack) { struct netdev_phys_item_id ppid; int err; err = dev_get_port_parent_id(dev, &ppid, false); if (err) return err; err = nbp_switchdev_add(p, ppid, tx_fwd_offload, extack); if (err) return err; err = nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack); if (err) goto out_switchdev_del; return 0; out_switchdev_del: nbp_switchdev_del(p); return err; } void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb) { nbp_switchdev_unsync_objs(p, ctx, atomic_nb, blocking_nb); nbp_switchdev_del(p); } int br_switchdev_port_replay(struct net_bridge_port *p, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, struct netlink_ext_ack *extack) { return nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack); } |
56 120 113 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 | /* SPDX-License-Identifier: GPL-2.0 * * page_pool/helpers.h * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ /** * DOC: page_pool allocator * * The page_pool allocator is optimized for recycling page or page fragment used * by skb packet and xdp frame. * * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), * which allocate memory with or without page splitting depending on the * requested memory size. * * If the driver knows that it always requires full pages or its allocations are * always smaller than half a page, it can use one of the more specific API * calls: * * 1. page_pool_alloc_pages(): allocate memory without page splitting when * driver knows that the memory it need is always bigger than half of the page * allocated from page pool. There is no cache line dirtying for 'struct page' * when a page is recycled back to the page pool. * * 2. page_pool_alloc_frag(): allocate memory with page splitting when driver * knows that the memory it need is always smaller than or equal to half of the * page allocated from page pool. Page splitting enables memory saving and thus * avoids TLB/cache miss for data access, but there also is some cost to * implement page splitting, mainly some cache line dirtying/bouncing for * 'struct page' and atomic operation for page->pp_ref_count. * * The API keeps track of in-flight pages, in order to let API users know when * it is safe to free a page_pool object, the API users must call * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * * page_pool_put_page() may be called multiple times on the same page if a page * is split into multiple fragments. For the last fragment, it will either * recycle the page, or in case of page->_refcount > 1, it will release the DMA * mapping and in-flight state accounting. * * dma_sync_single_range_for_device() is only called for the last fragment when * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the * last freed fragment to do the sync_for_device operation for all fragments in * the same page when a page is split. The API user must setup pool->p.max_len * and pool->p.offset correctly and ensure that page_pool_put_page() is called * with dma_sync_size being -1 for fragment API. */ #ifndef _NET_PAGE_POOL_HELPERS_H #define _NET_PAGE_POOL_HELPERS_H #include <net/page_pool/types.h> #ifdef CONFIG_PAGE_POOL_STATS /* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); u64 *page_pool_ethtool_stats_get(u64 *data, void *stats); bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); #else static inline int page_pool_ethtool_stats_get_count(void) { return 0; } static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) { return data; } static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) { return data; } #endif /** * page_pool_dev_alloc_pages() - allocate a page. * @pool: pool from which to allocate * * Get a page from the page allocator or page_pool caches. */ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_pages(pool, gfp); } /** * page_pool_dev_alloc_frag() - allocate a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: requested size * * Get a page fragment from the page allocator or page_pool caches. * * Return: * Return allocated page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_frag(pool, offset, size, gfp); } static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; struct page *page; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; return page_pool_alloc_pages(pool, gfp); } page = page_pool_alloc_frag(pool, offset, *size, gfp); if (unlikely(!page)) return NULL; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize * underestimate problem. */ if (pool->frag_offset + *size > max_size) { *size = max_size - *offset; pool->frag_offset = max_size; } return page; } /** * page_pool_dev_alloc() - allocate a page or a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: in as the requested size, out as the allocated size * * Get a page or a page fragment from the page allocator or page_pool caches * depending on the requested size in order to allocate memory with least memory * utilization and performance penalty. * * Return: * Return allocated page or page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc(pool, offset, size, gfp); } static inline void *page_pool_alloc_va(struct page_pool *pool, unsigned int *size, gfp_t gfp) { unsigned int offset; struct page *page; /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */ page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM); if (unlikely(!page)) return NULL; return page_address(page) + offset; } /** * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its * va. * @pool: pool from which to allocate * @size: in as the requested size, out as the allocated size * * This is just a thin wrapper around the page_pool_alloc() API, and * it returns va of the allocated page or page fragment. * * Return: * Return the va for the allocated page or page fragment, otherwise return NULL. */ static inline void *page_pool_dev_alloc_va(struct page_pool *pool, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_va(pool, size, gfp); } /** * page_pool_get_dma_dir() - Retrieve the stored DMA direction. * @pool: pool from which page was allocated * * Get the stored dma direction. A driver might decide to store this locally * and avoid the extra cache line from page_pool to determine the direction. */ static inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) { return pool->p.dma_dir; } /** * page_pool_fragment_page() - split a fresh page into fragments * @page: page to split * @nr: references to set * * pp_ref_count represents the number of outstanding references to the page, * which will be freed using page_pool APIs (rather than page allocator APIs * like put_page()). Such references are usually held by page_pool-aware * objects like skbs marked for page pool recycling. * * This helper allows the caller to take (set) multiple references to a * freshly allocated page. The page must be freshly allocated (have a * pp_ref_count of 1). This is commonly done by drivers and * "fragment allocators" to save atomic operations - either when they know * upfront how many references they will need; or to take MAX references and * return the unused ones with a single atomic dec(), instead of performing * multiple atomic inc() operations. */ static inline void page_pool_fragment_page(struct page *page, long nr) { atomic_long_set(&page->pp_ref_count, nr); } static inline long page_pool_unref_page(struct page *page, long nr) { long ret; /* If nr == pp_ref_count then we have cleared all remaining * references to the page: * 1. 'n == 1': no need to actually overwrite it. * 2. 'n != 1': overwrite it with one, which is the rare case * for pp_ref_count draining. * * The main advantage to doing this is that not only we avoid a atomic * update, as an atomic_read is generally a much cheaper operation than * an atomic update, especially when dealing with a page that may be * referenced by only 2 or 3 users; but also unify the pp_ref_count * handling by ensuring all pages have partitioned into only 1 piece * initially, and only overwrite it when the page is partitioned into * more than one piece. */ if (atomic_long_read(&page->pp_ref_count) == nr) { /* As we have ensured nr is always one for constant case using * the BUILD_BUG_ON(), only need to handle the non-constant case * here for pp_ref_count draining, which is a rare case. */ BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); if (!__builtin_constant_p(nr)) atomic_long_set(&page->pp_ref_count, 1); return 0; } ret = atomic_long_sub_return(nr, &page->pp_ref_count); WARN_ON(ret < 0); /* We are the last user here too, reset pp_ref_count back to 1 to * ensure all pages have been partitioned into 1 piece initially, * this should be the rare case when the last two fragment users call * page_pool_unref_page() currently. */ if (unlikely(!ret)) atomic_long_set(&page->pp_ref_count, 1); return ret; } static inline void page_pool_ref_page(struct page *page) { atomic_long_inc(&page->pp_ref_count); } static inline bool page_pool_is_last_ref(struct page *page) { /* If page_pool_unref_page() returns 0, we were the last user */ return page_pool_unref_page(page, 1) == 0; } /** * page_pool_put_page() - release a reference to a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @dma_sync_size: how much of the page may have been touched by the device * @allow_direct: released by the consumer, allow lockless caching * * The outcome of this depends on the page refcnt. If the driver bumps * the refcnt > 1 this will unmap the page. If the page refcnt is 1 * the allocator owns the page and will try to recycle it in one of the pool * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device * using dma_sync_single_range_for_device(). */ static inline void page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL if (!page_pool_is_last_ref(page)) return; page_pool_put_unrefed_page(pool, page, dma_sync_size, allow_direct); #endif } /** * page_pool_put_full_page() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @allow_direct: released by the consumer, allow lockless caching * * Similar to page_pool_put_page(), but will DMA sync the entire memory area * as configured in &page_pool_params.max_len. */ static inline void page_pool_put_full_page(struct page_pool *pool, struct page *page, bool allow_direct) { page_pool_put_page(pool, page, -1, allow_direct); } /** * page_pool_recycle_direct() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * * Similar to page_pool_put_full_page() but caller must guarantee safe context * (e.g NAPI), since it will recycle the page directly into the pool fast cache. */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { page_pool_put_full_page(pool, page, true); } #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) /** * page_pool_free_va() - free a va into the page_pool * @pool: pool from which va was allocated * @va: va to be freed * @allow_direct: freed by the consumer, allow lockless caching * * Free a va allocated from page_pool_allo_va(). */ static inline void page_pool_free_va(struct page_pool *pool, void *va, bool allow_direct) { page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct); } /** * page_pool_get_dma_addr() - Retrieve the stored DMA address. * @page: page allocated from a page pool * * Fetch the DMA address of the page. The page pool to which the page belongs * must had been created with PP_FLAG_DMA_MAP. */ static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { dma_addr_t ret = page->dma_addr; if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) ret <<= PAGE_SHIFT; return ret; } static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) { if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) { page->dma_addr = addr >> PAGE_SHIFT; /* We assume page alignment to shave off bottom bits, * if this "compression" doesn't work we need to drop. */ return addr != (dma_addr_t)page->dma_addr << PAGE_SHIFT; } page->dma_addr = addr; return false; } static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); } static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) { if (unlikely(pool->p.nid != new_nid)) page_pool_update_nid(pool, new_nid); } #endif /* _NET_PAGE_POOL_HELPERS_H */ |
11 424 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux NET3: Internet Group Management Protocol [IGMP] * * Authors: * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Extended to talk the BSD extended IGMP protocol of mrouted 3.6 */ #ifndef _LINUX_IGMP_H #define _LINUX_IGMP_H #include <linux/skbuff.h> #include <linux/timer.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/refcount.h> #include <linux/sockptr.h> #include <uapi/linux/igmp.h> static inline struct igmphdr *igmp_hdr(const struct sk_buff *skb) { return (struct igmphdr *)skb_transport_header(skb); } static inline struct igmpv3_report * igmpv3_report_hdr(const struct sk_buff *skb) { return (struct igmpv3_report *)skb_transport_header(skb); } static inline struct igmpv3_query * igmpv3_query_hdr(const struct sk_buff *skb) { return (struct igmpv3_query *)skb_transport_header(skb); } struct ip_sf_socklist { unsigned int sl_max; unsigned int sl_count; struct rcu_head rcu; __be32 sl_addr[] __counted_by(sl_max); }; #define IP_SFBLOCK 10 /* allocate this many at once */ /* ip_mc_socklist is real list now. Speed is not argument; this list never used in fast path code */ struct ip_mc_socklist { struct ip_mc_socklist __rcu *next_rcu; struct ip_mreqn multi; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ip_sf_socklist __rcu *sflist; struct rcu_head rcu; }; struct ip_sf_list { struct ip_sf_list *sf_next; unsigned long sf_count[2]; /* include/exclude counts */ __be32 sf_inaddr; unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans. left to send */ }; struct ip_mc_list { struct in_device *interface; __be32 multiaddr; unsigned int sfmode; struct ip_sf_list *sources; struct ip_sf_list *tomb; unsigned long sfcount[2]; union { struct ip_mc_list *next; struct ip_mc_list __rcu *next_rcu; }; struct ip_mc_list __rcu *next_hash; struct timer_list timer; int users; refcount_t refcnt; spinlock_t lock; char tm_running; char reporter; char unsolicit_count; char loaded; unsigned char gsquery; /* check source marks? */ unsigned char crcount; struct rcu_head rcu; }; /* V3 exponential field decoding */ #define IGMPV3_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value)) #define IGMPV3_EXP(thresh, nbmant, nbexp, value) \ ((value) < (thresh) ? (value) : \ ((IGMPV3_MASK(value, nbmant) | (1<<(nbmant))) << \ (IGMPV3_MASK((value) >> (nbmant), nbexp) + (nbexp)))) #define IGMPV3_QQIC(value) IGMPV3_EXP(0x80, 4, 3, value) #define IGMPV3_MRC(value) IGMPV3_EXP(0x80, 4, 3, value) static inline int ip_mc_may_pull(struct sk_buff *skb, unsigned int len) { if (skb_transport_offset(skb) + ip_transport_len(skb) < len) return 0; return pskb_may_pull(skb, len); } extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u8 proto); extern int igmp_rcv(struct sk_buff *); extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); extern int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, unsigned int mode); extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); extern void ip_mc_drop_socket(struct sock *sk); extern int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex); extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex); extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, sockptr_t optval, sockptr_t optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t offset); extern int ip_mc_sf_allow(const struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); extern void ip_mc_destroy_dev(struct in_device *); extern void ip_mc_up(struct in_device *); extern void ip_mc_down(struct in_device *); extern void ip_mc_unmap(struct in_device *); extern void ip_mc_remap(struct in_device *); extern void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp); static inline void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) { return __ip_mc_dec_group(in_dev, addr, GFP_KERNEL); } extern void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp); extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr); int ip_mc_check_igmp(struct sk_buff *skb); #endif |
54 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "timers.h" #include "device.h" #include "peer.h" #include "queueing.h" #include "socket.h" /* * - Timer for retransmitting the handshake if we don't hear back after * `REKEY_TIMEOUT + jitter` ms. * * - Timer for sending empty packet if we have received a packet but after have * not sent one for `KEEPALIVE_TIMEOUT` ms. * * - Timer for initiating new handshake if we have sent a packet but after have * not received one (even empty) for `(KEEPALIVE_TIMEOUT + REKEY_TIMEOUT) + * jitter` ms. * * - Timer for zeroing out all ephemeral keys after `(REJECT_AFTER_TIME * 3)` ms * if no new keys have been received. * * - Timer for, if enabled, sending an empty authenticated packet every user- * specified seconds. */ static inline void mod_peer_timer(struct wg_peer *peer, struct timer_list *timer, unsigned long expires) { rcu_read_lock_bh(); if (likely(netif_running(peer->device->dev) && !READ_ONCE(peer->is_dead))) mod_timer(timer, expires); rcu_read_unlock_bh(); } static void wg_expired_retransmit_handshake(struct timer_list *timer) { struct wg_peer *peer = from_timer(peer, timer, timer_retransmit_handshake); if (peer->timer_handshake_attempts > MAX_TIMER_HANDSHAKES) { pr_debug("%s: Handshake for peer %llu (%pISpfsc) did not complete after %d attempts, giving up\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)MAX_TIMER_HANDSHAKES + 2); del_timer(&peer->timer_send_keepalive); /* We drop all packets without a keypair and don't try again, * if we try unsuccessfully for too long to make a handshake. */ wg_packet_purge_staged_packets(peer); /* We set a timer for destroying any residue that might be left * of a partial exchange. */ if (!timer_pending(&peer->timer_zero_key_material)) mod_peer_timer(peer, &peer->timer_zero_key_material, jiffies + REJECT_AFTER_TIME * 3 * HZ); } else { ++peer->timer_handshake_attempts; pr_debug("%s: Handshake for peer %llu (%pISpfsc) did not complete after %d seconds, retrying (try %d)\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)REKEY_TIMEOUT, peer->timer_handshake_attempts + 1); /* We clear the endpoint address src address, in case this is * the cause of trouble. */ wg_socket_clear_peer_endpoint_src(peer); wg_packet_send_queued_handshake_initiation(peer, true); } } static void wg_expired_send_keepalive(struct timer_list *timer) { struct wg_peer *peer = from_timer(peer, timer, timer_send_keepalive); wg_packet_send_keepalive(peer); if (peer->timer_need_another_keepalive) { peer->timer_need_another_keepalive = false; mod_peer_timer(peer, &peer->timer_send_keepalive, jiffies + KEEPALIVE_TIMEOUT * HZ); } } static void wg_expired_new_handshake(struct timer_list *timer) { struct wg_peer *peer = from_timer(peer, timer, timer_new_handshake); pr_debug("%s: Retrying handshake with peer %llu (%pISpfsc) because we stopped hearing back after %d seconds\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)(KEEPALIVE_TIMEOUT + REKEY_TIMEOUT)); /* We clear the endpoint address src address, in case this is the cause * of trouble. */ wg_socket_clear_peer_endpoint_src(peer); wg_packet_send_queued_handshake_initiation(peer, false); } static void wg_expired_zero_key_material(struct timer_list *timer) { struct wg_peer *peer = from_timer(peer, timer, timer_zero_key_material); rcu_read_lock_bh(); if (!READ_ONCE(peer->is_dead)) { wg_peer_get(peer); if (!queue_work(peer->device->handshake_send_wq, &peer->clear_peer_work)) /* If the work was already on the queue, we want to drop * the extra reference. */ wg_peer_put(peer); } rcu_read_unlock_bh(); } static void wg_queued_expired_zero_key_material(struct work_struct *work) { struct wg_peer *peer = container_of(work, struct wg_peer, clear_peer_work); pr_debug("%s: Zeroing out all keys for peer %llu (%pISpfsc), since we haven't received a new one in %d seconds\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)REJECT_AFTER_TIME * 3); wg_noise_handshake_clear(&peer->handshake); wg_noise_keypairs_clear(&peer->keypairs); wg_peer_put(peer); } static void wg_expired_send_persistent_keepalive(struct timer_list *timer) { struct wg_peer *peer = from_timer(peer, timer, timer_persistent_keepalive); if (likely(peer->persistent_keepalive_interval)) wg_packet_send_keepalive(peer); } /* Should be called after an authenticated data packet is sent. */ void wg_timers_data_sent(struct wg_peer *peer) { if (!timer_pending(&peer->timer_new_handshake)) mod_peer_timer(peer, &peer->timer_new_handshake, jiffies + (KEEPALIVE_TIMEOUT + REKEY_TIMEOUT) * HZ + get_random_u32_below(REKEY_TIMEOUT_JITTER_MAX_JIFFIES)); } /* Should be called after an authenticated data packet is received. */ void wg_timers_data_received(struct wg_peer *peer) { if (likely(netif_running(peer->device->dev))) { if (!timer_pending(&peer->timer_send_keepalive)) mod_peer_timer(peer, &peer->timer_send_keepalive, jiffies + KEEPALIVE_TIMEOUT * HZ); else peer->timer_need_another_keepalive = true; } } /* Should be called after any type of authenticated packet is sent, whether * keepalive, data, or handshake. */ void wg_timers_any_authenticated_packet_sent(struct wg_peer *peer) { del_timer(&peer->timer_send_keepalive); } /* Should be called after any type of authenticated packet is received, whether * keepalive, data, or handshake. */ void wg_timers_any_authenticated_packet_received(struct wg_peer *peer) { del_timer(&peer->timer_new_handshake); } /* Should be called after a handshake initiation message is sent. */ void wg_timers_handshake_initiated(struct wg_peer *peer) { mod_peer_timer(peer, &peer->timer_retransmit_handshake, jiffies + REKEY_TIMEOUT * HZ + get_random_u32_below(REKEY_TIMEOUT_JITTER_MAX_JIFFIES)); } /* Should be called after a handshake response message is received and processed * or when getting key confirmation via the first data message. */ void wg_timers_handshake_complete(struct wg_peer *peer) { del_timer(&peer->timer_retransmit_handshake); peer->timer_handshake_attempts = 0; peer->sent_lastminute_handshake = false; ktime_get_real_ts64(&peer->walltime_last_handshake); } /* Should be called after an ephemeral key is created, which is before sending a * handshake response or after receiving a handshake response. */ void wg_timers_session_derived(struct wg_peer *peer) { mod_peer_timer(peer, &peer->timer_zero_key_material, jiffies + REJECT_AFTER_TIME * 3 * HZ); } /* Should be called before a packet with authentication, whether * keepalive, data, or handshakem is sent, or after one is received. */ void wg_timers_any_authenticated_packet_traversal(struct wg_peer *peer) { if (peer->persistent_keepalive_interval) mod_peer_timer(peer, &peer->timer_persistent_keepalive, jiffies + peer->persistent_keepalive_interval * HZ); } void wg_timers_init(struct wg_peer *peer) { timer_setup(&peer->timer_retransmit_handshake, wg_expired_retransmit_handshake, 0); timer_setup(&peer->timer_send_keepalive, wg_expired_send_keepalive, 0); timer_setup(&peer->timer_new_handshake, wg_expired_new_handshake, 0); timer_setup(&peer->timer_zero_key_material, wg_expired_zero_key_material, 0); timer_setup(&peer->timer_persistent_keepalive, wg_expired_send_persistent_keepalive, 0); INIT_WORK(&peer->clear_peer_work, wg_queued_expired_zero_key_material); peer->timer_handshake_attempts = 0; peer->sent_lastminute_handshake = false; peer->timer_need_another_keepalive = false; } void wg_timers_stop(struct wg_peer *peer) { timer_delete_sync(&peer->timer_retransmit_handshake); timer_delete_sync(&peer->timer_send_keepalive); timer_delete_sync(&peer->timer_new_handshake); timer_delete_sync(&peer->timer_zero_key_material); timer_delete_sync(&peer->timer_persistent_keepalive); flush_work(&peer->clear_peer_work); } |
1962 3120 25 3121 3121 3121 3119 3031 3028 26 1095 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLK_CGROUP_PRIVATE_H #define _BLK_CGROUP_PRIVATE_H /* * block cgroup private header * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> */ #include <linux/blk-cgroup.h> #include <linux/cgroup.h> #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> struct blkcg_gq; struct blkg_policy_data; /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { BLKG_IOSTAT_READ, BLKG_IOSTAT_WRITE, BLKG_IOSTAT_DISCARD, BLKG_IOSTAT_NR, }; struct blkg_iostat { u64 bytes[BLKG_IOSTAT_NR]; u64 ios[BLKG_IOSTAT_NR]; }; struct blkg_iostat_set { struct u64_stats_sync sync; struct blkcg_gq *blkg; struct llist_node lnode; int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ struct request_queue *q; struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; /* reference count */ struct percpu_ref refcnt; /* is this blkg online? protected by both blkcg and q locks */ bool online; struct blkg_iostat_set __percpu *iostat_cpu; struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spinlock_t async_bio_lock; struct bio_list async_bios; #endif union { struct work_struct async_bio_work; struct work_struct free_work; }; atomic_t use_delay; atomic64_t delay_nsec; atomic64_t delay_start; u64 last_delay; int last_use; struct rcu_head rcu_head; }; struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; refcount_t online_pin; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; /* * List of updated percpu blkg_iostat_set's since the last flush. */ struct llist_head __percpu *lhead; #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; #endif }; static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * * There can be multiple active blkcg policies and each blkg:policy pair is * represented by a blkg_policy_data which is allocated and freed by each * policy's pd_alloc/free_fn() methods. A policy can allocate private data * area by allocating larger data structure which embeds blkg_policy_data * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; bool online; }; /* * Policies that need to keep per-blkcg data which is independent from any * request_queue associated to it should implement cpd_alloc/free_fn() * methods. A policy can allocate private data area by allocating larger * data structure which embeds blkcg_policy_data at the beginning. * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ struct blkcg *blkcg; int plid; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { int plid; /* cgroup files for the policy */ struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol); const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { char *input; char *body; struct block_device *bdev; struct blkcg_gq *blkg; }; void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning * blkg. The idea is we do bio_blkcg_css() to look up the actual context for * the bio and attach the appropriate blkg to the bio. Then we call this helper * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. */ static inline bool bio_issue_as_root_blkg(struct bio *bio) { return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; } /** * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. * Must be called in a RCU critical section. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; if (blkcg == &blkcg_root) return q->root_blkg; blkg = rcu_dereference_check(blkcg->blkg_hint, lockdep_is_held(&q->queue_lock)); if (blkg && blkg->q == q) return blkg; blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q != q) blkg = NULL; return blkg; } /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest * @pol: policy of interest * * Return pointer to private data associated with the @blkg-@pol pair. */ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return blkg ? blkg->pd[pol->plid] : NULL; } static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** * pdata_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. */ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return pd ? pd->blkg : NULL; } static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) { return cpd ? cpd->blkcg : NULL; } /** * blkg_path - format cgroup path of blkg * @blkg: blkg of interest * @buf: target buffer * @buflen: target buffer length * * Format the path of the cgroup of @blkg into @buf. */ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) { return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); } /** * blkg_get - get a blkg reference * @blkg: blkg to get * * The caller should be holding an existing reference. */ static inline void blkg_get(struct blkcg_gq *blkg) { percpu_ref_get(&blkg->refcnt); } /** * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ static inline bool blkg_tryget(struct blkcg_gq *blkg) { return blkg && percpu_ref_tryget(&blkg->refcnt); } /** * blkg_put - put a blkg reference * @blkg: blkg to put */ static inline void blkg_put(struct blkcg_gq *blkg) { percpu_ref_put(&blkg->refcnt); } /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. @p_blkg is * included in the iteration and the last node to be visited. */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) static inline void blkcg_bio_issue_init(struct bio *bio) { bio_issue_init(&bio->bi_issue, bio_sectors(bio)); } static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); } static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); if (WARN_ON_ONCE(old < 0)) return 0; if (old == 0) return 0; /* * We do this song and dance because we can race with somebody else * adding or removing delay. If we just did an atomic_dec we'd end up * negative and we'd already be in trouble. We need to subtract 1 and * then check to see if we were the last delay so we can drop the * congestion count on the cgroup. */ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1)) ; if (old == 0) return 0; if (old == 1) atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); return 1; } /** * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount * @blkg: target blkg * @delay: delay duration in nsecs * * When enabled with this function, the delay is not decayed and must be * explicitly cleared with blkcg_clear_delay(). Must not be mixed with * blkcg_[un]use_delay() and blkcg_add_delay() usages. */ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person setting the congestion count for this blkg. */ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); atomic64_set(&blkg->delay_nsec, delay); } /** * blkcg_clear_delay - Disable allocator delay mechanism * @blkg: target blkg * * Disable use_delay mechanism. See blkcg_set_delay(). */ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person clearing the congestion count for this blkg. */ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); } /** * blk_cgroup_mergeable - Determine whether to allow or disallow merges * @rq: request to merge into * @bio: bio to merge * * @bio and @rq should belong to the same cgroup and their issue_as_root should * match. The latter is necessary as we don't want to throttle e.g. a metadata * update because it happens to be next to a regular IO. */ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return rq->bio->bi_blkg == bio->bi_blkg && bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ struct blkg_policy_data { }; struct blkcg_policy_data { }; struct blkcg_policy { }; struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { return 0; } static inline void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */ |
2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | /* * include/net/tipc.h: Include file for TIPC message header routines * * Copyright (c) 2017 Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_HDR_H #define _TIPC_HDR_H #include <linux/random.h> #define KEEPALIVE_MSG_MASK 0x0e080000 /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */ struct tipc_basic_hdr { __be32 w[4]; }; static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr) { u32 w0 = ntohl(hdr->w[0]); bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK; __be32 key; /* Return source node identity as key */ if (likely(!keepalive_msg)) return hdr->w[3]; /* Spread PROBE/PROBE_REPLY messages across the cores */ get_random_bytes(&key, sizeof(key)); return key; } #endif |
10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Internal header to deal with irq_desc->status which will be renamed * to irq_desc->settings. */ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, _IRQ_PER_CPU = IRQ_PER_CPU, _IRQ_LEVEL = IRQ_LEVEL, _IRQ_NOPROBE = IRQ_NOPROBE, _IRQ_NOREQUEST = IRQ_NOREQUEST, _IRQ_NOTHREAD = IRQ_NOTHREAD, _IRQ_NOAUTOEN = IRQ_NOAUTOEN, _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, _IRQ_IS_POLLED = IRQ_IS_POLLED, _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, _IRQ_HIDDEN = IRQ_HIDDEN, _IRQ_NO_DEBUG = IRQ_NO_DEBUG, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; #define IRQ_PER_CPU GOT_YOU_MORON #define IRQ_NO_BALANCING GOT_YOU_MORON #define IRQ_LEVEL GOT_YOU_MORON #define IRQ_NOPROBE GOT_YOU_MORON #define IRQ_NOREQUEST GOT_YOU_MORON #define IRQ_NOTHREAD GOT_YOU_MORON #define IRQ_NOAUTOEN GOT_YOU_MORON #define IRQ_NESTED_THREAD GOT_YOU_MORON #define IRQ_PER_CPU_DEVID GOT_YOU_MORON #define IRQ_IS_POLLED GOT_YOU_MORON #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON #define IRQ_HIDDEN GOT_YOU_MORON #define IRQ_NO_DEBUG GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON static inline void irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) { desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK); desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); } static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_PER_CPU; } static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_PER_CPU_DEVID; } static inline void irq_settings_set_per_cpu(struct irq_desc *desc) { desc->status_use_accessors |= _IRQ_PER_CPU; } static inline void irq_settings_set_no_balancing(struct irq_desc *desc) { desc->status_use_accessors |= _IRQ_NO_BALANCING; } static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_NO_BALANCING; } static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) { return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; } static inline void irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) { desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK; desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK; } static inline bool irq_settings_is_level(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_LEVEL; } static inline void irq_settings_clr_level(struct irq_desc *desc) { desc->status_use_accessors &= ~_IRQ_LEVEL; } static inline void irq_settings_set_level(struct irq_desc *desc) { desc->status_use_accessors |= _IRQ_LEVEL; } static inline bool irq_settings_can_request(struct irq_desc *desc) { return !(desc->status_use_accessors & _IRQ_NOREQUEST); } static inline void irq_settings_clr_norequest(struct irq_desc *desc) { desc->status_use_accessors &= ~_IRQ_NOREQUEST; } static inline void irq_settings_set_norequest(struct irq_desc *desc) { desc->status_use_accessors |= _IRQ_NOREQUEST; } static inline bool irq_settings_can_thread(struct irq_desc *desc) { return !(desc->status_use_accessors & _IRQ_NOTHREAD); } static inline void irq_settings_clr_nothread(struct irq_desc *desc) { desc->status_use_accessors &= ~_IRQ_NOTHREAD; } static inline void irq_settings_set_nothread(struct irq_desc *desc) { desc->status_use_accessors |= _IRQ_NOTHREAD; } static inline bool irq_settings_can_probe(struct irq_desc *desc) { return !(desc->status_use_accessors & _IRQ_NOPROBE); } static inline void irq_settings_clr_noprobe(struct irq_desc *desc) { desc->status_use_accessors &= ~_IRQ_NOPROBE; } static inline void irq_settings_set_noprobe(struct irq_desc *desc) { desc->status_use_accessors |= _IRQ_NOPROBE; } static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_MOVE_PCNTXT; } static inline bool irq_settings_can_autoenable(struct irq_desc *desc) { return !(desc->status_use_accessors & _IRQ_NOAUTOEN); } static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_NESTED_THREAD; } static inline bool irq_settings_is_polled(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_IS_POLLED; } static inline bool irq_settings_disable_unlazy(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_DISABLE_UNLAZY; } static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc) { desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY; } static inline bool irq_settings_is_hidden(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_HIDDEN; } static inline void irq_settings_set_no_debug(struct irq_desc *desc) { desc->status_use_accessors |= _IRQ_NO_DEBUG; } static inline bool irq_settings_no_debug(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_NO_DEBUG; } |
11 12 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 15 12 12 12 3 2 15 3 3 17 16 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <net/sock.h> #include <linux/netlink.h> #include <linux/sock_diag.h> #include <linux/netlink_diag.h> #include <linux/rhashtable.h> #include "af_netlink.h" static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->groups == NULL) return 0; return nla_put(nlskb, NETLINK_DIAG_GROUPS, NLGRPSZ(nlk->ngroups), nlk->groups); } static int sk_diag_put_flags(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); u32 flags = 0; if (nlk->cb_running) flags |= NDIAG_FLAG_CB_RUNNING; if (nlk_test_bit(RECV_PKTINFO, sk)) flags |= NDIAG_FLAG_PKTINFO; if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) flags |= NDIAG_FLAG_BROADCAST_ERROR; if (nlk_test_bit(RECV_NO_ENOBUFS, sk)) flags |= NDIAG_FLAG_NO_ENOBUFS; if (nlk_test_bit(LISTEN_ALL_NSID, sk)) flags |= NDIAG_FLAG_LISTEN_ALL_NSID; if (nlk_test_bit(CAP_ACK, sk)) flags |= NDIAG_FLAG_CAP_ACK; return nla_put_u32(skb, NETLINK_DIAG_FLAGS, flags); } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_diag_req *req, u32 portid, u32 seq, u32 flags, int sk_ino) { struct nlmsghdr *nlh; struct netlink_diag_msg *rep; struct netlink_sock *nlk = nlk_sk(sk); nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), flags); if (!nlh) return -EMSGSIZE; rep = nlmsg_data(nlh); rep->ndiag_family = AF_NETLINK; rep->ndiag_type = sk->sk_type; rep->ndiag_protocol = sk->sk_protocol; rep->ndiag_state = sk->sk_state; rep->ndiag_ino = sk_ino; rep->ndiag_portid = nlk->portid; rep->ndiag_dst_portid = nlk->dst_portid; rep->ndiag_dst_group = nlk->dst_group; sock_diag_save_cookie(sk, rep->ndiag_cookie); if ((req->ndiag_show & NDIAG_SHOW_GROUPS) && sk_diag_dump_groups(sk, skb)) goto out_nlmsg_trim; if ((req->ndiag_show & NDIAG_SHOW_MEMINFO) && sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) goto out_nlmsg_trim; if ((req->ndiag_show & NDIAG_SHOW_FLAGS) && sk_diag_put_flags(sk, skb)) goto out_nlmsg_trim; nlmsg_end(skb, nlh); return 0; out_nlmsg_trim: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, int protocol, int s_num) { struct rhashtable_iter *hti = (void *)cb->args[2]; struct netlink_table *tbl = &nl_table[protocol]; struct net *net = sock_net(skb->sk); struct netlink_diag_req *req; struct netlink_sock *nlsk; unsigned long flags; struct sock *sk; int num = 2; int ret = 0; req = nlmsg_data(cb->nlh); if (s_num > 1) goto mc_list; num--; if (!hti) { hti = kmalloc(sizeof(*hti), GFP_KERNEL); if (!hti) return -ENOMEM; cb->args[2] = (long)hti; } if (!s_num) rhashtable_walk_enter(&tbl->hash, hti); rhashtable_walk_start(hti); while ((nlsk = rhashtable_walk_next(hti))) { if (IS_ERR(nlsk)) { ret = PTR_ERR(nlsk); if (ret == -EAGAIN) { ret = 0; continue; } break; } sk = (struct sock *)nlsk; if (!net_eq(sock_net(sk), net)) continue; if (sk_diag_fill(sk, skb, req, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, sock_i_ino(sk)) < 0) { ret = 1; break; } } rhashtable_walk_stop(hti); if (ret) goto done; rhashtable_walk_exit(hti); num++; mc_list: read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &tbl->mc_list) { if (sk_hashed(sk)) continue; if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) { num++; continue; } if (sk_diag_fill(sk, skb, req, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, __sock_i_ino(sk)) < 0) { ret = 1; break; } num++; } read_unlock_irqrestore(&nl_table_lock, flags); done: cb->args[0] = num; return ret; } static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_diag_req *req; int s_num = cb->args[0]; int err = 0; req = nlmsg_data(cb->nlh); if (req->sdiag_protocol == NDIAG_PROTO_ALL) { int i; for (i = cb->args[1]; i < MAX_LINKS; i++) { err = __netlink_diag_dump(skb, cb, i, s_num); if (err) break; s_num = 0; } cb->args[1] = i; } else { if (req->sdiag_protocol >= MAX_LINKS) return -ENOENT; err = __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); } return err < 0 ? err : skb->len; } static int netlink_diag_dump_done(struct netlink_callback *cb) { struct rhashtable_iter *hti = (void *)cb->args[2]; if (cb->args[0] == 1) rhashtable_walk_exit(hti); kfree(hti); return 0; } static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct netlink_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = netlink_diag_dump, .done = netlink_diag_dump_done, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } else return -EOPNOTSUPP; } static const struct sock_diag_handler netlink_diag_handler = { .family = AF_NETLINK, .dump = netlink_diag_handler_dump, }; static int __init netlink_diag_init(void) { return sock_diag_register(&netlink_diag_handler); } static void __exit netlink_diag_exit(void) { sock_diag_unregister(&netlink_diag_handler); } module_init(netlink_diag_init); module_exit(netlink_diag_exit); MODULE_DESCRIPTION("Netlink-based socket monitoring/diagnostic interface (sock_diag)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 16 /* AF_NETLINK */); |
5617 5617 5617 5618 30 30 29 1 5617 5617 5617 5617 396 371 58 58 58 58 97 97 97 38 59 58 58 58 58 58 58 35 58 58 58 58 58 58 58 58 37 95 1721 1196 1197 1197 1152 47 1197 24 4874 4874 4875 4718 4944 2 4944 4944 4944 3365 1799 1800 4934 2129 2130 1456 795 2129 353 353 206 206 353 1860 3447 3392 4944 215 215 2750 2122 766 764 2732 2732 2731 971 2732 1404 2732 117 2726 2721 2721 1596 1522 1522 1522 2726 2726 581 581 581 369 369 369 365 4 3 308 4 4 2405 2405 2405 75 2404 2405 3 2405 2405 75 75 75 6 75 2404 2195 2195 2330 2330 2330 2330 2330 2330 1447 1445 1447 1447 1423 1446 1446 1446 1446 1446 1422 1424 6 1412 210 194 24 1446 24 10 1446 13 1446 4 4 4 4 4 4 4 4 4 4 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 1404 1404 1404 12 1404 89 1641 89 89 89 1274 1269 1223 1223 1223 1220 1220 1219 1220 1220 1403 1404 1422 986 233 10 1375 1085 305 1374 1404 10 10 10 10 12 1424 1424 1424 230 1424 1198 1204 1204 764 764 1204 1199 1199 270 1231 1231 1173 970 970 970 970 1270 1269 1268 1233 1233 1215 1215 292 292 1269 1140 1149 1148 1149 1149 1149 1150 1150 1150 1150 1150 1148 1149 1058 1171 1171 1165 1171 1150 946 946 1150 1170 1171 2 2 2 2 1149 1058 1057 1043 1028 1043 1043 1199 1291 1291 1291 1291 1044 1280 222 222 1280 1280 25 1270 137 1270 1270 1270 1270 1270 1016 249 1274 1319 1319 1318 1318 1097 1318 1318 1318 1318 1318 1096 1306 1291 1257 1206 1199 1191 1191 1186 1087 1087 2 1073 2 2 1073 1118 192 1103 279 8 1104 1319 1319 1104 1311 31 1447 1447 1447 1447 1447 4 1446 1447 1447 12 12 12 12 1446 1389 1446 1389 1446 1446 54 1446 233 231 1446 1446 3 1445 1445 433 222 112 112 615 615 399 939 939 940 1492 1438 1492 212 1470 1491 1369 1492 1491 1492 1492 669 1492 41 41 1482 1482 1481 61 1526 31 4 1523 1526 1386 1527 1527 1526 1527 1526 20 1580 1578 1579 1580 1580 98 75 14 1396 1451 14 14 14 1275 1275 61 3 3 3 62 62 62 62 62 57 62 62 3 2165 2165 2165 2165 96 96 96 57 96 59 32 74 96 96 8 96 28 28 83 83 3 82 2 8 79 83 60 1201 85 85 37 2 1257 1257 1257 14 34 21 34 34 34 19 34 34 33 33 22 34 34 34 33 21 12 33 34 34 34 34 34 36 3721 3721 2144 2144 1201 1201 1201 28 1012 1199 1199 1199 1007 311 1186 1186 1186 1185 1185 1183 5617 5617 5617 5617 5616 1554 5617 5617 5617 5617 5617 5617 5616 5617 5617 5617 5617 5616 5617 5617 5617 5616 5617 5616 5618 5618 5616 5 5 5 5 5 5 5 5 5617 5618 30 30 5616 5617 2234 2234 2163 2388 2388 3 2388 12 2388 5 2388 2388 2387 2388 2388 2388 30 29 29 29 30 95 95 95 95 95 9 9 9 9 95 95 95 95 95 83 30 30 30 30 30 30 30 30 29 30 1 1 30 30 30 1 29 30 30 30 30 7 29 6 30 30 30 1 30 30 30 30 30 30 30 30 30 30 1 29 29 30 29 1 1 1 1 30 30 30 2211 5617 5617 5616 5617 5617 5617 28 28 28 28 28 375 1176 1176 1176 1121 1176 1176 1176 283 1176 1176 376 376 1176 1176 376 375 375 375 375 375 375 1176 1143 1126 1126 1126 314 314 1121 1585 1585 1164 1164 1164 1 1164 1 1164 1164 1 1164 2 1164 1164 1164 1164 1164 811 2820 3335 3335 1196 3335 3335 1321 1321 1960 5615 5617 5615 5616 5617 5616 5617 5608 5607 5608 5608 5608 4274 4270 22 22 22 18 22 22 22 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/inode.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) * * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 */ #include <linux/fs.h> #include <linux/mount.h> #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/kernel.h> #include <linux/printk.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/iomap.h> #include <linux/iversion.h> #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" #include <trace/events/ext4.h> static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); csum = ext4_chksum(sbi, csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } return csum; } static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 provided, calculated; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); calculated = ext4_inode_csum(inode, raw, ei); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 csum; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) raw->i_checksum_hi = cpu_to_le16(csum >> 16); } static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { trace_ext4_begin_ordered_truncate(inode, new_size); /* * If jinode is zero, then we never opened the file for * writing, so there's no need to call * jbd2_journal_begin_ordered_truncate() since there's no * outstanding writes we need to flush. */ if (!EXT4_I(inode)->jinode) return 0; return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode, new_size); } static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; if (ext4_has_inline_data(inode)) return 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } /* * Called at the last iput() if i_nlink is zero. */ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; /* * Credits for final inode cleanup and freeing: * sb + inode (ext4_orphan_del()), block bitmap, group descriptor * (xattr block freeing), bitmap, group descriptor (inode freeing) */ int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; bool freeze_protected = false; trace_ext4_evict_inode(inode); if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { truncate_inode_pages_final(&inode->i_data); goto no_delete; } if (is_bad_inode(inode)) goto no_delete; dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); /* * For inodes with journalled data, transaction commit could have * dirtied the inode. And for inodes with dioread_nolock, unwritten * extents converting worker could merge extents and also have dirtied * the inode. Flush worker is ignoring it because of I_FREEING flag but * we still need to remove the inode from the writeback lists. */ if (!list_empty_careful(&inode->i_io_list)) inode_io_list_del(inode); /* * Protect us against freezing - iput() caller didn't have to have any * protection against it. When we are in a running transaction though, * we are already protected against freezing and we cannot grab further * protection due to lock ordering constraints. */ if (!ext4_journal_current_handle()) { sb_start_intwrite(inode->i_sb); freeze_protected = true; } if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); /* * Block bitmap, group descriptor, and inode are accounted in both * ext4_blocks_for_truncate() and extra_credits. So subtract 3. */ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, ext4_blocks_for_truncate(inode) + extra_credits - 3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* * If we're going to skip the normal cleanup, we still need to * make sure that the in-core orphan linked list is properly * cleaned up. */ ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); goto no_delete; } if (IS_SYNC(inode)) ext4_handle_sync(handle); /* * Set inode->i_size to 0 before calling ext4_truncate(). We need * special handling of symlinks here because i_size is used to * determine whether ext4_inode_info->i_data contains symlink data or * block mappings. Setting i_size to 0 will remove its fast symlink * status. Erase i_data so that it becomes a valid empty block map. */ if (ext4_inode_is_fast_symlink(inode)) memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_warning(inode->i_sb, "couldn't mark inode dirty (err %d)", err); goto stop_handle; } if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { ext4_error_err(inode->i_sb, -err, "couldn't truncate inode %lu (err %d)", inode->i_ino, err); goto stop_handle; } } /* Remove xattr references. */ err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, extra_credits); if (err) { ext4_warning(inode->i_sb, "xattr delete (err %d)", err); stop_handle: ext4_journal_stop(handle); ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); goto no_delete; } /* * Kill off the orphan record which ext4_truncate created. * AKPM: I think this can be inside the above `if'. * Note that ext4_orphan_del() has to be able to cope with the * deletion of a non-existent orphan - this is because we don't * know if ext4_truncate() actually created an orphan record. * (Well, we could do this if we need to, but heck - it works) */ ext4_orphan_del(handle, inode); EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds(); /* * One subtle ordering requirement: if anything has gone wrong * (transaction abort, IO errors, whatever), then we can still * do these next steps (the fs will already have been marked as * having errors), but we can't free the inode if the mark_dirty * fails. */ if (ext4_mark_inode_dirty(handle, inode)) /* If that failed, just do the required in-core inode clear. */ ext4_clear_inode(inode); else ext4_free_inode(handle, inode); ext4_journal_stop(handle); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: /* * Check out some where else accidentally dirty the evicting inode, * which may probably cause inode use-after-free issues later. */ WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); if (!list_empty(&EXT4_I(inode)->i_fc_list)) ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { return &EXT4_I(inode)->i_reserved_quota; } #endif /* * Called with i_data_sem down, which is important since we can call * ext4_discard_preallocations() from here. */ void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_warning(inode->i_sb, "%s: ino %lu, used %d " "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); WARN_ON(1); used = ei->i_reserved_data_blocks; } /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); spin_unlock(&ei->i_block_reservation_lock); /* Update quota subsystem for data blocks */ if (quota_claim) dquot_claim_block(inode, EXT4_C2B(sbi, used)); else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should * not re-claim the quota for fallocated blocks. */ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } /* * If we have done all the pending block allocations and if * there aren't any writers on the inode, we can discard the * inode's preallocations. */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) ext4_discard_preallocations(inode, 0); } static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { if (ext4_has_feature_journal(inode->i_sb) && (inode->i_ino == le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) return 0; if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, map->m_pblk, map->m_len); return -EFSCORRUPTED; } return 0; } int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len) { int ret; if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) ret = 0; return ret; } #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) #ifdef ES_AGGRESSIVE_TEST static void ext4_map_blocks_es_recheck(handle_t *handle, struct inode *inode, struct ext4_map_blocks *es_map, struct ext4_map_blocks *map, int flags) { int retval; map->m_flags = 0; /* * There is a race window that the result is not the same. * e.g. xfstests #223 when dioread_nolock enables. The reason * is that we lookup a block mapping in extent status tree with * out taking i_data_sem. So at the time the unwritten extent * could be converted. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { retval = ext4_ind_map_blocks(handle, inode, map, 0); } up_read((&EXT4_I(inode)->i_data_sem)); /* * We don't check m_len because extent will be collpased in status * tree. So the m_len might not equal. */ if (es_map->m_lblk != map->m_lblk || es_map->m_flags != map->m_flags || es_map->m_pblk != map->m_pblk) { printk("ES cache assertion failed for inode: %lu " "es_cached ex [%d/%d/%llu/%x] != " "found ex [%d/%d/%llu/%x] retval %d flags %x\n", inode->i_ino, es_map->m_lblk, es_map->m_len, es_map->m_pblk, es_map->m_flags, map->m_lblk, map->m_len, map->m_pblk, map->m_flags, retval, flags); } } #endif /* ES_AGGRESSIVE_TEST */ /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. * * If file type is extents based, it will call ext4_ext_map_blocks(), * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocated. if * create==0 and the blocks are pre-allocated and unwritten, the resulting @map * is marked as unwritten. If the create == 1, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to * indicate the length of a hole starting at map->m_lblk. * * It returns the error in case of allocation failure. */ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct extent_status es; int retval; int ret = 0; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", flags, map->m_len, (unsigned long) map->m_lblk); /* * ext4_map_blocks returns an int, and m_len is an unsigned int */ if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; /* We can handle the block number less than EXT_MAX_BLOCKS */ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; map->m_flags |= ext4_es_is_written(&es) ? EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; retval = 0; } else { BUG(); } if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return retval; #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif goto found; } /* * In the query cache no-wait mode, nothing we can do more if we * cannot find extent in the cache. */ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return 0; /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { retval = ext4_ind_map_blocks(handle, inode, map, 0); } if (retval > 0) { unsigned int status; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); } up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; } /* If it is only a block(s) look up */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* * Returns if the blocks have already allocated * * Note that if blocks have been preallocated * ext4_ext_get_block() returns the create = 0 * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* * If we need to convert extent to unwritten * we continue and do the actual work in * ext4_ext_map_blocks() */ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; /* * Here we clear m_flags because after allocating an new extent, * it will be set again. */ map->m_flags &= ~EXT4_MAP_FLAGS; /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_block() * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); /* * We need to check for EXT4 here because migrate * could have changed the inode type in between */ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags); } else { retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { /* * We allocated new blocks which will result in * i_data's format changing. Force the migrate * to fail by clearing migrate flags */ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } } if (retval > 0) { unsigned int status; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and * use them before they are really zeroed. We also have to * unmap metadata before zeroing as otherwise writeback can * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { retval = ret; goto out_sem; } } /* * If the extent has been zeroed out, we don't need to update * extent status tree. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es)) goto out_sem; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); } out_sem: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; /* * Inodes with freshly allocated blocks where contents will be * visible after transaction commit must be on transaction's * ordered data list. */ if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { loff_t start_byte = (loff_t)map->m_lblk << inode->i_blkbits; loff_t length = (loff_t)map->m_len << inode->i_blkbits; if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, start_byte, length); else ret = ext4_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret) return ret; } } if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || map->m_flags & EXT4_MAP_MAPPED)) ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + map->m_len - 1); if (retval < 0) ext_debug(inode, "failed with err %d\n", retval); return retval; } /* * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages * we have to be careful as someone else may be manipulating b_state as well. */ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) { unsigned long old_state; unsigned long new_state; flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. */ if (!bh->b_page) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } /* * Someone else may be modifying b_state. Be careful! This is ugly but * once we get rid of using bh as a container for mapping information * to pass to / from get_block functions, this can go away. */ old_state = READ_ONCE(bh->b_state); do { new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { struct ext4_map_blocks map; int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, flags); if (ret > 0) { map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } else if (ret == 0) { /* hole case, need to fill in bh->b_size */ bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { return _ext4_get_block(inode, iblock, bh, create ? EXT4_GET_BLOCKS_CREATE : 0); } /* * Get block function used when preparing for buffered write if we require * creating an unwritten extent if blocks haven't been allocated. The extent * will be converted to written after the IO is complete. */ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); /* * If the buffer is marked unwritten, mark it as new to make sure it is * zeroed out correctly in case of partial writes. Otherwise, there is * a chance of stale data getting exposed. */ if (ret == 0 && buffer_unwritten(bh_result)) set_buffer_new(bh_result); return ret; } /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 /* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; int create = map_flags & EXT4_GET_BLOCKS_CREATE; bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; int err; ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || handle != NULL || create == 0); ASSERT(create == 0 || !nowait); map.m_lblk = block; map.m_len = 1; err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? ERR_PTR(-ENOSPC) : NULL; if (err < 0) return ERR_PTR(err); if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); bh = sb_getblk(inode->i_sb, map.m_pblk); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { ASSERT(create != 0); ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || (handle != NULL)); /* * Now that we do not always journal data, we should * keep in mind whether this should always journal the * new buffer as metadata. For now, regular file * writes use ext4_get_block instead, so it's not a * problem. */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) { unlock_buffer(bh); goto errout; } if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) goto errout; } else BUFFER_TRACE(bh, "not a new buffer"); return bh; errout: brelse(bh); return ERR_PTR(err); } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct buffer_head *bh; int ret; bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } /* Read a contiguous batch of blocks. */ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs) { int i, err; for (i = 0; i < bh_count; i++) { bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); if (IS_ERR(bhs[i])) { err = PTR_ERR(bhs[i]); bh_count = i; goto out_brelse; } } for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. */ if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false); if (!wait) return 0; for (i = 0; i < bh_count; i++) if (bhs[i]) wait_on_buffer(bhs[i]); for (i = 0; i < bh_count; i++) { if (bhs[i] && !buffer_uptodate(bhs[i])) { err = -EIO; goto out_brelse; } } return 0; out_brelse: for (i = 0; i < bh_count; i++) { brelse(bhs[i]); bhs[i] = NULL; } return err; } int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; unsigned blocksize = head->b_size; int err, ret = 0; struct buffer_head *next; for (bh = head, block_start = 0; ret == 0 && (bh != head || !block_start); block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (partial && !buffer_uptodate(bh)) *partial = 1; continue; } err = (*fn)(handle, inode, bh); if (!ret) ret = err; } return ret; } /* * Helper for handling dirtying of journalled data. We also mark the folio as * dirty so that writeback code knows about this page (and inode) contains * dirty data. ext4_writepages() then commits appropriate transaction to * make data stable. */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { folio_mark_dirty(bh->b_folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int dirty = buffer_dirty(bh); int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; /* * __block_write_begin() could have dirtied some buffers. Clean * the dirty bit as jbd2_journal_get_write_access() could complain * otherwise about fs integrity issues. Setting of the dirty bit * by __block_write_begin() isn't a real problem here as we clear * the bit before releasing a page lock and thus writeback cannot * ever write the buffer. */ if (dirty) clear_buffer_dirty(bh); BUFFER_TRACE(bh, "get write access"); ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (!ret && dirty) ret = ext4_dirty_journalled_data(handle, bh); return ret; } #ifdef CONFIG_FS_ENCRYPTION static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; unsigned blocksize = inode->i_sb->s_blocksize; unsigned bbits; struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); bbits = ilog2(blocksize); block = (sector_t)folio->index << (PAGE_SHIFT - bbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } continue; } if (buffer_new(bh)) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) folio_zero_segments(folio, to, block_end, block_start, from); continue; } } if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { ext4_read_bh_lock(bh, 0, false); wait[nr_wait++] = bh; } } /* * If we issued read requests, let them complete. */ for (i = 0; i < nr_wait; i++) { wait_on_buffer(wait[i]); if (!buffer_uptodate(wait[i])) err = -EIO; } if (unlikely(err)) { folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; err2 = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; } } } return err; } #endif /* * To preserve ordering, it is essential that the hole instantiation and * the data write be encapsulated in a single transaction. We cannot * close off a transaction and start a new one between the ext4_get_block() * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; handle_t *handle; int retries = 0; struct folio *folio; pgoff_t index; unsigned from, to; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; trace_ext4_write_begin(inode, pos, len); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_SHIFT; from = pos & (PAGE_SIZE - 1); to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, pagep); if (ret < 0) return ret; if (ret == 1) return 0; } /* * __filemap_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); /* * The same as page allocation, we prealloc buffer heads before * starting the handle. */ if (!folio_buffers(folio)) create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { folio_put(folio); return PTR_ERR(handle); } folio_lock(folio); if (folio->mapping != mapping) { /* The folio got truncated from under us */ folio_unlock(folio); folio_put(folio); ext4_journal_stop(handle); goto retry_grab; } /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(folio, pos, len, ext4_get_block_unwritten); else ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(&folio->page, pos, len, ext4_get_block_unwritten); else ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); folio_unlock(folio); /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * * Add inode to orphan list in case we crash before * truncate finishes */ if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; folio_put(folio); return ret; } *pagep = &folio->page; return ret; } /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); return ret; } /* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ static int ext4_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (ext4_has_inline_data(inode) && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. * * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree * blocks are being written past EOF, so skip the i_size update. */ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock * ordering of folio lock and transaction start for journaling * filesystems. */ if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * This is a private version of folio_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_dirty_journalled_data() instead. */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, struct folio *folio, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; bh = head = folio_buffers(folio); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { if (!folio_test_uptodate(folio)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; folio_zero_range(folio, start, size); write_end_fn(handle, inode, bh); } clear_buffer_new(bh); } } block_start = block_end; bh = bh->b_this_page; } while (bh != head); } static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int partial = 0; unsigned from, to; int size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); to = from + len; BUG_ON(!ext4_handle_valid(handle)); if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); } else { if (unlikely(copied < len)) ext4_journalled_zero_new_buffers(handle, inode, folio, from + copied, to); ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, from + copied, &partial, write_end_fn); if (!partial) folio_mark_uptodate(folio); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Reserve space for a single cluster */ static int ext4_da_reserve_space(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int ret; /* * We will charge metadata quota at writeout time; this saves * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); if (ext4_claim_free_clusters(sbi, 1, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } ei->i_reserved_data_blocks++; trace_ext4_da_reserve_space(inode); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ } void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); trace_ext4_da_release_space(inode, to_free); if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* * if there aren't enough reserved blocks, then the * counter is messed up somewhere. Since this * function is called from invalidate page, it's * harmless to return without any action. */ ext4_warning(inode->i_sb, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; } ei->i_reserved_data_blocks -= to_free; /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } /* * Delayed allocation stuff */ struct mpage_da_data { /* These are input fields for ext4_do_writepages() */ struct inode *inode; struct writeback_control *wbc; unsigned int can_map:1; /* Can writepages call map blocks? */ /* These are internal state of ext4_do_writepages() */ pgoff_t first_page; /* The first page to write */ pgoff_t next_page; /* Current page to examine */ pgoff_t last_page; /* Last page to examine */ /* * Extent to map - this can be after first_page because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; unsigned int journalled_more_data:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, bool invalidate) { unsigned nr, i; pgoff_t index, end; struct folio_batch fbatch; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; /* This is necessary when next_page == 0. */ if (mpd->first_page >= mpd->next_page) return; mpd->scanned_until_end = 0; index = mpd->first_page; end = mpd->next_page - 1; if (invalidate) { ext4_lblk_t start, last; start = index << (PAGE_SHIFT - inode->i_blkbits); last = end << (PAGE_SHIFT - inode->i_blkbits); /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); ext4_es_remove_extent(inode, start, last - start + 1); up_write(&EXT4_I(inode)->i_data_sem); } folio_batch_init(&fbatch); while (index <= end) { nr = filemap_get_folios(mapping, &index, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; if (folio->index < mpd->first_page) continue; if (folio_next_index(folio) - 1 > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (invalidate) { if (folio_mapped(folio)) folio_clear_dirty_for_io(folio); block_invalidate_folio(folio, 0, folio_size(folio)); folio_clear_uptodate(folio); } folio_unlock(folio); } folio_batch_release(&fbatch); } } static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", ei->i_reserved_data_blocks); return; } /* * ext4_insert_delayed_block - adds a delayed block to the extents status * tree, incrementing the reserved cluster/block * count or making a pending reservation * where needed * * @inode - file containing the newly added block * @lblk - logical block to be added * * Returns 0 on success, negative error code on failure. */ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool allocated = false; /* * If the cluster containing lblk is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's * shared with a written or unwritten extent and doesn't already * have one. Written and unwritten extents can be purged from the * extents status tree if the system is under memory pressure, so * it's necessary to examine the extent tree if a search of the * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { if (!ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) { ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); if (ret < 0) return ret; if (ret == 0) { ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ return ret; } else { allocated = true; } } else { allocated = true; } } } ext4_es_insert_delayed_block(inode, lblk, allocated); return 0; } /* * This function is grabs code from the very beginning of * ext4_map_blocks, but assumes that the caller is from delayed write * time. This function looks up the requested blocks and sets the * buffer delay bit under the protection of i_data_sem. */ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct ext4_map_blocks *map, struct buffer_head *bh) { struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { if (ext4_es_is_hole(&es)) { retval = 0; down_read(&EXT4_I(inode)->i_data_sem); goto add_delayed; } /* * Delayed extent could be allocated by fallocate. * So we need to check it. */ if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); return 0; } map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; retval = es.es_len - (iblock - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) map->m_flags |= EXT4_MAP_UNWRITTEN; else BUG(); #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif return retval; } /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(NULL, inode, map, 0); else retval = ext4_ind_map_blocks(NULL, inode, map, 0); add_delayed: if (retval == 0) { int ret; /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ ret = ext4_insert_delayed_block(inode, map->m_lblk); if (ret != 0) { retval = ret; goto out_unlock; } map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); } else if (retval > 0) { unsigned int status; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); } out_unlock: up_read((&EXT4_I(inode)->i_data_sem)); return retval; } /* * This is a special get_block_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. * * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. * We also have b_blocknr = -1 and b_bdev initialized properly * * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. */ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); map.m_lblk = iblock; map.m_len = 1; /* * first, we need to know whether the block is allocated already * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ ret = ext4_da_map_blocks(inode, iblock, &map, bh); if (ret <= 0) return ret; map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); if (buffer_unwritten(bh)) { /* A delayed write to unwritten bh should be marked * new and mapped. Mapped ensures that we don't do * get_block multiple times when we write to the same * offset and new ensures that we do proper zero out * for partial write. */ set_buffer_new(bh); set_buffer_mapped(bh); } return 0; } static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { mpd->first_page += folio_nr_pages(folio); folio_unlock(folio); } static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) { size_t len; loff_t size; int err; BUG_ON(folio->index != mpd->first_page); folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing * data through mmap while writeback runs. folio_clear_dirty_for_io() * write-protects our page in page tables and the page cannot get * written to again until we release folio lock. So only after * folio_clear_dirty_for_io() we are safe to sample i_size for * ext4_bio_write_folio() to zero-out tail of the written page. We rely * on the barrier provided by folio_test_clear_dirty() in * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; return err; } #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) /* * mballoc gives us at most this number of blocks... * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). * The rest of mballoc seems to handle chunks up to full group size. */ #define MAX_WRITEPAGES_EXTENT_LEN 2048 /* * mpage_add_bh_to_extent - try to add bh to extent of blocks to map * * @mpd - extent of blocks * @lblk - logical number of the block in the file * @bh - buffer head we want to add to the extent * * The function is used to collect contig. blocks in the same state. If the * buffer doesn't require mapping for writeback and we haven't started the * extent of buffers to map yet, the function returns 'true' immediately - the * caller can write the buffer right away. Otherwise the function returns true * if the block has been added to the extent, false if the block couldn't be * added. */ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, struct buffer_head *bh) { struct ext4_map_blocks *map = &mpd->map; /* Buffer that doesn't need mapping for writeback? */ if (!buffer_dirty(bh) || !buffer_mapped(bh) || (!buffer_delay(bh) && !buffer_unwritten(bh))) { /* So far no extent to map => we write the buffer right away */ if (map->m_len == 0) return true; return false; } /* First block in the extent? */ if (map->m_len == 0) { /* We cannot map unless handle is started... */ if (!mpd->do_map) return false; map->m_lblk = lblk; map->m_len = 1; map->m_flags = bh->b_state & BH_FLAGS; return true; } /* Don't go larger than mballoc is willing to allocate */ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) return false; /* Can we merge the block to our big extent? */ if (lblk == map->m_lblk + map->m_len && (bh->b_state & BH_FLAGS) == map->m_flags) { map->m_len++; return true; } return false; } /* * mpage_process_page_bufs - submit page buffers for IO or add them to extent * * @mpd - extent of blocks for mapping * @head - the first buffer in the page * @bh - buffer we should start processing from * @lblk - logical number of the block in the file corresponding to @bh * * Walk through page buffers from @bh upto @head (exclusive) and either submit * the page for IO if all buffers in this page were mapped and there's no * accumulated extent of buffers to map or add buffers in the page to the * extent of buffers to map. The function returns 1 if the caller can continue * by processing the next page, 0 if it should stop adding buffers to the * extent to map because we cannot extend it anymore. It can also return value * < 0 in case of error during IO submission. */ static int mpage_process_page_bufs(struct mpage_da_data *mpd, struct buffer_head *head, struct buffer_head *bh, ext4_lblk_t lblk) { struct inode *inode = mpd->inode; int err; ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; if (ext4_verity_in_progress(inode)) blocks = EXT_MAX_BLOCKS; do { BUG_ON(buffer_locked(bh)); if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { /* Found extent to map? */ if (mpd->map.m_len) return 0; /* Buffer needs mapping and handle is not started? */ if (!mpd->do_map) return 0; /* Everything mapped so far and we hit EOF */ break; } } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; mpage_folio_done(mpd, head->b_folio); } if (lblk >= blocks) { mpd->scanned_until_end = 1; return 0; } return 1; } /* * mpage_process_folio - update folio buffers corresponding to changed extent * and may submit fully mapped page for IO * @mpd: description of extent to map, on return next extent to map * @folio: Contains these buffers. * @m_lblk: logical block mapping. * @m_pblk: corresponding physical mapping. * @map_bh: determines on return whether this page requires any further * mapping or not. * * Scan given folio buffers corresponding to changed extent and update buffer * state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits. * If the given folio is not fully mapped, we update @mpd to the next extent in * the given folio that needs mapping & return @map_bh as true. */ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { struct buffer_head *head, *bh; ext4_io_end_t *io_end = mpd->io_submit.io_end; ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; int blkbits = mpd->inode->i_blkbits; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); bh = head = folio_buffers(folio); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. * Find next buffer in the folio to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; io_end_vec->size += io_end_size; err = mpage_process_page_bufs(mpd, head, bh, lblk); if (err > 0) err = 0; if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) { err = PTR_ERR(io_end_vec); goto out; } io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; } *map_bh = true; goto out; } if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); io_end_size += (1 << blkbits); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; *map_bh = false; out: *m_lblk = lblk; *m_pblk = pblock; return err; } /* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO * * @mpd - description of extent to map, on return next extent to map * * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. */ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; lblk = start << bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); while (start <= end) { nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* * If map_bh is true, means page may require further bh * mapping, or maybe the page was submitted for IO. * So we return to call further extent mapping. */ if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; mpage_folio_done(mpd, folio); } folio_batch_release(&fbatch); } /* Extent fully mapped and matches with page boundary. We are done. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; out: folio_batch_release(&fbatch); return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int get_blocks_flags; int err, dioread_nolock; trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if * possible. * * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if * the blocks in question are delalloc blocks. This indicates * that the blocks and quotas has already been checked when * the data was copied into the page cache. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_GET_BLOCKS_IO_SUBMIT; dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (map->m_flags & BIT(BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) return err; if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { if (!mpd->io_submit.io_end->handle && ext4_handle_valid(handle)) { mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); return 0; } /* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO * * @handle - handle for journal operations * @mpd - extent to map * @give_up_on_write - we set this to true iff there is a fatal error and there * is no hope of writing the data. The caller should discard * dirty pages to avoid infinite loops. * * The function maps extent starting at mpd->lblk of length mpd->len. If it is * delayed, blocks are allocated, if it is unwritten, we may need to convert * them to initialized or split the described range from larger unwritten * extent. Note that we need not map all the described range since allocation * can return less blocks or the range is covered by more unwritten extents. We * cannot map more because we are limited by reserved transaction credits. On * the other hand we always make sure that the last touched page is fully * mapped so that it can be written out (and thus forward progress is * guaranteed). After mapping we submit all mapped pages for IO. */ static int mpage_map_and_submit_extent(handle_t *handle, struct mpage_da_data *mpd, bool *give_up_on_write) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int err; loff_t disksize; int progress = 0; ext4_io_end_t *io_end = mpd->io_submit.io_end; struct ext4_io_end_vec *io_end_vec; io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { struct super_block *sb = inode->i_sb; if (ext4_forced_shutdown(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. * In the case of ENOSPC, if ext4_count_free_blocks() * is non-zero, a commit should free up blocks. */ if ((err == -ENOMEM) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { if (progress) goto update_disksize; return err; } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); ext4_msg(sb, KERN_CRIT, "This should not happen!! Data will " "be lost\n"); if (err == -ENOSPC) ext4_print_free_blocks(inode); invalidate_dirty_pages: *give_up_on_write = true; return err; } progress = 1; /* * Update buffer state, submit mapped pages, and get us new * extent to map */ err = mpage_map_and_submit_buffers(mpd); if (err < 0) goto update_disksize; } while (map->m_len); update_disksize: /* * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. */ disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; down_write(&EXT4_I(inode)->i_data_sem); i_size = i_size_read(inode); if (disksize > i_size) disksize = i_size; if (disksize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { ext4_error_err(inode->i_sb, -err2, "Failed to mark inode %lu dirty", inode->i_ino); } if (!err) err = err2; } return err; } /* * Calculate the total number of credits to reserve for one writepages * iteration. This is called from ext4_writepages(). We map an extent of * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + * bpp - 1 blocks in bpp different extents. */ static int ext4_da_writepages_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); return ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { struct buffer_head *page_bufs = folio_buffers(folio); struct inode *inode = folio->mapping->host; int ret, err; ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, do_journal_get_write_access); err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, write_end_fn); if (ret == 0) ret = err; err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; return ret; } static int mpage_journal_page_buffers(handle_t *handle, struct mpage_da_data *mpd, struct folio *folio) { struct inode *inode = mpd->inode; loff_t size = i_size_read(inode); size_t len = folio_size(folio); folio_clear_checked(folio); mpd->wbc->nr_to_write--; if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) len = size - folio_pos(folio); return ext4_journal_folio_buffers(handle, folio, len); } /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages * needing mapping, submit mapped pages * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for * IO immediately. If we cannot map blocks, we submit just already mapped * buffers in the page for IO and keep page dirty. When we can map blocks and * we find a page which isn't mapped we start accumulating extent of buffers * underlying these pages that needs mapping (formed by either delayed or * unwritten buffers). We also lock the pages containing these buffers. The * extent found is returned in @mpd structure (starting at mpd->lblk with * length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. Although it may seem as an * unnecessary complication, it is actually inevitable in blocksize < pagesize * case as we need to track IO to all buffers underlying a page in one io_end. */ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_page(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; mpd->map.m_len = 0; mpd->next_page = index; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); if (IS_ERR(handle)) return PTR_ERR(handle); } folio_batch_init(&fbatch); while (index <= end) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to * keep going because someone may be concurrently * dirtying pages, and we might have synced a lot of * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= mpd->map.m_len >> (PAGE_SHIFT - blkbits)) goto out; /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; if (handle) { err = ext4_journal_ensure_credits(handle, bpp, 0); if (err < 0) goto out; } folio_lock(folio); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which * means it has been truncated or invalidated), or the * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ if (!folio_test_dirty(folio) || (folio_test_writeback(folio) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } folio_wait_writeback(folio); BUG_ON(folio_test_writeback(folio)); /* * Should never happen but for buggy code in * other subsystems that call * set_page_dirty() without properly warning * the file system first. See [1] for more * information. * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ if (!folio_buffers(folio)) { ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); folio_clear_dirty(folio); folio_unlock(folio); continue; } if (mpd->map.m_len == 0) mpd->first_page = folio->index; mpd->next_page = folio_next_index(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we * first handle writeout of the page for checkpoint and * only after that handle delayed page dirtying. This * makes sure current data is checkpointed to the final * location before possibly journalling it again which * is desirable when the page is frequently dirtied * through a pin. */ if (!mpd->can_map) { err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; /* Pending dirtying of journalled data? */ if (folio_test_checked(folio)) { err = mpage_journal_page_buffers(handle, mpd, folio); if (err < 0) goto out; mpd->journalled_more_data = 1; } mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << (PAGE_SHIFT - blkbits); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); if (err <= 0) goto out; err = 0; } } folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; if (handle) ext4_journal_stop(handle); return 0; out: folio_batch_release(&fbatch); if (handle) ext4_journal_stop(handle); return err; } static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; bool give_up_on_write = false; trace_ext4_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { ret = -EROFS; goto out_writepages; } /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so * we'd better clear the inline data here. */ if (ext4_has_inline_data(inode)) { /* Just inode will be modified... */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_writepages; } BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)); ext4_destroy_inline_data(handle, inode); ext4_journal_stop(handle); } /* * data=journal mode does not do delalloc so we just need to writeout / * journal already mapped buffers. On the other hand we need to commit * transaction to make data stable. We expect all the data to be * already in the journal (the only exception are DMA pinned pages * dirtied behind our back) so we commit transaction here and run the * writeback loop to checkpoint them. The checkpointing is not actually * necessary to make data persistent *but* quite a few places (extent * shifting operations, fsverity, ...) depend on being able to drop * pagecache pages after calling filemap_write_and_wait() and for that * checkpointing needs to happen. */ if (ext4_should_journal_data(inode)) { mpd->can_map = 0; if (wbc->sync_mode == WB_SYNC_ALL) ext4_fc_commit(sbi->s_journal, EXT4_I(inode)->i_datasync_tid); } mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { /* * We may need to convert up to one extent per block in * the page and we may dirty the inode. */ rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, PAGE_SIZE >> inode->i_blkbits); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; mpd->first_page = writeback_index; mpd->last_page = -1; } else { mpd->first_page = wbc->range_start >> PAGE_SHIFT; mpd->last_page = wbc->range_end >> PAGE_SHIFT; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, mpd->first_page, mpd->last_page); blk_start_plug(&plug); /* * First writeback pages that don't need mapping - we can avoid * starting a transaction unnecessarily and also avoid being blocked * in the block layer on device congestion while having transaction * started. */ mpd->do_map = 0; mpd->scanned_until_end = 0; mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; goto unplug; } ret = mpage_prepare_extent_to_map(mpd); /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); ext4_put_io_end_defer(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret < 0) goto unplug; while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; break; } WARN_ON_ONCE(!mpd->can_map); /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when * blocksize < pagesize) so that we don't block on IO when we * try to write out the rest of the page. Journalled mode is * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); needed_blocks = ext4_da_writepages_trans_blocks(inode); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; break; } mpd->do_map = 1; trace_ext4_da_write_pages(inode, mpd->first_page, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, &give_up_on_write); /* * Caution: If the handle is synchronous, * ext4_journal_stop() can wait for transaction commit * to finish which may depend on writeback of pages to * complete or on page lock to be released. In that * case, we have to wait until after we have * submitted all the IO, released page locks we hold, * and dropped io_end reference (for extent conversion * to be able to complete) before stopping the handle. */ if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; mpd->do_map = 0; } /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, give_up_on_write); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); /* * Drop our io_end reference we got from init. We have * to be careful and use deferred io_end finishing if * we are still holding the transaction as we can * release the last reference to io_end which may end * up doing unwritten extent conversion. */ if (handle) { ext4_put_io_end_defer(mpd->io_submit.io_end); ext4_journal_stop(handle); } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; continue; } /* Fatal error - ENOMEM, EIO... */ if (ret) break; } unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; mpd->last_page = writeback_index - 1; mpd->first_page = 0; goto retry; } /* Update index */ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * Set the writeback_index so that range_cyclic * mode will write it back later */ mapping->writeback_index = mpd->first_page; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); return ret; } static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct super_block *sb = mapping->host->i_sb; struct mpage_da_data mpd = { .inode = mapping->host, .wbc = wbc, .can_map = 1, }; int ret; int alloc_ctx; if (unlikely(ext4_forced_shutdown(sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); /* * For data=journal writeback we could have come across pages marked * for delayed dirtying (PageChecked) which were just added to the * running transaction. Try once more to get them to stable storage. */ if (!ret && mpd.journalled_more_data) ret = ext4_do_writepages(&mpd); ext4_writepages_up_read(sb, alloc_ctx); return ret; } int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; struct mpage_da_data mpd = { .inode = jinode->i_vfs_inode, .wbc = &wbc, .can_map = 0, }; return ext4_do_writepages(&mpd); } static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; int alloc_ctx; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; } static int ext4_nonda_switch(struct super_block *sb) { s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * switch to non delalloc mode if we are running low * on free block. The free block accounting via percpu * counters can get slightly wrong with percpu_counter_batch getting * accumulated on each CPU without updating global counters * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ free_clusters = percpu_counter_read_positive(&sbi->s_freeclusters_counter); dirty_clusters = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty. */ if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); if (2 * free_clusters < 3 * dirty_clusters || free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark */ return 1; } return 0; } static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret, retries = 0; struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, pagep, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, pagep, fsdata); if (ret < 0) return ret; if (ret == 1) return 0; } retry: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); #else ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); #endif if (ret < 0) { folio_unlock(folio); folio_put(folio); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } *pagep = &folio->page; return ret; } /* * Check if we should update i_disksize * when write to the end of file but not require block allocation */ static int ext4_da_should_update_i_disksize(struct folio *folio, unsigned long offset) { struct buffer_head *bh; struct inode *inode = folio->mapping->host; unsigned int idx; int i; bh = folio_buffers(folio); idx = offset >> inode->i_blkbits; for (i = 0; i < idx; i++) bh = bh->b_this_page; if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } static int ext4_da_do_write_end(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; loff_t new_i_size; /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ copied = block_write_end(NULL, mapping, pos, len, copied, &folio->page, NULL); new_i_size = pos + copied; /* * It's important to update i_size while still holding folio lock, * because folio writeout could otherwise come in and zero beyond * i_size. * * Since we are holding inode lock, we are sure i_disksize <= * i_size. We also know that if i_disksize < i_size, there are * delalloc writes pending in the range up to i_size. If the end of * the current write is <= i_size, there's no need to touch * i_disksize since writeback will push i_disksize up to i_size * eventually. If the end of the current write is > i_size and * inside an allocated block which ext4_da_should_update_i_disksize() * checked, we need to update i_disksize here as certain * ext4_writepages() paths not allocating blocks and update i_disksize. */ if (new_i_size > inode->i_size) { unsigned long end; i_size_write(inode, new_i_size); end = (new_i_size - 1) & (PAGE_SIZE - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; } } folio_unlock(folio); folio_put(folio); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); if (disksize_changed) { handle_t *handle; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } return copied; } static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; struct folio *folio = page_folio(page); if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, len, copied, &folio->page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; return ext4_da_do_write_end(mapping, pos, len, copied, folio); } /* * Force all delayed allocation blocks to be allocated for a given inode. */ int ext4_alloc_da_blocks(struct inode *inode) { trace_ext4_alloc_da_blocks(inode); if (!EXT4_I(inode)->i_reserved_data_blocks) return 0; /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is * not strictly speaking necessary (and for users of * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() * mpage_da_map_blocks() * * The problem is that write_cache_pages(), located in * mm/page-writeback.c, marks pages clean in preparation for * doing I/O, which is not desirable if we're not planning on * doing I/O at all. * * We could call write_cache_pages(), and then redirty all of * the pages by calling redirty_page_for_writepage() but that * would be ugly in the extreme. So instead we would need to * replicate parts of the code in the above functions, * simplifying them because we wouldn't actually intend to * write out the pages, but rather only collect contiguous * logical block extents, call the multi-block allocator, and * then update the buffer heads with the block allocations. * * For now, though, we'll cheat by calling filemap_flush(), * which will map the blocks, and start the I/O, but not * actually wait for the I/O to complete. */ return filemap_flush(inode->i_mapping); } /* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * * Naturally, this is dangerous if the block concerned is still in the * journal. If somebody makes a swapfile on an ext4 data-journaling * filesystem and enables swap, then they may get a nasty shock when the * data getting swapped to that swapfile suddenly gets overwritten by * the original zero's written out previously to the journal and * awaiting writeback in the kernel's buffer cache. * * So, if we see any bmap calls here on a modified, data-journaled file, * take extra steps to flush any blocks which might be in the cache. */ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; sector_t ret = 0; inode_lock_shared(inode); /* * We can get here for an inline file via the FIBMAP ioctl */ if (ext4_has_inline_data(inode)) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && (test_opt(inode->i_sb, DELALLOC) || ext4_should_journal_data(inode))) { /* * With delalloc or journalled data we want to sync the file so * that we can make sure we allocate blocks for file and data * is in place for the user to see it */ filemap_write_and_wait(mapping); } ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: inode_unlock_shared(inode); return ret; } static int ext4_read_folio(struct file *file, struct folio *folio) { int ret = -EAGAIN; struct inode *inode = folio->mapping->host; trace_ext4_read_folio(inode, folio); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) return ext4_mpage_readpages(inode, NULL, folio); return ret; } static void ext4_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; /* If the file has inline data, no need to do readahead. */ if (ext4_has_inline_data(inode)) return; ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidate_folio(struct folio *folio, size_t offset, size_t length) { trace_ext4_invalidate_folio(folio, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); block_invalidate_folio(folio, offset, length); } static int __ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { journal_t *journal = EXT4_JOURNAL(folio->mapping->host); trace_ext4_journalled_invalidate_folio(folio, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0 && length == folio_size(folio)) folio_clear_checked(folio); return jbd2_journal_invalidate_folio(journal, folio, offset, length); } /* Wrapper for aops... */ static void ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); } static bool ext4_release_folio(struct folio *folio, gfp_t wait) { struct inode *inode = folio->mapping->host; journal_t *journal = EXT4_JOURNAL(inode); trace_ext4_release_folio(inode, folio); /* Page has dirty journalled data -> cannot release */ if (folio_test_checked(folio)) return false; if (journal) return jbd2_journal_try_to_free_buffers(journal, folio); else return try_to_free_buffers(folio); } static bool ext4_inode_datasync_dirty(struct inode *inode) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal) { if (jbd2_transaction_committed(journal, EXT4_I(inode)->i_datasync_tid)) return false; if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) return !list_empty(&EXT4_I(inode)->i_fc_list); return true; } /* Any metadata buffers to write? */ if (!list_empty(&inode->i_mapping->i_private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, struct ext4_map_blocks *map, loff_t offset, loff_t length, unsigned int flags) { u8 blkbits = inode->i_blkbits; /* * Writes that span EOF might trigger an I/O size update on completion, * so consider them to be dirty for the purpose of O_DSYNC, even if * there is no other metadata changes being made or are pending. */ iomap->flags = 0; if (ext4_inode_datasync_dirty(inode) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64) map->m_lblk << blkbits; iomap->length = (u64) map->m_len << blkbits; if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) iomap->flags |= IOMAP_F_MERGED; /* * Flags passed to ext4_map_blocks() for direct I/O writes can result * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits * set. In order for any allocated unwritten extents to be converted * into written extents correctly within the ->end_io() handler, we * need to ensure that the iomap->type is set appropriately. Hence, the * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has * been set first. */ if (map->m_flags & EXT4_MAP_UNWRITTEN) { iomap->type = IOMAP_UNWRITTEN; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; } } static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; /* * Trim the mapping request to the maximum value that we can map at * once for direct I/O. */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); retry: /* * Either we allocate blocks and then don't get an unwritten extent, so * in that case we have reserved enough credits. Or, the blocks are * already allocated and unwritten. In that case, the extent conversion * fits into the credits as well. */ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); if (IS_ERR(handle)) return PTR_ERR(handle); /* * DAX and direct I/O are the only two operations that are currently * supported with IOMAP_WRITE. */ WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); if (flags & IOMAP_DAX) m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; /* * We use i_size instead of i_disksize here because delalloc writeback * can complete at any point during the I/O and subsequently push the * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could * expose stale data in the case of a crash. Use the magic error code * to fallback to buffered I/O. */ if (!m_flags && !ret) ret = -ENOTBLK; ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (WARN_ON_ONCE(ext4_has_inline_data(inode))) return -ERANGE; /* * Calculate the first and last logical blocks respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; if (flags & IOMAP_WRITE) { /* * We check here if the blocks are already allocated, then we * don't need to start a journal txn and we can directly return * the mapping information. This could boost performance * especially in multi-threaded overwrite requests. */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) goto out; } ret = ext4_iomap_alloc(inode, &map, flags); } else { ret = ext4_map_blocks(NULL, inode, &map, 0); } if (ret < 0) return ret; out: /* * When inline encryption is enabled, sometimes I/O to an encrypted file * has to be broken up to guarantee DUN contiguity. Handle this by * limiting the length of the mapping returned. */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; /* * Even for writes we don't need to allocate blocks, so just pretend * we are reading to save overhead of starting a transaction. */ flags &= ~IOMAP_WRITE; ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); return ret; } static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to * the allocated blocks. If so, return the magic error code so that we * fallback to buffered I/O and attempt to complete the remainder of * the I/O. Any blocks that may have been allocated in preparation for * the direct I/O will be reused during buffered I/O. */ if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) return -ENOTBLK; return 0; } const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, .iomap_end = ext4_iomap_end, }; const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_begin = ext4_iomap_overwrite_begin, .iomap_end = ext4_iomap_end, }; static bool ext4_iomap_is_delalloc(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; ext4_es_find_extent_range(inode, &ext4_es_is_delayed, map->m_lblk, end, &es); if (!es.es_len || es.es_lblk > end) return false; if (es.es_lblk > map->m_lblk) { map->m_len = es.es_lblk - map->m_lblk; return false; } offset = map->m_lblk - es.es_lblk; map->m_len = es.es_len - offset; return true; } static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; bool delalloc = false; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (ext4_has_inline_data(inode)) { ret = ext4_inline_data_iomap(inode, iomap); if (ret != -EAGAIN) { if (ret == 0 && offset >= iomap->length) ret = -ENOENT; return ret; } } /* * Calculate the first and last logical block respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; /* * Fiemap callers may call for offset beyond s_bitmap_maxbytes. * So handle it here itself instead of querying ext4_map_blocks(). * Since ext4_map_blocks() will warn about it and will return * -EIO error. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (offset >= sbi->s_bitmap_maxbytes) { map.m_flags = 0; goto set_iomap; } } ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; if (ret == 0) delalloc = ext4_iomap_is_delalloc(inode, &map); set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); if (delalloc && iomap->type == IOMAP_HOLE) iomap->type = IOMAP_DELALLOC; return 0; } const struct iomap_ops ext4_iomap_report_ops = { .iomap_begin = ext4_iomap_begin_report, }; /* * For data=journal mode, folio should be marked dirty only when it was * writeably mapped. When that happens, it was already attached to the * transaction and marked as jbddirty (we take care of this in * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings * so we should have nothing to do here, except for the case when someone * had the page pinned and dirtied the page through this pin (e.g. by doing * direct IO to it). In that case we'd need to attach buffers here to the * transaction but we cannot due to lock ordering. We cannot just dirty the * folio and leave attached buffers clean, because the buffers' dirty state is * "definitive". We cannot just set the buffers dirty or jbddirty because all * the journalling code will explode. So what we do is to mark the folio * "pending dirty" and next time ext4_writepages() is called, attach buffers * to the transaction appropriately. */ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); if (folio_maybe_dma_pinned(folio)) folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio)); WARN_ON_ONCE(!folio_buffers(folio)); return block_dirty_folio(mapping, folio); } static int ext4_iomap_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return iomap_swapfile_activate(sis, file, span, &ext4_iomap_report_ops); } static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_journalled_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .dirty_folio = ext4_journalled_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_da_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, }; void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: case EXT4_INODE_WRITEBACK_DATA_MODE: break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; return; default: BUG(); } if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext4_dax_aops; else if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; else inode->i_mapping->a_ops = &ext4_aops; } /* * Here we can't skip an unwritten buffer even though it usually reads zero * because it might have data in pagecache (eg, if called from ext4_zero_range, * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a * racing writeback can come later and flush the stale pagecache to disk. */ static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_SHIFT; unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; struct folio *folio; int err = 0; folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (IS_ERR(folio)) return PTR_ERR(folio); blocksize = inode->i_sb->s_blocksize; iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } if (buffer_freed(bh)) { BUFFER_TRACE(bh, "freed: skip"); goto unlock; } if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "unmapped"); ext4_get_block(inode, iblock, bh, 0); /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "still unmapped"); goto unlock; } } /* Ok, it's mapped. Make sure it's up-to-date */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { err = ext4_read_bh_lock(bh, 0, true); if (err) goto unlock; if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); err = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(bh)); if (err) { clear_buffer_uptodate(bh); goto unlock; } } } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto unlock; } folio_zero_range(folio, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { err = ext4_dirty_journalled_data(handle, bh); } else { err = 0; mark_buffer_dirty(bh); if (ext4_should_order_data(inode)) err = ext4_jbd2_inode_add_write(handle, inode, from, length); } unlock: folio_unlock(folio); folio_put(folio); return err; } /* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block * that corresponds to 'from' */ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize = inode->i_sb->s_blocksize; unsigned max = blocksize - (offset & (blocksize - 1)); /* * correct length if it does not fall between * 'from' and the end of the block */ if (length > max || length < 0) length = max; if (IS_DAX(inode)) { return dax_zero_range(inode, from, length, NULL, &ext4_iomap_ops); } return __ext4_block_zero_page_range(handle, mapping, from, length); } /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { unsigned offset = from & (PAGE_SIZE-1); unsigned length; unsigned blocksize; struct inode *inode = mapping->host; /* If we are processing an encrypted inode during orphan list handling */ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); return ext4_block_zero_page_range(handle, mapping, from, length); } int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; unsigned partial_start, partial_end; ext4_fsblk_t start, end; loff_t byte_end = (lstart + length - 1); int err = 0; partial_start = lstart & (sb->s_blocksize - 1); partial_end = byte_end & (sb->s_blocksize - 1); start = lstart >> sb->s_blocksize_bits; end = byte_end >> sb->s_blocksize_bits; /* Handle partial zero within the single block */ if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { err = ext4_block_zero_page_range(handle, mapping, lstart, length); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { err = ext4_block_zero_page_range(handle, mapping, lstart, sb->s_blocksize); if (err) return err; } /* Handle partial zero out on the end of the range */ if (partial_end != sb->s_blocksize - 1) err = ext4_block_zero_page_range(handle, mapping, byte_end - partial_end, partial_end + 1); return err; } int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) return 1; if (S_ISLNK(inode->i_mode)) return !ext4_inode_is_fast_symlink(inode); return 0; } /* * We have to make sure i_disksize gets properly updated before we truncate * page cache due to hole punching or zero range. Otherwise i_disksize update * can get lost as it may have been postponed to submission of writeback but * that will never happen after we truncate page cache. */ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) { handle_t *handle; int ret; loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); if (offset > size || offset + len < size) return 0; if (EXT4_I(inode)->i_disksize >= size) return 0; handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_update_i_disksize(inode, size); ret = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); return ret; } static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); schedule(); filemap_invalidate_lock(inode->i_mapping); } int ext4_break_layouts(struct inode *inode) { struct page *page; int error; if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; do { page = dax_layout_busy_page(inode->i_mapping); if (!page) return 0; error = ___wait_var_event(&page->_refcount, atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, 0, 0, ext4_wait_dax_page(inode)); } while (error == 0); return error; } /* * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode * @offset: The offset where the hole will begin * @len: The length of the hole * * Returns: 0 on success or negative on failure */ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; loff_t first_block_offset, last_block_offset, max_length; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; unsigned int credits; int ret = 0, ret2 = 0; trace_ext4_punch_hole(inode, offset, length, 0); /* * Write out all dirty pages to avoid race conditions * Then release them. */ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ret = filemap_write_and_wait_range(mapping, offset, offset + length - 1); if (ret) return ret; } inode_lock(inode); /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) goto out_mutex; /* * If the hole extends beyond i_size, set the hole * to end after the page that contains i_size */ if (offset + length > inode->i_size) { length = inode->i_size + PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - offset; } /* * For punch hole the length + offset needs to be within one block * before last range. Adjust the length if it goes beyond that limit. */ max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; if (offset + length > max_length) length = max_length - offset; if (offset & (sb->s_blocksize - 1) || (offset + length) & (sb->s_blocksize - 1)) { /* * Attach jinode to inode for jbd2 if we do any zeroing of * partial block */ ret = ext4_inode_attach_jinode(inode); if (ret < 0) goto out_mutex; } /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* * Prevent page faults from reinstantiating pages we have released from * page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) goto out_dio; first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; /* Now release the pages and zero block aligned part of pages*/ if (last_block_offset > first_block_offset) { ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) goto out_dio; truncate_pagecache_range(inode, first_block_offset, last_block_offset); } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); goto out_dio; } ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) goto out_stop; first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); /* If there are blocks to remove, do it */ if (stop_block > first_block) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); ext4_es_remove_extent(inode, first_block, stop_block - first_block); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, first_block, stop_block - 1); else ret = ext4_ind_remove_space(handle, inode, first_block, stop_block); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, first_block, stop_block); if (IS_SYNC(inode)) ext4_handle_sync(handle); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2)) ret = ret2; if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_dio: filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; } int ext4_inode_attach_jinode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct jbd2_inode *jinode; if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) return 0; jinode = jbd2_alloc_inode(GFP_KERNEL); spin_lock(&inode->i_lock); if (!ei->jinode) { if (!jinode) { spin_unlock(&inode->i_lock); return -ENOMEM; } ei->jinode = jinode; jbd2_journal_init_jbd_inode(ei->jinode, inode); jinode = NULL; } spin_unlock(&inode->i_lock); if (unlikely(jinode != NULL)) jbd2_free_inode(jinode); return 0; } /* * ext4_truncate() * * We block out ext4_get_block() block instantiations across the entire * transaction, and VFS/VM ensures that ext4_truncate() cannot run * simultaneously on behalf of the same inode. * * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * * The file's tree may be transiently inconsistent in memory (although it * probably isn't), but whenever we close off and commit a journal transaction, * the contents of (the filesystem + the journal) must be consistent and * restartable. It's pretty simple, really: bottom up, right to left (although * left-to-right works OK too). * * Note that at recovery time, journal replay occurs *before* the restart of * truncate against the orphan inode list. * * The committed inode has the new, desired i_size (which is the same as * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see * that this inode's truncate did not complete and it will again call * ext4_truncate() to have another go. So there will be instantiated blocks * to the right of the truncation point in a crashed ext4 filesystem. But * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. */ int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; int err = 0, err2; handle_t *handle; struct address_space *mapping = inode->i_mapping; /* * There is a possibility that we're either freeing the inode * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) goto out_trace; if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (ext4_has_inline_data(inode)) { int has_inline = 1; err = ext4_inline_data_truncate(inode, &has_inline); if (err || has_inline) goto out_trace; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { err = ext4_inode_attach_jinode(inode); if (err) goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_trace; } if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); /* * We add the inode to the orphan list, so that if this * truncate spans multiple transactions, and we crash, we will * resume the truncate when the filesystem recovers. It also * marks the inode dirty, to catch the new size. * * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ err = ext4_orphan_add(handle, inode); if (err) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); if (err) goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); out_stop: /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by * ext4_evict_inode(), and we allow that function to clean up the * orphan info for us. */ if (inode->i_nlink) ext4_orphan_del(handle, inode); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; ext4_journal_stop(handle); out_trace: trace_ext4_truncate_exit(inode); return err; } static inline u64 ext4_inode_peek_iversion(const struct inode *inode) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return inode_peek_iversion_raw(inode); else return inode_peek_iversion(inode); } static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); u64 i_blocks = READ_ONCE(inode->i_blocks); struct super_block *sb = inode->i_sb; if (i_blocks <= ~0U) { /* * i_blocks can be represented in a 32 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } /* * This should never happen since sb->s_maxbytes should not have * allowed this, sb->s_maxbytes was set according to the huge_file * feature in ext4_fill_super(). */ if (!ext4_has_feature_huge_file(sb)) return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); } else { ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } return 0; } static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) { struct ext4_inode_info *ei = EXT4_I(inode); uid_t i_uid; gid_t i_gid; projid_t i_projid; int block; int err; err = ext4_inode_blocks_set(raw_inode, ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, * old inodes get re-used with the upper 16 bits of the * uid/gid intact. */ if (ei->i_dtime && list_empty(&ei->i_orphan)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); ext4_isize_set(raw_inode, ei->i_disksize); raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { raw_inode->i_block[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); raw_inode->i_block[1] = 0; } else { raw_inode->i_block[0] = 0; raw_inode->i_block[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) raw_inode->i_version_hi = cpu_to_le32(ivers >> 32); raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } } if (i_projid != EXT4_DEF_PROJID && !ext4_has_feature_project(inode->i_sb)) err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) raw_inode->i_projid = cpu_to_le32(i_projid); ext4_inode_csum_set(inode, raw_inode, ei); return err; } /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If we pass 'inode' and it does not * have in-inode xattr, we have all inode data in memory that is needed * to recreate the on-disk version of this inode. */ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, struct inode *inode, struct ext4_iloc *iloc, ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; struct buffer_head *bh; ext4_fsblk_t block; struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; if (ino < EXT4_ROOT_INO || ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) return -EFSCORRUPTED; iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); if (!gdp) return -EIO; /* * Figure out the offset within the block group inode table */ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); block = ext4_inode_table(sb, gdp); if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { ext4_error(sb, "Invalid inode table block %llu in " "block_group %u", block, iloc->block_group); return -EFSCORRUPTED; } block += (inode_offset / inodes_per_block); bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; if (ext4_buffer_uptodate(bh)) goto has_buffer; lock_buffer(bh); if (ext4_buffer_uptodate(bh)) { /* Someone brought it uptodate while we waited */ unlock_buffer(bh); goto has_buffer; } /* * If we have all information of the inode in memory and this * is the only valid inode in the block, we need not read the * block. */ if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct buffer_head *bitmap_bh; int i, start; start = inode_offset & ~(inodes_per_block - 1); /* Is the inode bitmap in cache? */ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); if (unlikely(!bitmap_bh)) goto make_io; /* * If the inode bitmap isn't in cache then the * optimisation may end up performing two reads instead * of one, so skip it. */ if (!buffer_uptodate(bitmap_bh)) { brelse(bitmap_bh); goto make_io; } for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); if (i == start + inodes_per_block) { struct ext4_inode *raw_inode = (struct ext4_inode *) (bh->b_data + iloc->offset); /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) ext4_fill_raw_inode(inode, raw_inode); set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; } } make_io: /* * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) end = table; while (b <= end) ext4_sb_breadahead_unmovable(sb, b++); } /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ trace_ext4_load_inode(sb, ino); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); blk_finish_plug(&plug); wait_on_buffer(bh); ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; brelse(bh); return -EIO; } has_buffer: iloc->bh = bh; return 0; } static int __ext4_get_inode_loc_noinmem(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); } static bool ext4_should_enable_dax(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (test_opt2(inode->i_sb, DAX_NEVER)) return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) return false; if (ext4_has_inline_data(inode)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) return false; if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; WARN_ON_ONCE(IS_DAX(inode) && init); if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; /* Because of the way inode_set_flags() works we must preserve S_DAX * here if already set. */ new_fl |= (inode->i_flags & S_DAX); if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; if (flags & EXT4_VERITY_FL) new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { blkcnt_t i_blocks ; struct inode *inode = &(ei->vfs_inode); struct super_block *sb = inode->i_sb; if (ext4_has_feature_huge_file(sb)) { /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks represent file system block size */ return i_blocks << (inode->i_blkbits - 9); } else { return i_blocks; } } else { return le32_to_cpu(raw_inode->i_blocks_lo); } } static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (EXT4_INODE_HAS_XATTR_SPACE(inode) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return err; } else EXT4_I(inode)->i_inline_off = 0; return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) { if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; } /* * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag * set. */ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) inode_set_iversion_raw(inode, val); else inode_set_iversion_queried(inode, val); } static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) { if (flags & EXT4_IGET_EA_INODE) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return "missing EA_INODE flag"; if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || EXT4_I(inode)->i_file_acl) return "ea_inode with extended attributes"; } else { if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return "unexpected EA_INODE flag"; } if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) return "unexpected bad inode w/o EXT4_IGET_BAD"; return NULL; } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; const char *err_str; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; int block; uid_t i_uid; gid_t i_gid; projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || ino == le32_to_cpu(es->s_usr_quota_inum) || ino == le32_to_cpu(es->s_grp_quota_inum) || ino == le32_to_cpu(es->s_prj_quota_inum) || ino == le32_to_cpu(es->s_orphan_file_inum))) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); } inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { if ((err_str = check_igot_inode(inode, flags)) != NULL) { ext4_error_inode(inode, function, line, 0, err_str); iput(inode); return ERR_PTR(-EFSCORRUPTED); } return inode; } ei = EXT4_I(inode); iloc.bh = NULL; ret = __ext4_get_inode_loc_noinmem(inode, &iloc); if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; goto bad_inode; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { ext4_error_inode(inode, function, line, 0, "iget: bad extra_isize %u " "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } } else ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); else i_projid = EXT4_DEF_PROJID; if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes * the test is that same one that e2fsck uses * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted or unallocated */ if (flags & EXT4_IGET_SPECIAL) { ext4_error_inode(inode, function, line, 0, "iget: special inode unallocated"); ret = -EFSCORRUPTED; } else ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete * the process of deleting those. * OR it is the EXT4_BOOT_LOADER_INO which is * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode, true); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } /* * If dir_index is not enabled but there's dir with INDEX flag set, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. */ if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); ret = -EFSCORRUPTED; goto bad_inode; } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! */ for (block = 0; block < EXT4_N_BLOCKS; block++) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); ext4_fc_init_inode(&ei->vfs_inode); /* * Set transaction id's of transactions that have to be committed * to finish f[data]sync. We set them to currently running transaction * as we cannot be sure that the inode or some of its metadata isn't * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ if (journal) { transaction_t *transaction; tid_t tid; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else transaction = journal->j_committing_transaction; if (transaction) tid = transaction->t_tid; else tid = journal->j_commit_sequence; read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { ret = ext4_iget_extra_inode(inode, raw_inode, ei); if (ret) goto bad_inode; } } EXT4_INODE_GET_CTIME(inode, raw_inode); EXT4_INODE_GET_ATIME(inode, raw_inode); EXT4_INODE_GET_MTIME(inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = le32_to_cpu(raw_inode->i_disk_version); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; if (ei->i_file_acl && !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { /* validate the block references in the inode */ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode)))) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_check_inode(inode); else ret = ext4_ind_check_inode(inode); } } if (ret) goto bad_inode; if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { ext4_error_inode(inode, function, line, 0, "iget: immutable or append flags " "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); } else { inode->i_op = &ext4_symlink_inode_operations; } } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else if (ino == EXT4_BOOT_LOADER_INO) { make_bad_inode(inode); } else { ret = -EFSCORRUPTED; ext4_error_inode(inode, function, line, 0, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); ret = -EFSCORRUPTED; goto bad_inode; } if ((err_str = check_igot_inode(inode, flags)) != NULL) { ext4_error_inode(inode, function, line, 0, err_str); ret = -EFSCORRUPTED; goto bad_inode; } brelse(iloc.bh); unlock_new_inode(inode); return inode; bad_inode: brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } static void __ext4_update_other_inode_time(struct super_block *sb, unsigned long orig_ino, unsigned long ino, struct ext4_inode *raw_inode) { struct inode *inode; inode = find_inode_by_ino_rcu(sb, ino); if (!inode) return; if (!inode_is_dirtytime_only(inode)) return; spin_lock(&inode->i_lock); if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); return; } spin_unlock(&inode->i_lock); } /* * Opportunistically update the other time fields for other inodes in * the same inode table block. */ static void ext4_update_other_inodes_time(struct super_block *sb, unsigned long orig_ino, char *buf) { unsigned long ino; int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int inode_size = EXT4_INODE_SIZE(sb); /* * Calculate the first inode in the inode table block. Inode * numbers are one-based. That is, the first inode in a block * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). */ ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; rcu_read_lock(); for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; __ext4_update_other_inode_time(sb, orig_ino, ino, (struct ext4_inode *)buf); } rcu_read_unlock(); } /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. This gobbles the caller's reference to the * buffer_head in the inode location struct. * * The caller must have write access to iloc->bh. */ static int ext4_do_update_inode(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; struct super_block *sb = inode->i_sb; int err; int need_datasync = 0, set_large_file = 0; spin_lock(&ei->i_raw_lock); /* * For fields not tracked in the in-memory inode, initialise them * to zero for new inodes. */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) need_datasync = 1; if (ei->i_disksize > 0x7fffffffULL) { if (!ext4_has_feature_large_file(sb) || EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) set_large_file = 1; } err = ext4_fill_raw_inode(inode, raw_inode); spin_unlock(&ei->i_raw_lock); if (err) { EXT4_ERROR_INODE(inode, "corrupted inode contents"); goto out_brelse; } if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (err) goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_sync(handle); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_error: ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); return err; } /* * ext4_write_inode() * * We are called from a few places: * * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * * - Within flush work (sys_sync(), kupdate and such). * We wait on commit, if told to. * * - Within iput_final() -> write_inode_now() * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in * which we are interested. * * It would be a bug for them to not do this. The code: * * mark_inode_dirty(inode) * stuff(); * inode->i_size = expr; * * is in error because write_inode() could occur while `stuff()' is running, * and the new i_size will be lost. Plus the inode will no longer be on the * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { ext4_debug("called recursively, non-PF_MEMALLOC!\n"); dump_stack(); return -EIO; } /* * No need to force transaction in WB_SYNC_NONE mode. Also * ext4_sync_fs() will force the commit after everything is * written. */ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; err = __ext4_get_inode_loc_noinmem(inode, &iloc); if (err) return err; /* * sync(2) will flush the whole buffer cache. No need to do * it here separately for each inode. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, "IO error syncing inode"); err = -EIO; } brelse(iloc.bh); } return err; } /* * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate * buffers that are attached to a folio straddling i_size and are undergoing * commit. In that case we have to wait for commit to finish and try again. */ static void ext4_wait_for_tail_page_commit(struct inode *inode) { unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = 0; int ret; offset = inode->i_size & (PAGE_SIZE - 1); /* * If the folio is fully truncated, we don't need to wait for any commit * (and we even should not as __ext4_journalled_invalidate_folio() may * strip all buffers from the folio but keep the folio dirty which can then * confuse e.g. concurrent ext4_writepages() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. */ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); if (IS_ERR(folio)) return; ret = __ext4_journalled_invalidate_folio(folio, offset, folio_size(folio) - offset); folio_unlock(folio); folio_put(folio); if (ret != -EBUSY) return; commit_tid = 0; read_lock(&journal->j_state_lock); if (journal->j_committing_transaction) commit_tid = journal->j_committing_transaction->t_tid; read_unlock(&journal->j_state_lock); if (commit_tid) jbd2_log_wait_commit(journal, commit_tid); } } /* * ext4_setattr() * * Called from notify_change. * * We want to trap VFS attempts to truncate the file as soon as * possible. In particular, we want to make sure that when the VFS * shrinks i_size, we put the inode on the orphan list and modify * i_disksize immediately, so that during the subsequent flushing of * dirty pages and freeing of disk blocks, we can guarantee that any * commit will leave the blocks being flushed in an unused state on * disk. (On recovery, the inode will get truncated and the blocks will * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * * Another thing we have to assure is that if we are in ordered mode * and inode is still attached to the committing transaction, we must * we start writeout of all the dirty pages which are being truncated. * This way we are sure that all the data written in the previous * transaction are already on disk (truncate waits for pages under * writeback). * * Called with inode->i_rwsem down. */ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; if (unlikely(IS_APPEND(inode) && (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; error = setattr_prepare(idmap, dentry, attr); if (error) return error; error = fscrypt_prepare_setattr(dentry, attr); if (error) return error; error = fsverity_prepare_setattr(dentry, attr); if (error) return error; if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } /* dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); error = dquot_transfer(idmap, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { ext4_journal_stop(handle); return error; } /* Update corresponding info in inode so that everything is in * one transaction */ i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { return error; } } if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; loff_t oldsize = inode->i_size; loff_t old_disksize; int shrink = (attr->ia_size < inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (attr->ia_size > sbi->s_bitmap_maxbytes) { return -EFBIG; } } if (!S_ISREG(inode->i_mode)) { return -EINVAL; } if (attr->ia_size == inode->i_size) inc_ivers = false; if (shrink) { if (ext4_should_order_data(inode)) { error = ext4_begin_ordered_truncate(inode, attr->ia_size); if (error) goto err_out; } /* * Blocks are going to be removed from the inode. Wait * for dio in flight. */ inode_dio_wait(inode); } filemap_invalidate_lock(inode->i_mapping); rc = ext4_break_layouts(inode); if (rc) { filemap_invalidate_unlock(inode->i_mapping); goto err_out; } if (attr->ia_size != inode->i_size) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_mmap_sem; } if (ext4_handle_valid(handle) && shrink) { error = ext4_orphan_add(handle, inode); orphan = 1; } /* * Update c/mtime on truncate up, ext4_truncate() will * update c/mtime in shrink case below */ if (!shrink) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (shrink) ext4_fc_track_range(handle, inode, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits, EXT_MAX_BLOCKS - 1); else ext4_fc_track_range( handle, inode, (oldsize > 0 ? oldsize - 1 : oldsize) >> inode->i_sb->s_blocksize_bits, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits); down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); if (!error) error = rc; /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code * running ext4_wb_update_i_disksize(). */ if (!error) i_size_write(inode, attr->ia_size); else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error) goto out_mmap_sem; if (!shrink) { pagecache_isize_extended(inode, oldsize, inode->i_size); } else if (ext4_should_journal_data(inode)) { ext4_wait_for_tail_page_commit(inode); } } /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); /* * Call ext4_truncate() even if i_size didn't change to * truncate possible preallocated blocks. */ if (attr->ia_size <= oldsize) { rc = ext4_truncate(inode); if (rc) error = rc; } out_mmap_sem: filemap_invalidate_unlock(inode->i_mapping); } if (!error) { if (inc_ivers) inode_inc_iversion(inode); setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); } /* * If the call to ext4_truncate failed to get a transaction handle at * all, we need to clean up the in-core orphan list manually. */ if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) rc = posix_acl_chmod(idmap, dentry, inode->i_mode); err_out: if (error) ext4_std_error(inode->i_sb, error); if (!error) error = rc; return error; } u32 ext4_dio_alignment(struct inode *inode) { if (fsverity_active(inode)) return 0; if (ext4_should_journal_data(inode)) return 0; if (ext4_has_inline_data(inode)) return 0; if (IS_ENCRYPTED(inode)) { if (!fscrypt_dio_supported(inode)) return 0; return i_blocksize(inode); } return 1; /* use the iomap defaults */ } int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ext4_inode *raw_inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; if ((request_mask & STATX_BTIME) && EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; } /* * Return the DIO alignment restrictions if requested. We only return * this information when requested, since on encrypted files it might * take a fair bit of work to get if the file wasn't opened recently. */ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { u32 dio_align = ext4_dio_alignment(inode); stat->result_mask |= STATX_DIOALIGN; if (dio_align == 1) { struct block_device *bdev = inode->i_sb->s_bdev; /* iomap defaults */ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; stat->dio_offset_align = bdev_logical_block_size(bdev); } else { stat->dio_mem_align = dio_align; stat->dio_offset_align = dio_align; } } flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (flags & EXT4_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; if (flags & EXT4_ENCRYPT_FL) stat->attributes |= STATX_ATTR_ENCRYPTED; if (flags & EXT4_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; if (flags & EXT4_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; if (flags & EXT4_VERITY_FL) stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP | STATX_ATTR_VERITY); generic_fillattr(idmap, request_mask, inode, stat); return 0; } int ext4_file_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); u64 delalloc_blocks; ext4_getattr(idmap, path, stat, request_mask, query_flags); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). * Report at least one sector for such files, so tools like tar, rsync, * others don't incorrectly think the file is completely sparse. */ if (unlikely(ext4_has_inline_data(inode))) stat->blocks += (stat->size + 511) >> 9; /* * We can't update i_blocks if the block allocation is delayed * otherwise in the case of system crash before the real block * allocation is done, we will have i_blocks inconsistent with * on-disk file blocks. * We always keep i_blocks updated together with real * allocation. But to not confuse with user, stat * will return the blocks that include the delayed allocation * blocks for this file. */ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), EXT4_I(inode)->i_reserved_data_blocks); stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); return 0; } static int ext4_index_trans_blocks(struct inode *inode, int lblocks, int pextents) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return ext4_ind_trans_blocks(inode, lblocks); return ext4_ext_index_trans_blocks(inode, pextents); } /* * Account for index blocks, block groups bitmaps and block group * descriptor blocks if modify datablocks and index blocks * worse case, the indexs blocks spread over different block groups * * If datablocks are discontiguous, they are possible to spread over * different block groups too. If they are contiguous, with flexbg, * they could still across block group boundary. * * Also account for superblock, inode, quota and xattr blocks */ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; int idxblocks; int ret; /* * How many index blocks need to touch to map @lblocks logical blocks * to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); ret = idxblocks; /* * Now let's see how many group bitmaps and group descriptors need * to account */ groups = idxblocks + pextents; gdpblocks = groups; if (groups > ngroups) groups = ngroups; if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ ret += groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); return ret; } /* * Calculate the total number of credits to reserve to fit * the modification of a single pages into a single transaction, * which may include multiple chunks of block allocations. * * This could be called via ext4_write_begin() * * We need to consider the worse case, when * one new block per extent. */ int ext4_writepage_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); int ret; ret = ext4_meta_trans_blocks(inode, bpp, bpp); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) ret += bpp; return ret; } /* * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever calling * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do no need to journal data buffers. */ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) { return ext4_meta_trans_blocks(inode, nrblocks, 1); } /* * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh. */ int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int err = 0; if (unlikely(ext4_forced_shutdown(inode->i_sb))) { put_bh(iloc->bh); return -EIO; } ext4_fc_track_inode(handle, inode); /* the do_update_inode consumes one bh->b_count */ get_bh(iloc->bh); /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ err = ext4_do_update_inode(handle, inode, iloc); put_bh(iloc->bh); return err; } /* * On success, We end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later. */ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int err; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; err = ext4_get_inode_loc(inode, iloc); if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (err) { brelse(iloc->bh); iloc->bh = NULL; } } ext4_std_error(inode->i_sb, err); return err; } static int __ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc, handle_t *handle, int *no_expand) { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int error; /* this was checked at iget time, but double check for good measure */ if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || (ei->i_extra_isize & 3)) { EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); return -EFSCORRUPTED; } if ((new_extra_isize < ei->i_extra_isize) || (new_extra_isize < 4) || (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) return -EINVAL; /* Should never happen */ raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize, 0, new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; return 0; } /* * We may need to allocate external xattr block so we need quotas * initialized. Here we can be called with various locks held so we * cannot affort to initialize quotas ourselves. So just bail. */ if (dquot_initialize_needed(inode)) return -EAGAIN; /* try to expand with EAs present */ error = ext4_expand_extra_isize_ea(inode, new_extra_isize, raw_inode, handle); if (error) { /* * Inode size expansion failed; don't try again */ *no_expand = 1; } return error; } /* * Expand an inode by new_extra_isize bytes. * Returns 0 on success or negative error number on failure. */ static int ext4_try_to_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc iloc, handle_t *handle) { int no_expand; int error; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) return -EOVERFLOW; /* * In nojournal mode, we can immediately attempt to expand * the inode. When journaled, we first need to obtain extra * buffer credits since we may write into the EA block * with this same handle. If journal_extend fails, then it will * only result in a minor loss of functionality for that inode. * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. */ if (ext4_journal_extend(handle, EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) return -EBUSY; error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, handle, &no_expand); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc) { handle_t *handle; int no_expand; int error, rc; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { brelse(iloc->bh); return -EOVERFLOW; } handle = ext4_journal_start(inode, EXT4_HT_INODE, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { error = PTR_ERR(handle); brelse(iloc->bh); return error; } ext4_write_lock_xattr(inode, &no_expand); BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (error) { brelse(iloc->bh); goto out_unlock; } error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, handle, &no_expand); rc = ext4_mark_iloc_dirty(handle, inode, iloc); if (!error) error = rc; out_unlock: ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); return error; } /* * What we do here is to mark the in-core inode as clean with respect to inode * dirtiness (it may still be data-dirty). * This means that the in-core inode may be reaped by prune_icache * without having to perform any I/O. This is a very good thing, * because *any* task may call prune_icache - even ones which * have a transaction open against a different journal. * * Is this cheating? Not really. Sure, we haven't written the * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. */ int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, const char *func, unsigned int line) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err; might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); err = ext4_mark_iloc_dirty(handle, inode, &iloc); out: if (unlikely(err)) ext4_error_inode_err(inode, func, line, 0, err, "mark_inode_dirty error"); return err; } /* * ext4_dirty_inode() is called from __mark_inode_dirty() * * We're really interested in the case where a file is being extended. * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return; ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } int ext4_change_inode_journal_flag(struct inode *inode, int val) { journal_t *journal; handle_t *handle; int err; int alloc_ctx; /* * We have to be very careful here: changing a data block's * journaling status dynamically is dangerous. If we write a * data block to the journal, change the status and then delete * that block, we risk forgetting to revoke the old log record * from the journal and so a subsequent replay can corrupt data. * So, first we make sure that the journal is empty and that * nobody is changing anything. */ journal = EXT4_JOURNAL(inode); if (!journal) return 0; if (is_journal_aborted(journal)) return -EROFS; /* Wait for all existing dio workers */ inode_dio_wait(inode); /* * Before flushing the journal and switching inode's aops, we have * to flush all dirty data the inode has. There can be outstanding * delayed allocations, there can be unwritten extents created by * fallocate or buffered writes in dioread_nolock mode covered by * dirty data which can be converted only after flushing the dirty * data (and journalled aops don't know how to handle these cases). */ if (val) { filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { filemap_invalidate_unlock(inode->i_mapping); return err; } } alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal); /* * OK, there are no updates running now, and all cached data is * synced to disk. We are now in a completely consistent state * which doesn't have anything in the journal, and we know that * no filesystem updates are running, so it is safe to modify * the inode's in-core data-journaling state flag now. */ if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); else { err = jbd2_journal_flush(journal, 0); if (err < 0) { jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); if (val) filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); ext4_std_error(inode->i_sb, err); return err; } static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, struct buffer_head *bh) { return !buffer_mapped(bh); } vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = page_folio(vmf->page); loff_t size; unsigned long len; int err; vm_fault_t ret; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; handle_t *handle; get_block_t *get_block; int retries = 0; if (unlikely(IS_IMMUTABLE(inode))) return VM_FAULT_SIGBUS; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); filemap_invalidate_lock_shared(mapping); err = ext4_convert_inline_data(inode); if (err) goto out_ret; /* * On data journalling we skip straight to the transaction handle: * there's no delalloc; page truncated will be checked later; the * early return w/ all buffers mapped (calculates size/len) can't * be used; and there's no dioread_nolock, so only ext4_get_block. */ if (ext4_should_journal_data(inode)) goto retry_alloc; /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb)) { do { err = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); } while (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); goto out_ret; } folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ if (folio->mapping != mapping || folio_pos(folio) > size) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } len = folio_size(folio); if (folio_pos(folio) + len > size) len = size - folio_pos(folio); /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time * * This cannot be done for data journalling, as we have to add the * inode to the transaction's list to writeprotect pages on commit. */ if (folio_buffers(folio)) { if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ folio_wait_stable(folio); ret = VM_FAULT_LOCKED; goto out; } } folio_unlock(folio); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; else get_block = ext4_get_block; retry_alloc: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = VM_FAULT_SIGBUS; goto out; } /* * Data journalling can't use block_page_mkwrite() because it * will set_buffer_dirty() before do_journal_get_write_access() * thus might hit warning messages for dirty metadata buffers. */ if (!ext4_should_journal_data(inode)) { err = block_page_mkwrite(vma, vmf, get_block); } else { folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ if (folio->mapping != mapping || folio_pos(folio) > size) { ret = VM_FAULT_NOPAGE; goto out_error; } len = folio_size(folio); if (folio_pos(folio) + len > size) len = size - folio_pos(folio); err = __block_write_begin(&folio->page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_journal_folio_buffers(handle, folio, len)) goto out_error; } else { folio_unlock(folio); } } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: ret = vmf_fs_error(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; out_error: folio_unlock(folio); ext4_journal_stop(handle); goto out; } |
57 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 | // SPDX-License-Identifier: GPL-2.0-or-later /* * zswap.c - zswap driver file * * zswap is a cache that takes pages that are in the process * of being swapped out and attempts to compress and store them in a * RAM-based memory pool. This can result in a significant I/O reduction on * the swap device and, in the case where decompressing from RAM is faster * than reading from the swap device, can also improve workload performance. * * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/cpu.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/rbtree.h> #include <linux/swap.h> #include <linux/crypto.h> #include <linux/scatterlist.h> #include <linux/mempolicy.h> #include <linux/mempool.h> #include <linux/zpool.h> #include <crypto/acompress.h> #include <linux/zswap.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/swapops.h> #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/workqueue.h> #include <linux/list_lru.h> #include "swap.h" #include "internal.h" /********************************* * statistics **********************************/ /* Total bytes used by the compressed storage */ u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ atomic_t zswap_stored_pages = ATOMIC_INIT(0); /* The number of same-value filled pages currently stored in zswap */ static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for * performance reasons so they may not be a 100% accurate. However, * they do provide useful information on roughly how many times a * certain event is occurring. */ /* Pool limit was hit (see zswap_max_pool_percent) */ static u64 zswap_pool_limit_hit; /* Pages written back when pool limit was reached */ static u64 zswap_written_back_pages; /* Store failed due to a reclaim failure after pool limit was reached */ static u64 zswap_reject_reclaim_fail; /* Store failed due to compression algorithm failure */ static u64 zswap_reject_compress_fail; /* Compressed page was too big for the allocator to (optimally) store */ static u64 zswap_reject_compress_poor; /* Store failed because underlying allocator could not get memory */ static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ static u64 zswap_reject_kmemcache_fail; /* Duplicate store was encountered (rare) */ static u64 zswap_duplicate_entry; /* Shrinker work queue */ static struct workqueue_struct *shrink_wq; /* Pool limit was hit, we need to calm down */ static bool zswap_pool_reached_full; /********************************* * tunables **********************************/ #define ZSWAP_PARAM_UNSET "" static int zswap_setup(void); /* Enable/disable zswap */ static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON); static int zswap_enabled_param_set(const char *, const struct kernel_param *); static const struct kernel_param_ops zswap_enabled_param_ops = { .set = zswap_enabled_param_set, .get = param_get_bool, }; module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); /* Crypto compressor to use */ static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; static int zswap_compressor_param_set(const char *, const struct kernel_param *); static const struct kernel_param_ops zswap_compressor_param_ops = { .set = zswap_compressor_param_set, .get = param_get_charp, .free = param_free_charp, }; module_param_cb(compressor, &zswap_compressor_param_ops, &zswap_compressor, 0644); /* Compressed storage zpool to use */ static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; static int zswap_zpool_param_set(const char *, const struct kernel_param *); static const struct kernel_param_ops zswap_zpool_param_ops = { .set = zswap_zpool_param_set, .get = param_get_charp, .free = param_free_charp, }; module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); /* The maximum percentage of memory that the compressed pool can occupy */ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); /* The threshold for accepting new pages after the max_pool_percent was hit */ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); /* * Enable/disable handling same-value filled pages (enabled by default). * If disabled every page is considered non-same-value filled. */ static bool zswap_same_filled_pages_enabled = true; module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, bool, 0644); /* Enable/disable handling non-same-value filled pages (enabled by default) */ static bool zswap_non_same_filled_pages_enabled = true; module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, bool, 0644); static bool zswap_exclusive_loads_enabled = IS_ENABLED( CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 /* Enable/disable memory pressure-based shrinker. */ static bool zswap_shrinker_enabled = IS_ENABLED( CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); bool is_zswap_enabled(void) { return zswap_enabled; } /********************************* * data structures **********************************/ struct crypto_acomp_ctx { struct crypto_acomp *acomp; struct acomp_req *req; struct crypto_wait wait; u8 *buffer; struct mutex mutex; }; /* * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock. * The only case where lru_lock is not acquired while holding tree.lock is * when a zswap_entry is taken off the lru for writeback, in that case it * needs to be verified that it's still valid in the tree. */ struct zswap_pool { struct zpool *zpools[ZSWAP_NR_ZPOOLS]; struct crypto_acomp_ctx __percpu *acomp_ctx; struct kref kref; struct list_head list; struct work_struct release_work; struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; struct list_lru list_lru; struct mem_cgroup *next_shrink; struct shrinker *shrinker; atomic_t nr_stored; }; /* * struct zswap_entry * * This structure contains the metadata for tracking a single compressed * page within zswap. * * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree * refcount - the number of outstanding reference to the entry. This is needed * to protect against premature freeing of the entry by code * concurrent calls to load, invalidate, and writeback. The lock * for the zswap_tree structure that contains the entry must * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both * pool and lru are invalid and must be ignored. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data * value - value of the same-value filled pages which have same content * objcg - the obj_cgroup that the compressed memory is charged to * lru - handle to the pool's lru used to evict pages. */ struct zswap_entry { struct rb_node rbnode; swp_entry_t swpentry; int refcount; unsigned int length; struct zswap_pool *pool; union { unsigned long handle; unsigned long value; }; struct obj_cgroup *objcg; struct list_head lru; }; /* * The tree lock in the zswap_tree struct protects a few things: * - the rbtree * - the refcount field of each entry in the tree */ struct zswap_tree { struct rb_root rbroot; spinlock_t lock; }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ static LIST_HEAD(zswap_pools); /* protects zswap_pools list modification */ static DEFINE_SPINLOCK(zswap_pools_lock); /* pool counter to provide unique names to zpool */ static atomic_t zswap_pools_count = ATOMIC_INIT(0); enum zswap_init_type { ZSWAP_UNINIT, ZSWAP_INIT_SUCCEED, ZSWAP_INIT_FAILED }; static enum zswap_init_type zswap_init_state; /* used to ensure the integrity of initialization */ static DEFINE_MUTEX(zswap_init_lock); /* init completed, but couldn't create the initial pool */ static bool zswap_has_pool; /********************************* * helpers and fwd declarations **********************************/ #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_tree *tree); static int zswap_pool_get(struct zswap_pool *pool); static void zswap_pool_put(struct zswap_pool *pool); static bool zswap_is_full(void) { return totalram_pages() * zswap_max_pool_percent / 100 < DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } static bool zswap_can_accept(void) { return totalram_pages() * zswap_accept_thr_percent / 100 * zswap_max_pool_percent / 100 > DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } static u64 get_zswap_pool_size(struct zswap_pool *pool) { u64 pool_size = 0; int i; for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) pool_size += zpool_get_total_size(pool->zpools[i]); return pool_size; } static void zswap_update_total_size(void) { struct zswap_pool *pool; u64 total = 0; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) total += get_zswap_pool_size(pool); rcu_read_unlock(); zswap_pool_total_size = total; } /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) { return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL; } #else static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) { return NULL; } #endif static inline int entry_to_nid(struct zswap_entry *entry) { return page_to_nid(virt_to_page(entry)); } void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { struct zswap_pool *pool; /* lock out zswap pools list modification */ spin_lock(&zswap_pools_lock); list_for_each_entry(pool, &zswap_pools, list) { if (pool->next_shrink == memcg) pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); } spin_unlock(&zswap_pools_lock); } /********************************* * zswap entry functions **********************************/ static struct kmem_cache *zswap_entry_cache; static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) { struct zswap_entry *entry; entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; entry->refcount = 1; RB_CLEAR_NODE(&entry->rbnode); return entry; } static void zswap_entry_cache_free(struct zswap_entry *entry) { kmem_cache_free(zswap_entry_cache, entry); } /********************************* * zswap lruvec functions **********************************/ void zswap_lruvec_state_init(struct lruvec *lruvec) { atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); } void zswap_folio_swapin(struct folio *folio) { struct lruvec *lruvec; if (folio) { lruvec = folio_lruvec(folio); atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); } } /********************************* * lru functions **********************************/ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { atomic_long_t *nr_zswap_protected; unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; struct lruvec *lruvec; /* * Note that it is safe to use rcu_read_lock() here, even in the face of * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection * used in list_lru lookup, only two scenarios are possible: * * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The * new entry will be reparented to memcg's parent's list_lru. * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The * new entry will be added directly to memcg's parent's list_lru. * * Similar reasoning holds for list_lru_del() and list_lru_putback(). */ rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); /* Update the protection area */ lru_size = list_lru_count_one(list_lru, nid, memcg); lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; old = atomic_long_inc_return(nr_zswap_protected); /* * Decay to avoid overflow and adapt to changing workloads. * This is based on LRU reclaim cost decaying heuristics. */ do { new = old > lru_size / 4 ? old / 2 : old; } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) { int nid = entry_to_nid(entry); struct mem_cgroup *memcg; rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_del(list_lru, &entry->lru, nid, memcg); rcu_read_unlock(); } static void zswap_lru_putback(struct list_lru *list_lru, struct zswap_entry *entry) { int nid = entry_to_nid(entry); spinlock_t *lock = &list_lru->node[nid].lock; struct mem_cgroup *memcg; struct lruvec *lruvec; rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); spin_lock(lock); /* we cannot use list_lru_add here, because it increments node's lru count */ list_lru_putback(list_lru, &entry->lru, nid, memcg); spin_unlock(lock); lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); /* increment the protection area to account for the LRU rotation. */ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); rcu_read_unlock(); } /********************************* * rbtree functions **********************************/ static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) { struct rb_node *node = root->rb_node; struct zswap_entry *entry; pgoff_t entry_offset; while (node) { entry = rb_entry(node, struct zswap_entry, rbnode); entry_offset = swp_offset(entry->swpentry); if (entry_offset > offset) node = node->rb_left; else if (entry_offset < offset) node = node->rb_right; else return entry; } return NULL; } /* * In the case that a entry with the same offset is found, a pointer to * the existing entry is stored in dupentry and the function returns -EEXIST */ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, struct zswap_entry **dupentry) { struct rb_node **link = &root->rb_node, *parent = NULL; struct zswap_entry *myentry; pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); while (*link) { parent = *link; myentry = rb_entry(parent, struct zswap_entry, rbnode); myentry_offset = swp_offset(myentry->swpentry); if (myentry_offset > entry_offset) link = &(*link)->rb_left; else if (myentry_offset < entry_offset) link = &(*link)->rb_right; else { *dupentry = myentry; return -EEXIST; } } rb_link_node(&entry->rbnode, parent, link); rb_insert_color(&entry->rbnode, root); return 0; } static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) { if (!RB_EMPTY_NODE(&entry->rbnode)) { rb_erase(&entry->rbnode, root); RB_CLEAR_NODE(&entry->rbnode); return true; } return false; } static struct zpool *zswap_find_zpool(struct zswap_entry *entry) { int i = 0; if (ZSWAP_NR_ZPOOLS > 1) i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); return entry->pool->zpools[i]; } /* * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ static void zswap_free_entry(struct zswap_entry *entry) { if (entry->objcg) { obj_cgroup_uncharge_zswap(entry->objcg, entry->length); obj_cgroup_put(entry->objcg); } if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { zswap_lru_del(&entry->pool->list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); atomic_dec(&entry->pool->nr_stored); zswap_pool_put(entry->pool); } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); zswap_update_total_size(); } /* caller must hold the tree lock */ static void zswap_entry_get(struct zswap_entry *entry) { entry->refcount++; } /* caller must hold the tree lock * remove from the tree and free it, if nobody reference the entry */ static void zswap_entry_put(struct zswap_tree *tree, struct zswap_entry *entry) { int refcount = --entry->refcount; WARN_ON_ONCE(refcount < 0); if (refcount == 0) { WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); zswap_free_entry(entry); } } /* caller must hold the tree lock */ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, pgoff_t offset) { struct zswap_entry *entry; entry = zswap_rb_search(root, offset); if (entry) zswap_entry_get(entry); return entry; } /********************************* * shrinker functions **********************************/ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg); static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); unsigned long shrink_ret, nr_protected, lru_size; struct zswap_pool *pool = shrinker->private_data; bool encountered_page_in_swapcache = false; if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { sc->nr_scanned = 0; return SHRINK_STOP; } nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); lru_size = list_lru_shrink_count(&pool->list_lru, sc); /* * Abort if we are shrinking into the protected region. * * This short-circuiting is necessary because if we have too many multiple * concurrent reclaimers getting the freeable zswap object counts at the * same time (before any of them made reasonable progress), the total * number of reclaimed objects might be more than the number of unprotected * objects (i.e the reclaimers will reclaim into the protected area of the * zswap LRU). */ if (nr_protected >= lru_size - sc->nr_to_scan) { sc->nr_scanned = 0; return SHRINK_STOP; } shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); if (encountered_page_in_swapcache) return SHRINK_STOP; return shrink_ret ? shrink_ret : SHRINK_STOP; } static unsigned long zswap_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { struct zswap_pool *pool = shrinker->private_data; struct mem_cgroup *memcg = sc->memcg; struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; #ifdef CONFIG_MEMCG_KMEM mem_cgroup_flush_stats(memcg); nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); #else /* use pool stats instead of memcg stats */ nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; nr_stored = atomic_read(&pool->nr_stored); #endif if (!nr_stored) return 0; nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); /* * Subtract the lru size by an estimate of the number of pages * that should be protected. */ nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; /* * Scale the number of freeable pages by the memory saving factor. * This ensures that the better zswap compresses memory, the fewer * pages we will evict to swap (as it will otherwise incur IO for * relatively small memory saving). */ return mult_frac(nr_freeable, nr_backing, nr_stored); } static void zswap_alloc_shrinker(struct zswap_pool *pool) { pool->shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); if (!pool->shrinker) return; pool->shrinker->private_data = pool; pool->shrinker->scan_objects = zswap_shrinker_scan; pool->shrinker->count_objects = zswap_shrinker_count; pool->shrinker->batch = 0; pool->shrinker->seeks = DEFAULT_SEEKS; } /********************************* * per-cpu code **********************************/ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); struct crypto_acomp *acomp; struct acomp_req *req; int ret; mutex_init(&acomp_ctx->mutex); acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!acomp_ctx->buffer) return -ENOMEM; acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); if (IS_ERR(acomp)) { pr_err("could not alloc crypto acomp %s : %ld\n", pool->tfm_name, PTR_ERR(acomp)); ret = PTR_ERR(acomp); goto acomp_fail; } acomp_ctx->acomp = acomp; req = acomp_request_alloc(acomp_ctx->acomp); if (!req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); ret = -ENOMEM; goto req_fail; } acomp_ctx->req = req; crypto_init_wait(&acomp_ctx->wait); /* * if the backend of acomp is async zip, crypto_req_done() will wakeup * crypto_wait_req(); if the backend of acomp is scomp, the callback * won't be called, crypto_wait_req() will return without blocking. */ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); return 0; req_fail: crypto_free_acomp(acomp_ctx->acomp); acomp_fail: kfree(acomp_ctx->buffer); return ret; } static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); if (!IS_ERR_OR_NULL(acomp_ctx)) { if (!IS_ERR_OR_NULL(acomp_ctx->req)) acomp_request_free(acomp_ctx->req); if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) crypto_free_acomp(acomp_ctx->acomp); kfree(acomp_ctx->buffer); } return 0; } /********************************* * pool functions **********************************/ static struct zswap_pool *__zswap_pool_current(void) { struct zswap_pool *pool; pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); WARN_ONCE(!pool && zswap_has_pool, "%s: no page storage pool!\n", __func__); return pool; } static struct zswap_pool *zswap_pool_current(void) { assert_spin_locked(&zswap_pools_lock); return __zswap_pool_current(); } static struct zswap_pool *zswap_pool_current_get(void) { struct zswap_pool *pool; rcu_read_lock(); pool = __zswap_pool_current(); if (!zswap_pool_get(pool)) pool = NULL; rcu_read_unlock(); return pool; } static struct zswap_pool *zswap_pool_last_get(void) { struct zswap_pool *pool, *last = NULL; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) last = pool; WARN_ONCE(!last && zswap_has_pool, "%s: no page storage pool!\n", __func__); if (!zswap_pool_get(last)) last = NULL; rcu_read_unlock(); return last; } /* type and compressor must be null-terminated */ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) { struct zswap_pool *pool; assert_spin_locked(&zswap_pools_lock); list_for_each_entry_rcu(pool, &zswap_pools, list) { if (strcmp(pool->tfm_name, compressor)) continue; /* all zpools share the same type */ if (strcmp(zpool_get_type(pool->zpools[0]), type)) continue; /* if we can't get it, it's about to be destroyed */ if (!zswap_pool_get(pool)) continue; return pool; } return NULL; } /* * If the entry is still valid in the tree, drop the initial ref and remove it * from the tree. This function must be called with an additional ref held, * otherwise it may race with another invalidation freeing the entry. */ static void zswap_invalidate_entry(struct zswap_tree *tree, struct zswap_entry *entry) { if (zswap_rb_erase(&tree->rbroot, entry)) zswap_entry_put(tree, entry); } static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); bool *encountered_page_in_swapcache = (bool *)arg; struct zswap_tree *tree; pgoff_t swpoffset; enum lru_status ret = LRU_REMOVED_RETRY; int writeback_result; /* * Once the lru lock is dropped, the entry might get freed. The * swpoffset is copied to the stack, and entry isn't deref'd again * until the entry is verified to still be alive in the tree. */ swpoffset = swp_offset(entry->swpentry); tree = zswap_trees[swp_type(entry->swpentry)]; list_lru_isolate(l, item); /* * It's safe to drop the lock here because we return either * LRU_REMOVED_RETRY or LRU_RETRY. */ spin_unlock(lock); /* Check for invalidate() race */ spin_lock(&tree->lock); if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) goto unlock; /* Hold a reference to prevent a free during writeback */ zswap_entry_get(entry); spin_unlock(&tree->lock); writeback_result = zswap_writeback_entry(entry, tree); spin_lock(&tree->lock); if (writeback_result) { zswap_reject_reclaim_fail++; zswap_lru_putback(&entry->pool->list_lru, entry); ret = LRU_RETRY; /* * Encountering a page already in swap cache is a sign that we are shrinking * into the warmer region. We should terminate shrinking (if we're in the dynamic * shrinker context). */ if (writeback_result == -EEXIST && encountered_page_in_swapcache) { ret = LRU_SKIP; *encountered_page_in_swapcache = true; } goto put_unlock; } zswap_written_back_pages++; if (entry->objcg) count_objcg_event(entry->objcg, ZSWPWB); count_vm_event(ZSWPWB); /* * Writeback started successfully, the page now belongs to the * swapcache. Drop the entry from zswap - unless invalidate already * took it out while we had the tree->lock released for IO. */ zswap_invalidate_entry(tree, entry); put_unlock: /* Drop local reference */ zswap_entry_put(tree, entry); unlock: spin_unlock(&tree->lock); spin_lock(lock); return ret; } static int shrink_memcg(struct mem_cgroup *memcg) { struct zswap_pool *pool; int nid, shrunk = 0; if (!mem_cgroup_zswap_writeback_enabled(memcg)) return -EINVAL; /* * Skip zombies because their LRUs are reparented and we would be * reclaiming from the parent instead of the dead memcg. */ if (memcg && !mem_cgroup_online(memcg)) return -ENOENT; pool = zswap_pool_current_get(); if (!pool) return -EINVAL; for_each_node_state(nid, N_NORMAL_MEMORY) { unsigned long nr_to_walk = 1; shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); } zswap_pool_put(pool); return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { struct zswap_pool *pool = container_of(w, typeof(*pool), shrink_work); struct mem_cgroup *memcg; int ret, failures = 0; /* global reclaim will select cgroup in a round-robin fashion. */ do { spin_lock(&zswap_pools_lock); pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); memcg = pool->next_shrink; /* * We need to retry if we have gone through a full round trip, or if we * got an offline memcg (or else we risk undoing the effect of the * zswap memcg offlining cleanup callback). This is not catastrophic * per se, but it will keep the now offlined memcg hostage for a while. * * Note that if we got an online memcg, we will keep the extra * reference in case the original reference obtained by mem_cgroup_iter * is dropped by the zswap memcg offlining callback, ensuring that the * memcg is not killed when we are reclaiming. */ if (!memcg) { spin_unlock(&zswap_pools_lock); if (++failures == MAX_RECLAIM_RETRIES) break; goto resched; } if (!mem_cgroup_tryget_online(memcg)) { /* drop the reference from mem_cgroup_iter() */ mem_cgroup_iter_break(NULL, memcg); pool->next_shrink = NULL; spin_unlock(&zswap_pools_lock); if (++failures == MAX_RECLAIM_RETRIES) break; goto resched; } spin_unlock(&zswap_pools_lock); ret = shrink_memcg(memcg); /* drop the extra reference */ mem_cgroup_put(memcg); if (ret == -EINVAL) break; if (ret && ++failures == MAX_RECLAIM_RETRIES) break; resched: cond_resched(); } while (!zswap_can_accept()); zswap_pool_put(pool); } static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { int i; struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; int ret; if (!zswap_has_pool) { /* if either are unset, pool initialization failed, and we * need both params to be set correctly before trying to * create a pool. */ if (!strcmp(type, ZSWAP_PARAM_UNSET)) return NULL; if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) return NULL; } pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return NULL; for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { /* unique name for each pool specifically required by zsmalloc */ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); pool->zpools[i] = zpool_create_pool(type, name, gfp); if (!pool->zpools[i]) { pr_err("%s zpool not available\n", type); goto error; } } pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); if (!pool->acomp_ctx) { pr_err("percpu alloc failed\n"); goto error; } ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); if (ret) goto error; zswap_alloc_shrinker(pool); if (!pool->shrinker) goto error; pr_debug("using %s compressor\n", pool->tfm_name); /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) goto lru_fail; shrinker_register(pool->shrinker); INIT_WORK(&pool->shrink_work, shrink_worker); atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; lru_fail: list_lru_destroy(&pool->list_lru); shrinker_free(pool->shrinker); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); while (i--) zpool_destroy_pool(pool->zpools[i]); kfree(pool); return NULL; } static struct zswap_pool *__zswap_pool_create_fallback(void) { bool has_comp, has_zpool; has_comp = crypto_has_acomp(zswap_compressor, 0, 0); if (!has_comp && strcmp(zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { pr_err("compressor %s not available, using default %s\n", zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); param_free_charp(&zswap_compressor); zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; has_comp = crypto_has_acomp(zswap_compressor, 0, 0); } if (!has_comp) { pr_err("default compressor %s not available\n", zswap_compressor); param_free_charp(&zswap_compressor); zswap_compressor = ZSWAP_PARAM_UNSET; } has_zpool = zpool_has_pool(zswap_zpool_type); if (!has_zpool && strcmp(zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT)) { pr_err("zpool %s not available, using default %s\n", zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); param_free_charp(&zswap_zpool_type); zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; has_zpool = zpool_has_pool(zswap_zpool_type); } if (!has_zpool) { pr_err("default zpool %s not available\n", zswap_zpool_type); param_free_charp(&zswap_zpool_type); zswap_zpool_type = ZSWAP_PARAM_UNSET; } if (!has_comp || !has_zpool) return NULL; return zswap_pool_create(zswap_zpool_type, zswap_compressor); } static void zswap_pool_destroy(struct zswap_pool *pool) { int i; zswap_pool_debug("destroying", pool); shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); list_lru_destroy(&pool->list_lru); spin_lock(&zswap_pools_lock); mem_cgroup_iter_break(NULL, pool->next_shrink); pool->next_shrink = NULL; spin_unlock(&zswap_pools_lock); for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) zpool_destroy_pool(pool->zpools[i]); kfree(pool); } static int __must_check zswap_pool_get(struct zswap_pool *pool) { if (!pool) return 0; return kref_get_unless_zero(&pool->kref); } static void __zswap_pool_release(struct work_struct *work) { struct zswap_pool *pool = container_of(work, typeof(*pool), release_work); synchronize_rcu(); /* nobody should have been able to get a kref... */ WARN_ON(kref_get_unless_zero(&pool->kref)); /* pool is now off zswap_pools list and has no references. */ zswap_pool_destroy(pool); } static void __zswap_pool_empty(struct kref *kref) { struct zswap_pool *pool; pool = container_of(kref, typeof(*pool), kref); spin_lock(&zswap_pools_lock); WARN_ON(pool == zswap_pool_current()); list_del_rcu(&pool->list); INIT_WORK(&pool->release_work, __zswap_pool_release); schedule_work(&pool->release_work); spin_unlock(&zswap_pools_lock); } static void zswap_pool_put(struct zswap_pool *pool) { kref_put(&pool->kref, __zswap_pool_empty); } /********************************* * param callbacks **********************************/ static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) { /* no change required */ if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) return false; return true; } /* val must be a null-terminated string */ static int __zswap_param_set(const char *val, const struct kernel_param *kp, char *type, char *compressor) { struct zswap_pool *pool, *put_pool = NULL; char *s = strstrip((char *)val); int ret = 0; bool new_pool = false; mutex_lock(&zswap_init_lock); switch (zswap_init_state) { case ZSWAP_UNINIT: /* if this is load-time (pre-init) param setting, * don't create a pool; that's done during init. */ ret = param_set_charp(s, kp); break; case ZSWAP_INIT_SUCCEED: new_pool = zswap_pool_changed(s, kp); break; case ZSWAP_INIT_FAILED: pr_err("can't set param, initialization failed\n"); ret = -ENODEV; } mutex_unlock(&zswap_init_lock); /* no need to create a new pool, return directly */ if (!new_pool) return ret; if (!type) { if (!zpool_has_pool(s)) { pr_err("zpool %s not available\n", s); return -ENOENT; } type = s; } else if (!compressor) { if (!crypto_has_acomp(s, 0, 0)) { pr_err("compressor %s not available\n", s); return -ENOENT; } compressor = s; } else { WARN_ON(1); return -EINVAL; } spin_lock(&zswap_pools_lock); pool = zswap_pool_find_get(type, compressor); if (pool) { zswap_pool_debug("using existing", pool); WARN_ON(pool == zswap_pool_current()); list_del_rcu(&pool->list); } spin_unlock(&zswap_pools_lock); if (!pool) pool = zswap_pool_create(type, compressor); if (pool) ret = param_set_charp(s, kp); else ret = -EINVAL; spin_lock(&zswap_pools_lock); if (!ret) { put_pool = zswap_pool_current(); list_add_rcu(&pool->list, &zswap_pools); zswap_has_pool = true; } else if (pool) { /* add the possibly pre-existing pool to the end of the pools * list; if it's new (and empty) then it'll be removed and * destroyed by the put after we drop the lock */ list_add_tail_rcu(&pool->list, &zswap_pools); put_pool = pool; } spin_unlock(&zswap_pools_lock); if (!zswap_has_pool && !pool) { /* if initial pool creation failed, and this pool creation also * failed, maybe both compressor and zpool params were bad. * Allow changing this param, so pool creation will succeed * when the other param is changed. We already verified this * param is ok in the zpool_has_pool() or crypto_has_acomp() * checks above. */ ret = param_set_charp(s, kp); } /* drop the ref from either the old current pool, * or the new pool we failed to add */ if (put_pool) zswap_pool_put(put_pool); return ret; } static int zswap_compressor_param_set(const char *val, const struct kernel_param *kp) { return __zswap_param_set(val, kp, zswap_zpool_type, NULL); } static int zswap_zpool_param_set(const char *val, const struct kernel_param *kp) { return __zswap_param_set(val, kp, NULL, zswap_compressor); } static int zswap_enabled_param_set(const char *val, const struct kernel_param *kp) { int ret = -ENODEV; /* if this is load-time (pre-init) param setting, only set param. */ if (system_state != SYSTEM_RUNNING) return param_set_bool(val, kp); mutex_lock(&zswap_init_lock); switch (zswap_init_state) { case ZSWAP_UNINIT: if (zswap_setup()) break; fallthrough; case ZSWAP_INIT_SUCCEED: if (!zswap_has_pool) pr_err("can't enable, no pool configured\n"); else ret = param_set_bool(val, kp); break; case ZSWAP_INIT_FAILED: pr_err("can't enable, initialization failed\n"); } mutex_unlock(&zswap_init_lock); return ret; } static void __zswap_load(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; u8 *src; acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); if (!zpool_can_sleep_mapped(zpool)) { memcpy(acomp_ctx->buffer, src, entry->length); src = acomp_ctx->buffer; zpool_unmap_handle(zpool, entry->handle); } sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); sg_set_page(&output, page, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); mutex_unlock(&acomp_ctx->mutex); if (zpool_can_sleep_mapped(zpool)) zpool_unmap_handle(zpool, entry->handle); } /********************************* * writeback code **********************************/ /* * Attempts to free an entry by adding a folio to the swap cache, * decompressing the entry data into the folio, and issuing a * bio write to write the folio back to the swap device. * * This can be thought of as a "resumed writeback" of the folio * to the swap device. We are basically resuming the same swap * writeback path that was intercepted with the zswap_store() * in the first place. After the folio has been decompressed into * the swap cache, the compressed version stored by zswap can be * freed. */ static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_tree *tree) { swp_entry_t swpentry = entry->swpentry; struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; /* try to allocate swap cache folio */ mpol = get_task_policy(current); folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, NO_INTERLEAVE_INDEX, &folio_was_allocated, true); if (!folio) return -ENOMEM; /* * Found an existing folio, we raced with load/swapin. We generally * writeback cold folios from zswap, and swapin means the folio just * became hot. Skip this folio and let the caller find another one. */ if (!folio_was_allocated) { folio_put(folio); return -EEXIST; } /* * folio is locked, and the swapcache is now secured against * concurrent swapping to and from the slot. Verify that the * swap entry hasn't been invalidated and recycled behind our * backs (our zswap_entry reference doesn't prevent that), to * avoid overwriting a new swap folio with old compressed data. */ spin_lock(&tree->lock); if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(folio); return -ENOMEM; } spin_unlock(&tree->lock); __zswap_load(entry, &folio->page); /* folio is up to date */ folio_mark_uptodate(folio); /* move it to the tail of the inactive list after end_writeback */ folio_set_reclaim(folio); /* start writeback */ __swap_writepage(folio, &wbc); folio_put(folio); return 0; } static int zswap_is_page_same_filled(void *ptr, unsigned long *value) { unsigned long *page; unsigned long val; unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; page = (unsigned long *)ptr; val = page[0]; if (val != page[last_pos]) return 0; for (pos = 1; pos < last_pos; pos++) { if (val != page[pos]) return 0; } *value = val; return 1; } static void zswap_fill_page(void *ptr, unsigned long value) { unsigned long *page; page = (unsigned long *)ptr; memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); } bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *dupentry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; struct zpool *zpool; unsigned int dlen = PAGE_SIZE; unsigned long handle, value; char *buf; u8 *src, *dst; gfp_t gfp; int ret; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); /* Large folios aren't supported */ if (folio_test_large(folio)) return false; if (!zswap_enabled || !tree) return false; /* * If this is a duplicate, it must be removed before attempting to store * it, otherwise, if the store fails the old page won't be removed from * the tree, and it might be written back overriding the new data. */ spin_lock(&tree->lock); dupentry = zswap_rb_search(&tree->rbroot, offset); if (dupentry) { zswap_duplicate_entry++; zswap_invalidate_entry(tree, dupentry); } spin_unlock(&tree->lock); objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { memcg = get_mem_cgroup_from_objcg(objcg); if (shrink_memcg(memcg)) { mem_cgroup_put(memcg); goto reject; } mem_cgroup_put(memcg); } /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; zswap_pool_reached_full = true; goto shrink; } if (zswap_pool_reached_full) { if (!zswap_can_accept()) goto shrink; else zswap_pool_reached_full = false; } /* allocate entry */ entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; } if (zswap_same_filled_pages_enabled) { src = kmap_local_page(page); if (zswap_is_page_same_filled(src, &value)) { kunmap_local(src); entry->swpentry = swp_entry(type, offset); entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); goto insert_entry; } kunmap_local(src); } if (!zswap_non_same_filled_pages_enabled) goto freepage; /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) goto freepage; if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { mem_cgroup_put(memcg); goto put_pool; } mem_cgroup_put(memcg); } /* compress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, &folio->page, PAGE_SIZE, 0); /* * We need PAGE_SIZE * 2 here since there maybe over-compression case, * and hardware-accelerators may won't check the dst buffer size, so * giving the dst buffer with enough length to avoid buffer overflow. */ sg_init_one(&output, dst, PAGE_SIZE * 2); acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); /* * it maybe looks a little bit silly that we send an asynchronous request, * then wait for its completion synchronously. This makes the process look * synchronous in fact. * Theoretically, acomp supports users send multiple acomp requests in one * acomp instance, then get those requests done simultaneously. but in this * case, zswap actually does store and load page by page, there is no * existing method to send the second page before the first page is done * in one thread doing zwap. * but in different threads running on different cpu, we have different * acomp instance, so multiple threads can do (de)compression in parallel. */ ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; if (ret) { zswap_reject_compress_fail++; goto put_dstmem; } /* store */ zpool = zswap_find_zpool(entry); gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; if (zpool_malloc_support_movable(zpool)) gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; ret = zpool_malloc(zpool, dlen, gfp, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; } if (ret) { zswap_reject_alloc_fail++; goto put_dstmem; } buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); memcpy(buf, dst, dlen); zpool_unmap_handle(zpool, handle); mutex_unlock(&acomp_ctx->mutex); /* populate entry */ entry->swpentry = swp_entry(type, offset); entry->handle = handle; entry->length = dlen; insert_entry: entry->objcg = objcg; if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); /* Account before objcg ref is moved to tree */ count_objcg_event(objcg, ZSWPOUT); } /* map */ spin_lock(&tree->lock); /* * A duplicate entry should have been removed at the beginning of this * function. Since the swap entry should be pinned, if a duplicate is * found again here it means that something went wrong in the swap * cache. */ while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { WARN_ON(1); zswap_duplicate_entry++; zswap_invalidate_entry(tree, dupentry); } if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&entry->pool->list_lru, entry); atomic_inc(&entry->pool->nr_stored); } spin_unlock(&tree->lock); /* update stats */ atomic_inc(&zswap_stored_pages); zswap_update_total_size(); count_vm_event(ZSWPOUT); return true; put_dstmem: mutex_unlock(&acomp_ctx->mutex); put_pool: zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); reject: if (objcg) obj_cgroup_put(objcg); return false; shrink: pool = zswap_pool_last_get(); if (pool && !queue_work(shrink_wq, &pool->shrink_work)) zswap_pool_put(pool); goto reject; } bool zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; u8 *dst; VM_WARN_ON_ONCE(!folio_test_locked(folio)); /* find */ spin_lock(&tree->lock); entry = zswap_entry_find_get(&tree->rbroot, offset); if (!entry) { spin_unlock(&tree->lock); return false; } spin_unlock(&tree->lock); if (entry->length) __zswap_load(entry, page); else { dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); kunmap_local(dst); } count_vm_event(ZSWPIN); if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); spin_lock(&tree->lock); if (zswap_exclusive_loads_enabled) { zswap_invalidate_entry(tree, entry); folio_mark_dirty(folio); } else if (entry->length) { zswap_lru_del(&entry->pool->list_lru, entry); zswap_lru_add(&entry->pool->list_lru, entry); } zswap_entry_put(tree, entry); spin_unlock(&tree->lock); return true; } void zswap_invalidate(int type, pgoff_t offset) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; /* find */ spin_lock(&tree->lock); entry = zswap_rb_search(&tree->rbroot, offset); if (!entry) { /* entry was written back */ spin_unlock(&tree->lock); return; } zswap_invalidate_entry(tree, entry); spin_unlock(&tree->lock); } void zswap_swapon(int type) { struct zswap_tree *tree; tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return; } tree->rbroot = RB_ROOT; spin_lock_init(&tree->lock); zswap_trees[type] = tree; } void zswap_swapoff(int type) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *n; if (!tree) return; /* walk the tree and free everything */ spin_lock(&tree->lock); rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) zswap_free_entry(entry); tree->rbroot = RB_ROOT; spin_unlock(&tree->lock); kfree(tree); zswap_trees[type] = NULL; } /********************************* * debugfs functions **********************************/ #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> static struct dentry *zswap_debugfs_root; static int zswap_debugfs_init(void) { if (!debugfs_initialized()) return -ENODEV; zswap_debugfs_root = debugfs_create_dir("zswap", NULL); debugfs_create_u64("pool_limit_hit", 0444, zswap_debugfs_root, &zswap_pool_limit_hit); debugfs_create_u64("reject_reclaim_fail", 0444, zswap_debugfs_root, &zswap_reject_reclaim_fail); debugfs_create_u64("reject_alloc_fail", 0444, zswap_debugfs_root, &zswap_reject_alloc_fail); debugfs_create_u64("reject_kmemcache_fail", 0444, zswap_debugfs_root, &zswap_reject_kmemcache_fail); debugfs_create_u64("reject_compress_fail", 0444, zswap_debugfs_root, &zswap_reject_compress_fail); debugfs_create_u64("reject_compress_poor", 0444, zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_u64("duplicate_entry", 0444, zswap_debugfs_root, &zswap_duplicate_entry); debugfs_create_u64("pool_total_size", 0444, zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", 0444, zswap_debugfs_root, &zswap_stored_pages); debugfs_create_atomic_t("same_filled_pages", 0444, zswap_debugfs_root, &zswap_same_filled_pages); return 0; } #else static int zswap_debugfs_init(void) { return 0; } #endif /********************************* * module init and exit **********************************/ static int zswap_setup(void) { struct zswap_pool *pool; int ret; zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); if (!zswap_entry_cache) { pr_err("entry cache creation failed\n"); goto cache_fail; } ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, "mm/zswap_pool:prepare", zswap_cpu_comp_prepare, zswap_cpu_comp_dead); if (ret) goto hp_fail; pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, zpool_get_type(pool->zpools[0])); list_add(&pool->list, &zswap_pools); zswap_has_pool = true; } else { pr_err("pool creation failed\n"); zswap_enabled = false; } shrink_wq = create_workqueue("zswap-shrink"); if (!shrink_wq) goto fallback_fail; if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); zswap_init_state = ZSWAP_INIT_SUCCEED; return 0; fallback_fail: if (pool) zswap_pool_destroy(pool); hp_fail: kmem_cache_destroy(zswap_entry_cache); cache_fail: /* if built-in, we aren't unloaded on failure; don't allow use */ zswap_init_state = ZSWAP_INIT_FAILED; zswap_enabled = false; return -ENOMEM; } static int __init zswap_init(void) { if (!zswap_enabled) return 0; return zswap_setup(); } /* must be late so crypto has time to come up */ late_initcall(zswap_init); MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); MODULE_DESCRIPTION("Compressed cache for swap pages"); |
1 106 3 106 1 105 12 105 71 72 4 2 1 3 17 8 7 7 6 5 5 6 7 16 16 16 16 2 15 15 5 15 15 15 1 15 62 18 18 1 101 101 58 101 36 19 1 15 80 99 100 100 100 22 22 19 19 7 4 4 16 38 38 37 36 17 17 17 35 26 26 25 23 16 10 6 10 2 1 1 1 3 10 2 1 1 2 8 8 7 5 4 2 20 23 10 20 18 11 11 2 11 80 100 42 13 84 84 59 1 84 59 13 59 71 83 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 | // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The options processing module for ip.c * * Authors: A.N.Kuznetsov * */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/uaccess.h> #include <asm/unaligned.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/cipso_ipv4.h> #include <net/ip_fib.h> /* * Write options to IP header, record destination address to * source route option, address of outgoing interface * (we should already know it, so that this function is allowed be * called only after routing decision) and timestamp, * if we originate this datagram. * * daddr is real destination address, next hop is recorded in IP header. * saddr is address of outgoing interface. */ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt) { unsigned char *iph = skb_network_header(skb); memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen); opt = &(IPCB(skb)->opt); if (opt->srr) memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4); if (opt->rr_needaddr) ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt); if (opt->ts_needaddr) ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt); if (opt->ts_needtime) { __be32 midtime; midtime = inet_current_timestamp(); memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4); } } /* * Provided (sopt, skb) points to received options, * build in dopt compiled option set appropriate for answering. * i.e. invert SRR option, copy anothers, * and grab room in RR/TS options. * * NOTE: dopt cannot point to skb. */ int __ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb, const struct ip_options *sopt) { unsigned char *sptr, *dptr; int soffset, doffset; int optlen; memset(dopt, 0, sizeof(struct ip_options)); if (sopt->optlen == 0) return 0; sptr = skb_network_header(skb); dptr = dopt->__data; if (sopt->rr) { optlen = sptr[sopt->rr+1]; soffset = sptr[sopt->rr+2]; dopt->rr = dopt->optlen + sizeof(struct iphdr); memcpy(dptr, sptr+sopt->rr, optlen); if (sopt->rr_needaddr && soffset <= optlen) { if (soffset + 3 > optlen) return -EINVAL; dptr[2] = soffset + 4; dopt->rr_needaddr = 1; } dptr += optlen; dopt->optlen += optlen; } if (sopt->ts) { optlen = sptr[sopt->ts+1]; soffset = sptr[sopt->ts+2]; dopt->ts = dopt->optlen + sizeof(struct iphdr); memcpy(dptr, sptr+sopt->ts, optlen); if (soffset <= optlen) { if (sopt->ts_needaddr) { if (soffset + 3 > optlen) return -EINVAL; dopt->ts_needaddr = 1; soffset += 4; } if (sopt->ts_needtime) { if (soffset + 3 > optlen) return -EINVAL; if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) { dopt->ts_needtime = 1; soffset += 4; } else { dopt->ts_needtime = 0; if (soffset + 7 <= optlen) { __be32 addr; memcpy(&addr, dptr+soffset-1, 4); if (inet_addr_type(net, addr) != RTN_UNICAST) { dopt->ts_needtime = 1; soffset += 8; } } } } dptr[2] = soffset; } dptr += optlen; dopt->optlen += optlen; } if (sopt->srr) { unsigned char *start = sptr+sopt->srr; __be32 faddr; optlen = start[1]; soffset = start[2]; doffset = 0; if (soffset > optlen) soffset = optlen + 1; soffset -= 4; if (soffset > 3) { memcpy(&faddr, &start[soffset-1], 4); for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4) memcpy(&dptr[doffset-1], &start[soffset-1], 4); /* * RFC1812 requires to fix illegal source routes. */ if (memcmp(&ip_hdr(skb)->saddr, &start[soffset + 3], 4) == 0) doffset -= 4; } if (doffset > 3) { dopt->faddr = faddr; dptr[0] = start[0]; dptr[1] = doffset+3; dptr[2] = 4; dptr += doffset+3; dopt->srr = dopt->optlen + sizeof(struct iphdr); dopt->optlen += doffset+3; dopt->is_strictroute = sopt->is_strictroute; } } if (sopt->cipso) { optlen = sptr[sopt->cipso+1]; dopt->cipso = dopt->optlen+sizeof(struct iphdr); memcpy(dptr, sptr+sopt->cipso, optlen); dptr += optlen; dopt->optlen += optlen; } while (dopt->optlen & 3) { *dptr++ = IPOPT_END; dopt->optlen++; } return 0; } /* * Options "fragmenting", just fill options not * allowed in fragments with NOOPs. * Simple and stupid 8), but the most efficient way. */ void ip_options_fragment(struct sk_buff *skb) { unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); struct ip_options *opt = &(IPCB(skb)->opt); int l = opt->optlen; int optlen; while (l > 0) { switch (*optptr) { case IPOPT_END: return; case IPOPT_NOOP: l--; optptr++; continue; } optlen = optptr[1]; if (optlen < 2 || optlen > l) return; if (!IPOPT_COPIED(*optptr)) memset(optptr, IPOPT_NOOP, optlen); l -= optlen; optptr += optlen; } opt->ts = 0; opt->rr = 0; opt->rr_needaddr = 0; opt->ts_needaddr = 0; opt->ts_needtime = 0; } /* helper used by ip_options_compile() to call fib_compute_spec_dst() * at most one time. */ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) { if (*spec_dst == htonl(INADDR_ANY)) *spec_dst = fib_compute_spec_dst(skb); } /* * Verify options and fill pointers in struct options. * Caller should clear *opt, and set opt->data. * If opt == NULL, then skb->data should point to IP header. */ int __ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb, __be32 *info) { __be32 spec_dst = htonl(INADDR_ANY); unsigned char *pp_ptr = NULL; struct rtable *rt = NULL; unsigned char *optptr; unsigned char *iph; int optlen, l; if (skb) { rt = skb_rtable(skb); optptr = (unsigned char *)&(ip_hdr(skb)[1]); } else optptr = opt->__data; iph = optptr - sizeof(struct iphdr); for (l = opt->optlen; l > 0; ) { switch (*optptr) { case IPOPT_END: for (optptr++, l--; l > 0; optptr++, l--) { if (*optptr != IPOPT_END) { *optptr = IPOPT_END; opt->is_changed = 1; } } goto eol; case IPOPT_NOOP: l--; optptr++; continue; } if (unlikely(l < 2)) { pp_ptr = optptr; goto error; } optlen = optptr[1]; if (optlen < 2 || optlen > l) { pp_ptr = optptr; goto error; } switch (*optptr) { case IPOPT_SSRR: case IPOPT_LSRR: if (optlen < 3) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 4) { pp_ptr = optptr + 2; goto error; } /* NB: cf RFC-1812 5.2.4.1 */ if (opt->srr) { pp_ptr = optptr; goto error; } if (!skb) { if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) { pp_ptr = optptr + 1; goto error; } memcpy(&opt->faddr, &optptr[3], 4); if (optlen > 7) memmove(&optptr[3], &optptr[7], optlen-7); } opt->is_strictroute = (optptr[0] == IPOPT_SSRR); opt->srr = optptr - iph; break; case IPOPT_RR: if (opt->rr) { pp_ptr = optptr; goto error; } if (optlen < 3) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 4) { pp_ptr = optptr + 2; goto error; } if (optptr[2] <= optlen) { if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); opt->is_changed = 1; } optptr[2] += 4; opt->rr_needaddr = 1; } opt->rr = optptr - iph; break; case IPOPT_TIMESTAMP: if (opt->ts) { pp_ptr = optptr; goto error; } if (optlen < 4) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 5) { pp_ptr = optptr + 2; goto error; } if (optptr[2] <= optlen) { unsigned char *timeptr = NULL; if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } switch (optptr[3]&0xF) { case IPOPT_TS_TSONLY: if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; optptr[2] += 4; break; case IPOPT_TS_TSANDADDR: if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); timeptr = &optptr[optptr[2]+3]; } opt->ts_needaddr = 1; opt->ts_needtime = 1; optptr[2] += 8; break; case IPOPT_TS_PRESPEC: if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); if (inet_addr_type(net, addr) == RTN_UNICAST) break; if (skb) timeptr = &optptr[optptr[2]+3]; } opt->ts_needtime = 1; optptr[2] += 8; break; default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr + 3; goto error; } break; } if (timeptr) { __be32 midtime; midtime = inet_current_timestamp(); memcpy(timeptr, &midtime, 4); opt->is_changed = 1; } } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { unsigned int overflow = optptr[3]>>4; if (overflow == 15) { pp_ptr = optptr + 3; goto error; } if (skb) { optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); opt->is_changed = 1; } } opt->ts = optptr - iph; break; case IPOPT_RA: if (optlen < 4) { pp_ptr = optptr + 1; goto error; } if (optptr[2] == 0 && optptr[3] == 0) opt->router_alert = optptr - iph; break; case IPOPT_CIPSO: if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { pp_ptr = optptr; goto error; } opt->cipso = optptr - iph; if (cipso_v4_validate(skb, &optptr)) { pp_ptr = optptr; goto error; } break; case IPOPT_SEC: case IPOPT_SID: default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr; goto error; } break; } l -= optlen; optptr += optlen; } eol: if (!pp_ptr) return 0; error: if (info) *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } EXPORT_SYMBOL(__ip_options_compile); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb) { int ret; __be32 info; ret = __ip_options_compile(net, opt, skb, &info); if (ret != 0 && skb) icmp_send(skb, ICMP_PARAMETERPROB, 0, info); return ret; } EXPORT_SYMBOL(ip_options_compile); /* * Undo all the changes done by ip_options_compile(). */ void ip_options_undo(struct ip_options *opt) { if (opt->srr) { unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr); memmove(optptr + 7, optptr + 3, optptr[1] - 7); memcpy(optptr + 3, &opt->faddr, 4); } if (opt->rr_needaddr) { unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr); optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); } if (opt->ts) { unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr); if (opt->ts_needtime) { optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC) optptr[2] -= 4; } if (opt->ts_needaddr) { optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); } } } int ip_options_get(struct net *net, struct ip_options_rcu **optp, sockptr_t data, int optlen) { struct ip_options_rcu *opt; opt = kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), GFP_KERNEL); if (!opt) return -ENOMEM; if (optlen && copy_from_sockptr(opt->opt.__data, data, optlen)) { kfree(opt); return -EFAULT; } while (optlen & 3) opt->opt.__data[optlen++] = IPOPT_END; opt->opt.optlen = optlen; if (optlen && ip_options_compile(net, &opt->opt, NULL)) { kfree(opt); return -EINVAL; } kfree(*optp); *optp = opt; return 0; } void ip_forward_options(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); unsigned char *optptr; struct rtable *rt = skb_rtable(skb); unsigned char *raw = skb_network_header(skb); if (opt->rr_needaddr) { optptr = (unsigned char *)raw + opt->rr; ip_rt_get_source(&optptr[optptr[2]-5], skb, rt); opt->is_changed = 1; } if (opt->srr_is_hit) { int srrptr, srrspace; optptr = raw + opt->srr; for ( srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4 ) { if (srrptr + 3 > srrspace) break; if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0) break; } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; ip_hdr(skb)->daddr = opt->nexthop; ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; } else { net_crit_ratelimited("%s(): Argh! Destination lost!\n", __func__); } if (opt->ts_needaddr) { optptr = raw + opt->ts; ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); opt->is_changed = 1; } } if (opt->is_changed) { opt->is_changed = 0; ip_send_check(ip_hdr(skb)); } } int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt = &(IPCB(skb)->opt); int srrspace, srrptr; __be32 nexthop; struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = skb_network_header(skb) + opt->srr; struct rtable *rt = skb_rtable(skb); struct rtable *rt2; unsigned long orefdst; int err; if (!rt) return 0; if (skb->pkt_type != PACKET_HOST) return -EINVAL; if (rt->rt_type == RTN_UNICAST) { if (!opt->is_strictroute) return 0; icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24)); return -EINVAL; } if (rt->rt_type != RTN_LOCAL) return -EINVAL; for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { if (srrptr + 3 > srrspace) { icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); return -EINVAL; } memcpy(&nexthop, &optptr[srrptr-1], 4); orefdst = skb->_skb_refdst; skb_dst_set(skb, NULL); err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, dev); rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); skb->_skb_refdst = orefdst; return -EINVAL; } refdst_drop(orefdst); if (rt2->rt_type != RTN_LOCAL) break; /* Superfast 8) loopback forward */ iph->daddr = nexthop; opt->is_changed = 1; } if (srrptr <= srrspace) { opt->srr_is_hit = 1; opt->nexthop = nexthop; opt->is_changed = 1; } return 0; } EXPORT_SYMBOL(ip_options_rcv_srr); |
2 2 1 1 2 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 | // SPDX-License-Identifier: GPL-2.0-or-later /* * SR-IPv6 implementation * * Author: * David Lebrun <david.lebrun@uclouvain.be> */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/slab.h> #include <linux/rhashtable.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/seg6.h> #include <net/genetlink.h> #include <linux/seg6.h> #include <linux/seg6_genl.h> #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced) { unsigned int tlv_offset; int max_last_entry; int trailing; if (srh->type != IPV6_SRCRT_TYPE_4) return false; if (((srh->hdrlen + 1) << 3) != len) return false; if (!reduced && srh->segments_left > srh->first_segment) { return false; } else { max_last_entry = (srh->hdrlen / 2) - 1; if (srh->first_segment > max_last_entry) return false; if (srh->segments_left > srh->first_segment + 1) return false; } tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4); trailing = len - tlv_offset; if (trailing < 0) return false; while (trailing) { struct sr6_tlv *tlv; unsigned int tlv_len; if (trailing < sizeof(*tlv)) return false; tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset); tlv_len = sizeof(*tlv) + tlv->len; trailing -= tlv_len; if (trailing < 0) return false; tlv_offset += tlv_len; } return true; } struct ipv6_sr_hdr *seg6_get_srh(struct sk_buff *skb, int flags) { struct ipv6_sr_hdr *srh; int len, srhoff = 0; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, &flags) < 0) return NULL; if (!pskb_may_pull(skb, srhoff + sizeof(*srh))) return NULL; srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); len = (srh->hdrlen + 1) << 3; if (!pskb_may_pull(skb, srhoff + len)) return NULL; /* note that pskb_may_pull may change pointers in header; * for this reason it is necessary to reload them when needed. */ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); if (!seg6_validate_srh(srh, len, true)) return NULL; return srh; } /* Determine if an ICMP invoking packet contains a segment routing * header. If it does, extract the offset to the true destination * address, which is in the first segment address. */ void seg6_icmp_srh(struct sk_buff *skb, struct inet6_skb_parm *opt) { __u16 network_header = skb->network_header; struct ipv6_sr_hdr *srh; /* Update network header to point to the invoking packet * inside the ICMP packet, so we can use the seg6_get_srh() * helper. */ skb_reset_network_header(skb); srh = seg6_get_srh(skb, 0); if (!srh) goto out; if (srh->type != IPV6_SRCRT_TYPE_4) goto out; opt->flags |= IP6SKB_SEG6; opt->srhoff = (unsigned char *)srh - skb->data; out: /* Restore the network header back to the ICMP packet */ skb->network_header = network_header; } static struct genl_family seg6_genl_family; static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = { [SEG6_ATTR_DST] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, [SEG6_ATTR_DSTLEN] = { .type = NLA_S32, }, [SEG6_ATTR_HMACKEYID] = { .type = NLA_U32, }, [SEG6_ATTR_SECRET] = { .type = NLA_BINARY, }, [SEG6_ATTR_SECRETLEN] = { .type = NLA_U8, }, [SEG6_ATTR_ALGID] = { .type = NLA_U8, }, [SEG6_ATTR_HMACINFO] = { .type = NLA_NESTED, }, }; #ifdef CONFIG_IPV6_SEG6_HMAC static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct seg6_pernet_data *sdata; struct seg6_hmac_info *hinfo; u32 hmackeyid; char *secret; int err = 0; u8 algid; u8 slen; sdata = seg6_pernet(net); if (!info->attrs[SEG6_ATTR_HMACKEYID] || !info->attrs[SEG6_ATTR_SECRETLEN] || !info->attrs[SEG6_ATTR_ALGID]) return -EINVAL; hmackeyid = nla_get_u32(info->attrs[SEG6_ATTR_HMACKEYID]); slen = nla_get_u8(info->attrs[SEG6_ATTR_SECRETLEN]); algid = nla_get_u8(info->attrs[SEG6_ATTR_ALGID]); if (hmackeyid == 0) return -EINVAL; if (slen > SEG6_HMAC_SECRET_LEN) return -EINVAL; mutex_lock(&sdata->lock); hinfo = seg6_hmac_info_lookup(net, hmackeyid); if (!slen) { err = seg6_hmac_info_del(net, hmackeyid); goto out_unlock; } if (!info->attrs[SEG6_ATTR_SECRET]) { err = -EINVAL; goto out_unlock; } if (slen > nla_len(info->attrs[SEG6_ATTR_SECRET])) { err = -EINVAL; goto out_unlock; } if (hinfo) { err = seg6_hmac_info_del(net, hmackeyid); if (err) goto out_unlock; } secret = (char *)nla_data(info->attrs[SEG6_ATTR_SECRET]); hinfo = kzalloc(sizeof(*hinfo), GFP_KERNEL); if (!hinfo) { err = -ENOMEM; goto out_unlock; } memcpy(hinfo->secret, secret, slen); hinfo->slen = slen; hinfo->alg_id = algid; hinfo->hmackeyid = hmackeyid; err = seg6_hmac_info_add(net, hmackeyid, hinfo); if (err) kfree(hinfo); out_unlock: mutex_unlock(&sdata->lock); return err; } #else static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) { return -ENOTSUPP; } #endif static int seg6_genl_set_tunsrc(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct in6_addr *val, *t_old, *t_new; struct seg6_pernet_data *sdata; sdata = seg6_pernet(net); if (!info->attrs[SEG6_ATTR_DST]) return -EINVAL; val = nla_data(info->attrs[SEG6_ATTR_DST]); t_new = kmemdup(val, sizeof(*val), GFP_KERNEL); if (!t_new) return -ENOMEM; mutex_lock(&sdata->lock); t_old = sdata->tun_src; rcu_assign_pointer(sdata->tun_src, t_new); mutex_unlock(&sdata->lock); synchronize_net(); kfree(t_old); return 0; } static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct in6_addr *tun_src; struct sk_buff *msg; void *hdr; msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &seg6_genl_family, 0, SEG6_CMD_GET_TUNSRC); if (!hdr) goto free_msg; rcu_read_lock(); tun_src = rcu_dereference(seg6_pernet(net)->tun_src); if (nla_put(msg, SEG6_ATTR_DST, sizeof(struct in6_addr), tun_src)) goto nla_put_failure; rcu_read_unlock(); genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); nla_put_failure: rcu_read_unlock(); free_msg: nlmsg_free(msg); return -ENOMEM; } #ifdef CONFIG_IPV6_SEG6_HMAC static int __seg6_hmac_fill_info(struct seg6_hmac_info *hinfo, struct sk_buff *msg) { if (nla_put_u32(msg, SEG6_ATTR_HMACKEYID, hinfo->hmackeyid) || nla_put_u8(msg, SEG6_ATTR_SECRETLEN, hinfo->slen) || nla_put(msg, SEG6_ATTR_SECRET, hinfo->slen, hinfo->secret) || nla_put_u8(msg, SEG6_ATTR_ALGID, hinfo->alg_id)) return -1; return 0; } static int __seg6_genl_dumphmac_element(struct seg6_hmac_info *hinfo, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &seg6_genl_family, flags, cmd); if (!hdr) return -ENOMEM; if (__seg6_hmac_fill_info(hinfo, skb) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int seg6_genl_dumphmac_start(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct seg6_pernet_data *sdata; struct rhashtable_iter *iter; sdata = seg6_pernet(net); iter = (struct rhashtable_iter *)cb->args[0]; if (!iter) { iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long)iter; } rhashtable_walk_enter(&sdata->hmac_infos, iter); return 0; } static int seg6_genl_dumphmac_done(struct netlink_callback *cb) { struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; rhashtable_walk_exit(iter); kfree(iter); return 0; } static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb) { struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; struct seg6_hmac_info *hinfo; int ret; rhashtable_walk_start(iter); for (;;) { hinfo = rhashtable_walk_next(iter); if (IS_ERR(hinfo)) { if (PTR_ERR(hinfo) == -EAGAIN) continue; ret = PTR_ERR(hinfo); goto done; } else if (!hinfo) { break; } ret = __seg6_genl_dumphmac_element(hinfo, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, SEG6_CMD_DUMPHMAC); if (ret) goto done; } ret = skb->len; done: rhashtable_walk_stop(iter); return ret; } #else static int seg6_genl_dumphmac_start(struct netlink_callback *cb) { return 0; } static int seg6_genl_dumphmac_done(struct netlink_callback *cb) { return 0; } static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb) { return -ENOTSUPP; } #endif static int __net_init seg6_net_init(struct net *net) { struct seg6_pernet_data *sdata; sdata = kzalloc(sizeof(*sdata), GFP_KERNEL); if (!sdata) return -ENOMEM; mutex_init(&sdata->lock); sdata->tun_src = kzalloc(sizeof(*sdata->tun_src), GFP_KERNEL); if (!sdata->tun_src) { kfree(sdata); return -ENOMEM; } net->ipv6.seg6_data = sdata; #ifdef CONFIG_IPV6_SEG6_HMAC if (seg6_hmac_net_init(net)) { kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); return -ENOMEM; } #endif return 0; } static void __net_exit seg6_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); #ifdef CONFIG_IPV6_SEG6_HMAC seg6_hmac_net_exit(net); #endif kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); } static struct pernet_operations ip6_segments_ops = { .init = seg6_net_init, .exit = seg6_net_exit, }; static const struct genl_ops seg6_genl_ops[] = { { .cmd = SEG6_CMD_SETHMAC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = seg6_genl_sethmac, .flags = GENL_ADMIN_PERM, }, { .cmd = SEG6_CMD_DUMPHMAC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .start = seg6_genl_dumphmac_start, .dumpit = seg6_genl_dumphmac, .done = seg6_genl_dumphmac_done, .flags = GENL_ADMIN_PERM, }, { .cmd = SEG6_CMD_SET_TUNSRC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = seg6_genl_set_tunsrc, .flags = GENL_ADMIN_PERM, }, { .cmd = SEG6_CMD_GET_TUNSRC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = seg6_genl_get_tunsrc, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family seg6_genl_family __ro_after_init = { .hdrsize = 0, .name = SEG6_GENL_NAME, .version = SEG6_GENL_VERSION, .maxattr = SEG6_ATTR_MAX, .policy = seg6_genl_policy, .netnsok = true, .parallel_ops = true, .ops = seg6_genl_ops, .n_ops = ARRAY_SIZE(seg6_genl_ops), .resv_start_op = SEG6_CMD_GET_TUNSRC + 1, .module = THIS_MODULE, }; int __init seg6_init(void) { int err; err = genl_register_family(&seg6_genl_family); if (err) goto out; err = register_pernet_subsys(&ip6_segments_ops); if (err) goto out_unregister_genl; #ifdef CONFIG_IPV6_SEG6_LWTUNNEL err = seg6_iptunnel_init(); if (err) goto out_unregister_pernet; err = seg6_local_init(); if (err) goto out_unregister_pernet; #endif #ifdef CONFIG_IPV6_SEG6_HMAC err = seg6_hmac_init(); if (err) goto out_unregister_iptun; #endif pr_info("Segment Routing with IPv6\n"); out: return err; #ifdef CONFIG_IPV6_SEG6_HMAC out_unregister_iptun: #ifdef CONFIG_IPV6_SEG6_LWTUNNEL seg6_local_exit(); seg6_iptunnel_exit(); #endif #endif #ifdef CONFIG_IPV6_SEG6_LWTUNNEL out_unregister_pernet: unregister_pernet_subsys(&ip6_segments_ops); #endif out_unregister_genl: genl_unregister_family(&seg6_genl_family); goto out; } void seg6_exit(void) { #ifdef CONFIG_IPV6_SEG6_HMAC seg6_hmac_exit(); #endif #ifdef CONFIG_IPV6_SEG6_LWTUNNEL seg6_iptunnel_exit(); #endif unregister_pernet_subsys(&ip6_segments_ops); genl_unregister_family(&seg6_genl_family); } |
48 18 48 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Lauro Ramos Venancio <lauro.venancio@openbossa.org> * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * * Vendor commands implementation based on net/wireless/nl80211.c * which is: * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <net/genetlink.h> #include <linux/nfc.h> #include <linux/slab.h> #include "nfc.h" #include "llcp.h" static const struct genl_multicast_group nfc_genl_mcgrps[] = { { .name = NFC_GENL_MCAST_EVENT_NAME, }, }; static struct genl_family nfc_genl_family; static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING, .len = NFC_DEVICE_NAME_MAXSIZE }, [NFC_ATTR_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TARGET_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_COMM_MODE] = { .type = NLA_U8 }, [NFC_ATTR_RF_MODE] = { .type = NLA_U8 }, [NFC_ATTR_DEVICE_POWERED] = { .type = NLA_U8 }, [NFC_ATTR_IM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_LLC_PARAM_LTO] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_RW] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_MIUX] = { .type = NLA_U16 }, [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED }, [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING, .len = NFC_FIRMWARE_NAME_MAXSIZE }, [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, [NFC_ATTR_VENDOR_ID] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, }; static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = { [NFC_SDP_ATTR_URI] = { .type = NLA_STRING, .len = U8_MAX - 4 }, [NFC_SDP_ATTR_SAP] = { .type = NLA_U8 }, }; static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, struct netlink_callback *cb, int flags) { void *hdr; hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &nfc_genl_family, flags, NFC_CMD_GET_TARGET); if (!hdr) return -EMSGSIZE; genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, target->supported_protocols) || nla_put_u16(msg, NFC_ATTR_TARGET_SENS_RES, target->sens_res) || nla_put_u8(msg, NFC_ATTR_TARGET_SEL_RES, target->sel_res)) goto nla_put_failure; if (target->nfcid1_len > 0 && nla_put(msg, NFC_ATTR_TARGET_NFCID1, target->nfcid1_len, target->nfcid1)) goto nla_put_failure; if (target->sensb_res_len > 0 && nla_put(msg, NFC_ATTR_TARGET_SENSB_RES, target->sensb_res_len, target->sensb_res)) goto nla_put_failure; if (target->sensf_res_len > 0 && nla_put(msg, NFC_ATTR_TARGET_SENSF_RES, target->sensf_res_len, target->sensf_res)) goto nla_put_failure; if (target->is_iso15693) { if (nla_put_u8(msg, NFC_ATTR_TARGET_ISO15693_DSFID, target->iso15693_dsfid) || nla_put(msg, NFC_ATTR_TARGET_ISO15693_UID, sizeof(target->iso15693_uid), target->iso15693_uid)) goto nla_put_failure; } genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct nfc_dev *dev; u32 idx; if (!info->info.attrs[NFC_ATTR_DEVICE_INDEX]) return ERR_PTR(-EINVAL); idx = nla_get_u32(info->info.attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return ERR_PTR(-ENODEV); return dev; } static int nfc_genl_dump_targets(struct sk_buff *skb, struct netlink_callback *cb) { int i = cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; int rc; if (!dev) { dev = __get_device_from_cb(cb); if (IS_ERR(dev)) return PTR_ERR(dev); cb->args[1] = (long) dev; } device_lock(&dev->dev); cb->seq = dev->targets_generation; while (i < dev->n_targets) { rc = nfc_genl_send_target(skb, &dev->targets[i], cb, NLM_F_MULTI); if (rc < 0) break; i++; } device_unlock(&dev->dev); cb->args[0] = i; return skb->len; } static int nfc_genl_dump_targets_done(struct netlink_callback *cb) { struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; if (dev) nfc_put_device(dev); return 0; } int nfc_genl_targets_found(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; dev->genl_data.poll_req_portid = 0; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TARGETS_FOUND); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TARGET_LOST); if (!hdr) goto free_msg; if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TM_ACTIVATED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (nla_put_u32(msg, NFC_ATTR_TM_PROTOCOLS, protocol)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_tm_deactivated(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TM_DEACTIVATED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_setup_device_added(struct nfc_dev *dev, struct sk_buff *msg) { if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols) || nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up) || nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode)) return -1; return 0; } int nfc_genl_device_added(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_DEVICE_ADDED); if (!hdr) goto free_msg; if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_device_removed(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_DEVICE_REMOVED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list) { struct sk_buff *msg; struct nlattr *sdp_attr, *uri_attr; struct nfc_llcp_sdp_tlv *sdres; struct hlist_node *n; void *hdr; int rc = -EMSGSIZE; int i; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_LLC_SDRES); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; sdp_attr = nla_nest_start_noflag(msg, NFC_ATTR_LLC_SDP); if (sdp_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; } i = 1; hlist_for_each_entry_safe(sdres, n, sdres_list, node) { pr_debug("uri: %s, sap: %d\n", sdres->uri, sdres->sap); uri_attr = nla_nest_start_noflag(msg, i++); if (uri_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; } if (nla_put_u8(msg, NFC_SDP_ATTR_SAP, sdres->sap)) goto nla_put_failure; if (nla_put_string(msg, NFC_SDP_ATTR_URI, sdres->uri)) goto nla_put_failure; nla_nest_end(msg, uri_attr); hlist_del(&sdres->node); nfc_llcp_free_sdp_tlv(sdres); } nla_nest_end(msg, sdp_attr); genlmsg_end(msg, hdr); return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: free_msg: nlmsg_free(msg); nfc_llcp_free_sdp_tlv_list(sdres_list); return rc; } int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_ADDED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, type)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_REMOVED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx, struct nfc_evt_transaction *evt_transaction) { struct nfc_se *se; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_TRANSACTION); if (!hdr) goto free_msg; se = nfc_find_se(dev, se_idx); if (!se) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type) || nla_put(msg, NFC_ATTR_SE_AID, evt_transaction->aid_len, evt_transaction->aid) || nla_put(msg, NFC_ATTR_SE_PARAMS, evt_transaction->params_len, evt_transaction->params)) goto nla_put_failure; /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) { const struct nfc_se *se; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_CONNECTIVITY); if (!hdr) goto free_msg; se = nfc_find_se(dev, se_idx); if (!se) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, NFC_CMD_GET_DEVICE); if (!hdr) return -EMSGSIZE; if (cb) genl_dump_check_consistent(cb, hdr); if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; bool first_call = false; if (!iter) { first_call = true; iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; } mutex_lock(&nfc_devlist_mutex); cb->seq = nfc_devlist_generation; if (first_call) { nfc_device_iter_init(iter); dev = nfc_device_iter_next(iter); } while (dev) { int rc; rc = nfc_genl_send_device(skb, dev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (rc < 0) break; dev = nfc_device_iter_next(iter); } mutex_unlock(&nfc_devlist_mutex); cb->args[1] = (long) dev; return skb->len; } static int nfc_genl_dump_devices_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; if (iter) { nfc_device_iter_exit(iter); kfree(iter); } return 0; } int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode) { struct sk_buff *msg; void *hdr; pr_debug("DEP link is up\n"); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_UP); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (rf_mode == NFC_RF_INITIATOR && nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx)) goto nla_put_failure; if (nla_put_u8(msg, NFC_ATTR_COMM_MODE, comm_mode) || nla_put_u8(msg, NFC_ATTR_RF_MODE, rf_mode)) goto nla_put_failure; genlmsg_end(msg, hdr); dev->dep_link_up = true; genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_dep_link_down_event(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; pr_debug("DEP link is down\n"); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_DOWN); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_get_device(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct nfc_dev *dev; u32 idx; int rc = -ENOBUFS; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; goto out_putdev; } rc = nfc_genl_send_device(msg, dev, info->snd_portid, info->snd_seq, NULL, 0); if (rc < 0) goto out_free; nfc_put_device(dev); return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); out_putdev: nfc_put_device(dev); return rc; } static int nfc_genl_dev_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dev_up(dev); nfc_put_device(dev); return rc; } static int nfc_genl_dev_down(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dev_down(dev); nfc_put_device(dev); return rc; } static int nfc_genl_start_poll(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; u32 im_protocols = 0, tm_protocols = 0; pr_debug("Poll start\n"); if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || ((!info->attrs[NFC_ATTR_IM_PROTOCOLS] && !info->attrs[NFC_ATTR_PROTOCOLS]) && !info->attrs[NFC_ATTR_TM_PROTOCOLS])) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); if (info->attrs[NFC_ATTR_TM_PROTOCOLS]) tm_protocols = nla_get_u32(info->attrs[NFC_ATTR_TM_PROTOCOLS]); if (info->attrs[NFC_ATTR_IM_PROTOCOLS]) im_protocols = nla_get_u32(info->attrs[NFC_ATTR_IM_PROTOCOLS]); else if (info->attrs[NFC_ATTR_PROTOCOLS]) im_protocols = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; mutex_lock(&dev->genl_data.genl_data_mutex); rc = nfc_start_poll(dev, im_protocols, tm_protocols); if (!rc) dev->genl_data.poll_req_portid = info->snd_portid; mutex_unlock(&dev->genl_data.genl_data_mutex); nfc_put_device(dev); return rc; } static int nfc_genl_stop_poll(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); if (!dev->polling) { device_unlock(&dev->dev); nfc_put_device(dev); return -EINVAL; } device_unlock(&dev->dev); mutex_lock(&dev->genl_data.genl_data_mutex); if (dev->genl_data.poll_req_portid != info->snd_portid) { rc = -EBUSY; goto out; } rc = nfc_stop_poll(dev); dev->genl_data.poll_req_portid = 0; out: mutex_unlock(&dev->genl_data.genl_data_mutex); nfc_put_device(dev); return rc; } static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; u32 device_idx, target_idx, protocol; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX] || !info->attrs[NFC_ATTR_PROTOCOLS]) return -EINVAL; device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(device_idx); if (!dev) return -ENODEV; target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); protocol = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); rc = nfc_activate_target(dev, target_idx, protocol); nfc_put_device(dev); return rc; } static int nfc_genl_deactivate_target(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; u32 device_idx, target_idx; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX]) return -EINVAL; device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(device_idx); if (!dev) return -ENODEV; target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); rc = nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); nfc_put_device(dev); return rc; } static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc, tgt_idx; u32 idx; u8 comm; pr_debug("DEP link up\n"); if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_COMM_MODE]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); if (!info->attrs[NFC_ATTR_TARGET_INDEX]) tgt_idx = NFC_TARGET_IDX_ANY; else tgt_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); comm = nla_get_u8(info->attrs[NFC_ATTR_COMM_MODE]); if (comm != NFC_COMM_ACTIVE && comm != NFC_COMM_PASSIVE) return -EINVAL; dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dep_link_up(dev, tgt_idx, comm); nfc_put_device(dev); return rc; } static int nfc_genl_dep_link_down(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dep_link_down(dev); nfc_put_device(dev); return rc; } static int nfc_genl_send_params(struct sk_buff *msg, struct nfc_llcp_local *local, u32 portid, u32 seq) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, 0, NFC_CMD_LLC_GET_PARAMS); if (!hdr) return -EMSGSIZE; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, local->dev->idx) || nla_put_u8(msg, NFC_ATTR_LLC_PARAM_LTO, local->lto) || nla_put_u8(msg, NFC_ATTR_LLC_PARAM_RW, local->rw) || nla_put_u16(msg, NFC_ATTR_LLC_PARAM_MIUX, be16_to_cpu(local->miux))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; int rc = 0; struct sk_buff *msg = NULL; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; goto put_local; } rc = nfc_genl_send_params(msg, local, info->snd_portid, info->snd_seq); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); if (rc < 0) { if (msg) nlmsg_free(msg); return rc; } return genlmsg_reply(msg, info); } static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; u8 rw = 0; u16 miux = 0; u32 idx; int rc = 0; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || (!info->attrs[NFC_ATTR_LLC_PARAM_LTO] && !info->attrs[NFC_ATTR_LLC_PARAM_RW] && !info->attrs[NFC_ATTR_LLC_PARAM_MIUX])) return -EINVAL; if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) { rw = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_RW]); if (rw > LLCP_MAX_RW) return -EINVAL; } if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) { miux = nla_get_u16(info->attrs[NFC_ATTR_LLC_PARAM_MIUX]); if (miux > LLCP_MAX_MIUX) return -EINVAL; } idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } if (info->attrs[NFC_ATTR_LLC_PARAM_LTO]) { if (dev->dep_link_up) { rc = -EINPROGRESS; goto put_local; } local->lto = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_LTO]); } if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) local->rw = rw; if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) local->miux = cpu_to_be16(miux); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); return rc; } static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; struct nlattr *attr, *sdp_attrs[NFC_SDP_ATTR_MAX+1]; u32 idx; u8 tid; char *uri; int rc = 0, rem; size_t uri_len, tlvs_len; struct hlist_head sdreq_list; struct nfc_llcp_sdp_tlv *sdreq; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_LLC_SDP]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); if (dev->dep_link_up == false) { rc = -ENOLINK; goto exit; } local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } INIT_HLIST_HEAD(&sdreq_list); tlvs_len = 0; nla_for_each_nested(attr, info->attrs[NFC_ATTR_LLC_SDP], rem) { rc = nla_parse_nested_deprecated(sdp_attrs, NFC_SDP_ATTR_MAX, attr, nfc_sdp_genl_policy, info->extack); if (rc != 0) { rc = -EINVAL; goto put_local; } if (!sdp_attrs[NFC_SDP_ATTR_URI]) continue; uri_len = nla_len(sdp_attrs[NFC_SDP_ATTR_URI]); if (uri_len == 0) continue; uri = nla_data(sdp_attrs[NFC_SDP_ATTR_URI]); if (uri == NULL || *uri == 0) continue; tid = local->sdreq_next_tid++; sdreq = nfc_llcp_build_sdreq_tlv(tid, uri, uri_len); if (sdreq == NULL) { rc = -ENOMEM; goto put_local; } tlvs_len += sdreq->tlv_len; hlist_add_head(&sdreq->node, &sdreq_list); } if (hlist_empty(&sdreq_list)) { rc = -EINVAL; goto put_local; } rc = nfc_llcp_send_snl_sdreq(local, &sdreq_list, tlvs_len); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); return rc; } static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1]; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; nla_strscpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], sizeof(firmware_name)); rc = nfc_fw_download(dev, firmware_name); nfc_put_device(dev); return rc; } int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, u32 result) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_FW_DOWNLOAD); if (!hdr) goto free_msg; if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) || nla_put_u32(msg, NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, result) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_enable_se(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx, se_idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_enable_se(dev, se_idx); nfc_put_device(dev); return rc; } static int nfc_genl_disable_se(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx, se_idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_disable_se(dev, se_idx); nfc_put_device(dev); return rc; } static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; struct nfc_se *se, *n; list_for_each_entry_safe(se, n, &dev->secure_elements, list) { hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, NFC_CMD_GET_SE); if (!hdr) goto nla_put_failure; if (cb) genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; genlmsg_end(msg, hdr); } return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_dump_ses(struct sk_buff *skb, struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; bool first_call = false; if (!iter) { first_call = true; iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; } mutex_lock(&nfc_devlist_mutex); cb->seq = nfc_devlist_generation; if (first_call) { nfc_device_iter_init(iter); dev = nfc_device_iter_next(iter); } while (dev) { int rc; rc = nfc_genl_send_se(skb, dev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (rc < 0) break; dev = nfc_device_iter_next(iter); } mutex_unlock(&nfc_devlist_mutex); cb->args[1] = (long) dev; return skb->len; } static int nfc_genl_dump_ses_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; if (iter) { nfc_device_iter_exit(iter); kfree(iter); } return 0; } static int nfc_se_io(struct nfc_dev *dev, u32 se_idx, u8 *apdu, size_t apdu_length, se_io_cb_t cb, void *cb_context) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); device_lock(&dev->dev); if (!device_is_registered(&dev->dev)) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (!dev->ops->se_io) { rc = -EOPNOTSUPP; goto error; } se = nfc_find_se(dev, se_idx); if (!se) { rc = -EINVAL; goto error; } if (se->state != NFC_SE_ENABLED) { rc = -ENODEV; goto error; } rc = dev->ops->se_io(dev, se_idx, apdu, apdu_length, cb, cb_context); device_unlock(&dev->dev); return rc; error: device_unlock(&dev->dev); kfree(cb_context); return rc; } struct se_io_ctx { u32 dev_idx; u32 se_idx; }; static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err) { struct se_io_ctx *ctx = context; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { kfree(ctx); return; } hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_SE_IO); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, ctx->dev_idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, ctx->se_idx) || nla_put(msg, NFC_ATTR_SE_APDU, apdu_len, apdu)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); kfree(ctx); return; nla_put_failure: free_msg: nlmsg_free(msg); kfree(ctx); return; } static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct se_io_ctx *ctx; u32 dev_idx, se_idx; u8 *apdu; size_t apdu_len; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX] || !info->attrs[NFC_ATTR_SE_APDU]) return -EINVAL; dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(dev_idx); if (!dev) return -ENODEV; if (!dev->ops || !dev->ops->se_io) { rc = -EOPNOTSUPP; goto put_dev; } apdu_len = nla_len(info->attrs[NFC_ATTR_SE_APDU]); if (apdu_len == 0) { rc = -EINVAL; goto put_dev; } apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]); if (!apdu) { rc = -EINVAL; goto put_dev; } ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL); if (!ctx) { rc = -ENOMEM; goto put_dev; } ctx->dev_idx = dev_idx; ctx->se_idx = se_idx; rc = nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); put_dev: nfc_put_device(dev); return rc; } static int nfc_genl_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; const struct nfc_vendor_cmd *cmd; u32 dev_idx, vid, subcmd; u8 *data; size_t data_len; int i, err; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_VENDOR_ID] || !info->attrs[NFC_ATTR_VENDOR_SUBCMD]) return -EINVAL; dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); vid = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_ID]); subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); dev = nfc_get_device(dev_idx); if (!dev) return -ENODEV; if (!dev->vendor_cmds || !dev->n_vendor_cmds) { err = -ENODEV; goto put_dev; } if (info->attrs[NFC_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); if (data_len == 0) { err = -EINVAL; goto put_dev; } } else { data = NULL; data_len = 0; } for (i = 0; i < dev->n_vendor_cmds; i++) { cmd = &dev->vendor_cmds[i]; if (cmd->vendor_id != vid || cmd->subcmd != subcmd) continue; dev->cur_cmd_info = info; err = cmd->doit(dev, data, data_len); dev->cur_cmd_info = NULL; goto put_dev; } err = -EOPNOTSUPP; put_dev: nfc_put_device(dev); return err; } /* message building helper */ static inline void *nfc_hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd) { /* since there is no private header just add the generic one */ return genlmsg_put(skb, portid, seq, &nfc_genl_family, flags, cmd); } static struct sk_buff * __nfc_alloc_vendor_cmd_skb(struct nfc_dev *dev, int approxlen, u32 portid, u32 seq, enum nfc_attrs attr, u32 oui, u32 subcmd, gfp_t gfp) { struct sk_buff *skb; void *hdr; skb = nlmsg_new(approxlen + 100, gfp); if (!skb) return NULL; hdr = nfc_hdr_put(skb, portid, seq, 0, NFC_CMD_VENDOR); if (!hdr) { kfree_skb(skb); return NULL; } if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (nla_put_u32(skb, NFC_ATTR_VENDOR_ID, oui)) goto nla_put_failure; if (nla_put_u32(skb, NFC_ATTR_VENDOR_SUBCMD, subcmd)) goto nla_put_failure; ((void **)skb->cb)[0] = dev; ((void **)skb->cb)[1] = hdr; return skb; nla_put_failure: kfree_skb(skb); return NULL; } struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev, enum nfc_attrs attr, u32 oui, u32 subcmd, int approxlen) { if (WARN_ON(!dev->cur_cmd_info)) return NULL; return __nfc_alloc_vendor_cmd_skb(dev, approxlen, dev->cur_cmd_info->snd_portid, dev->cur_cmd_info->snd_seq, attr, oui, subcmd, GFP_KERNEL); } EXPORT_SYMBOL(__nfc_alloc_vendor_cmd_reply_skb); int nfc_vendor_cmd_reply(struct sk_buff *skb) { struct nfc_dev *dev = ((void **)skb->cb)[0]; void *hdr = ((void **)skb->cb)[1]; /* clear CB data for netlink core to own from now on */ memset(skb->cb, 0, sizeof(skb->cb)); if (WARN_ON(!dev->cur_cmd_info)) { kfree_skb(skb); return -EINVAL; } genlmsg_end(skb, hdr); return genlmsg_reply(skb, dev->cur_cmd_info); } EXPORT_SYMBOL(nfc_vendor_cmd_reply); static const struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_get_device, .dumpit = nfc_genl_dump_devices, .done = nfc_genl_dump_devices_done, }, { .cmd = NFC_CMD_DEV_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_up, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEV_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_down, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_START_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_start_poll, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_STOP_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_stop_poll, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_up, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_down, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = nfc_genl_dump_targets, .done = nfc_genl_dump_targets_done, }, { .cmd = NFC_CMD_LLC_GET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_get_params, }, { .cmd = NFC_CMD_LLC_SET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_set_params, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_LLC_SDREQ, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_sdreq, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_FW_DOWNLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_fw_download, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ENABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_enable_se, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DISABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_disable_se, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = nfc_genl_dump_ses, .done = nfc_genl_dump_ses_done, }, { .cmd = NFC_CMD_SE_IO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_se_io, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_activate_target, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_VENDOR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_vendor_cmd, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_deactivate_target, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family nfc_genl_family __ro_after_init = { .hdrsize = 0, .name = NFC_GENL_NAME, .version = NFC_GENL_VERSION, .maxattr = NFC_ATTR_MAX, .policy = nfc_genl_policy, .module = THIS_MODULE, .ops = nfc_genl_ops, .n_ops = ARRAY_SIZE(nfc_genl_ops), .resv_start_op = NFC_CMD_DEACTIVATE_TARGET + 1, .mcgrps = nfc_genl_mcgrps, .n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps), }; struct urelease_work { struct work_struct w; u32 portid; }; static void nfc_urelease_event_work(struct work_struct *work) { struct urelease_work *w = container_of(work, struct urelease_work, w); struct class_dev_iter iter; struct nfc_dev *dev; pr_debug("portid %d\n", w->portid); mutex_lock(&nfc_devlist_mutex); nfc_device_iter_init(&iter); dev = nfc_device_iter_next(&iter); while (dev) { mutex_lock(&dev->genl_data.genl_data_mutex); if (dev->genl_data.poll_req_portid == w->portid) { nfc_stop_poll(dev); dev->genl_data.poll_req_portid = 0; } mutex_unlock(&dev->genl_data.genl_data_mutex); dev = nfc_device_iter_next(&iter); } nfc_device_iter_exit(&iter); mutex_unlock(&nfc_devlist_mutex); kfree(w); } static int nfc_genl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct urelease_work *w; if (event != NETLINK_URELEASE || n->protocol != NETLINK_GENERIC) goto out; pr_debug("NETLINK_URELEASE event from id %d\n", n->portid); w = kmalloc(sizeof(*w), GFP_ATOMIC); if (w) { INIT_WORK(&w->w, nfc_urelease_event_work); w->portid = n->portid; schedule_work(&w->w); } out: return NOTIFY_DONE; } void nfc_genl_data_init(struct nfc_genl_data *genl_data) { genl_data->poll_req_portid = 0; mutex_init(&genl_data->genl_data_mutex); } void nfc_genl_data_exit(struct nfc_genl_data *genl_data) { mutex_destroy(&genl_data->genl_data_mutex); } static struct notifier_block nl_notifier = { .notifier_call = nfc_genl_rcv_nl_event, }; /** * nfc_genl_init() - Initialize netlink interface * * This initialization function registers the nfc netlink family. */ int __init nfc_genl_init(void) { int rc; rc = genl_register_family(&nfc_genl_family); if (rc) return rc; netlink_register_notifier(&nl_notifier); return 0; } /** * nfc_genl_exit() - Deinitialize netlink interface * * This exit function unregisters the nfc netlink family. */ void nfc_genl_exit(void) { netlink_unregister_notifier(&nl_notifier); genl_unregister_family(&nfc_genl_family); } |
5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Stream Parser * * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> */ #ifndef __NET_STRPARSER_H_ #define __NET_STRPARSER_H_ #include <linux/skbuff.h> #include <net/sock.h> #define STRP_STATS_ADD(stat, count) ((stat) += (count)) #define STRP_STATS_INCR(stat) ((stat)++) struct strp_stats { unsigned long long msgs; unsigned long long bytes; unsigned int mem_fail; unsigned int need_more_hdr; unsigned int msg_too_big; unsigned int msg_timeouts; unsigned int bad_hdr_len; }; struct strp_aggr_stats { unsigned long long msgs; unsigned long long bytes; unsigned int mem_fail; unsigned int need_more_hdr; unsigned int msg_too_big; unsigned int msg_timeouts; unsigned int bad_hdr_len; unsigned int aborts; unsigned int interrupted; unsigned int unrecov_intr; }; struct strparser; /* Callbacks are called with lock held for the attached socket */ struct strp_callbacks { int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); int (*read_sock_done)(struct strparser *strp, int err); void (*abort_parser)(struct strparser *strp, int err); void (*lock)(struct strparser *strp); void (*unlock)(struct strparser *strp); }; struct strp_msg { int full_len; int offset; }; struct _strp_msg { /* Internal cb structure. struct strp_msg must be first for passing * to upper layer. */ struct strp_msg strp; int accum_len; }; struct sk_skb_cb { #define SK_SKB_CB_PRIV_LEN 20 unsigned char data[SK_SKB_CB_PRIV_LEN]; /* align strp on cache line boundary within skb->cb[] */ unsigned char pad[4]; struct _strp_msg strp; /* strp users' data follows */ struct tls_msg { u8 control; } tls; /* temp_reg is a temporary register used for bpf_convert_data_end_access * when dst_reg == src_reg. */ u64 temp_reg; }; static inline struct strp_msg *strp_msg(struct sk_buff *skb) { return (struct strp_msg *)((void *)skb->cb + offsetof(struct sk_skb_cb, strp)); } /* Structure for an attached lower socket */ struct strparser { struct sock *sk; u32 stopped : 1; u32 paused : 1; u32 aborted : 1; u32 interrupted : 1; u32 unrecov_intr : 1; struct sk_buff **skb_nextp; struct sk_buff *skb_head; unsigned int need_bytes; struct delayed_work msg_timer_work; struct work_struct work; struct strp_stats stats; struct strp_callbacks cb; }; /* Must be called with lock held for attached socket */ static inline void strp_pause(struct strparser *strp) { strp->paused = 1; } /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); /* Must be called with process lock held (lock_sock) */ void __strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) { /* Save psock statistics in the mux when psock is being unattached. */ #define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += \ strp->stats._stat) SAVE_PSOCK_STATS(msgs); SAVE_PSOCK_STATS(bytes); SAVE_PSOCK_STATS(mem_fail); SAVE_PSOCK_STATS(need_more_hdr); SAVE_PSOCK_STATS(msg_too_big); SAVE_PSOCK_STATS(msg_timeouts); SAVE_PSOCK_STATS(bad_hdr_len); #undef SAVE_PSOCK_STATS if (strp->aborted) agg_stats->aborts++; if (strp->interrupted) agg_stats->interrupted++; if (strp->unrecov_intr) agg_stats->unrecov_intr++; } static inline void aggregate_strp_stats(struct strp_aggr_stats *stats, struct strp_aggr_stats *agg_stats) { #define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += stats->_stat) SAVE_PSOCK_STATS(msgs); SAVE_PSOCK_STATS(bytes); SAVE_PSOCK_STATS(mem_fail); SAVE_PSOCK_STATS(need_more_hdr); SAVE_PSOCK_STATS(msg_too_big); SAVE_PSOCK_STATS(msg_timeouts); SAVE_PSOCK_STATS(bad_hdr_len); SAVE_PSOCK_STATS(aborts); SAVE_PSOCK_STATS(interrupted); SAVE_PSOCK_STATS(unrecov_intr); #undef SAVE_PSOCK_STATS } void strp_done(struct strparser *strp); void strp_stop(struct strparser *strp); void strp_check_rcv(struct strparser *strp); int strp_init(struct strparser *strp, struct sock *sk, const struct strp_callbacks *cb); void strp_data_ready(struct strparser *strp); int strp_process(struct strparser *strp, struct sk_buff *orig_skb, unsigned int orig_offset, size_t orig_len, size_t max_msg_size, long timeo); #endif /* __NET_STRPARSER_H_ */ |
3587 3592 3588 3284 3284 3590 3592 632 632 632 632 631 632 3581 3584 2548 2134 504 3593 3594 3589 2557 3589 3593 9 3584 3283 3593 3286 317 2150 2739 3617 2566 2565 2566 2567 2566 2566 2567 2574 2566 2566 2567 2573 2566 2447 4 4 3 4 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/read_write.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .mmap = generic_file_readonly_mmap, .splice_read = filemap_splice_read, }; EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { return file->f_mode & FMODE_UNSIGNED_OFFSET; } /** * vfs_setpos - update the file offset for lseek * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * * This is a low-level filesystem helper for updating the file offset to * the value specified by @offset if the given offset is valid and it is * not equal to the current file offset. * * Return the specified offset on success and -EINVAL on invalid offset. */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } return offset; } EXPORT_SYMBOL(vfs_setpos); /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom * maximum file size and a custom EOF position, for e.g. hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { switch (whence) { case SEEK_END: offset += eof; break; case SEEK_CUR: /* * Here we special-case the lseek(fd, 0, SEEK_CUR) * position-querying operation. Avoid rewriting the "same" * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ if (offset == 0) return file->f_pos; /* * f_lock protects against read/modify/write race with other * SEEK_CURs. Note that parallel writes and reads behave * like SEEK_SET. */ spin_lock(&file->f_lock); offset = vfs_setpos(file, file->f_pos + offset, maxsize); spin_unlock(&file->f_lock); return offset; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ if ((unsigned long long)offset >= eof) return -ENXIO; break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ if ((unsigned long long)offset >= eof) return -ENXIO; offset = eof; break; } return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } EXPORT_SYMBOL(generic_file_llseek); /** * fixed_size_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: size of the file * */ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, size, size); default: return -EINVAL; } } EXPORT_SYMBOL(fixed_size_llseek); /** * no_seek_end_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * */ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek); /** * no_seek_end_llseek_size - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: maximal offset allowed * */ loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, size, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek_size); /** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { if (!(file->f_mode & FMODE_LSEEK)) return -ESPIPE; return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; struct fd f = fdget_pos(fd); if (!f.file) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(f.file, offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } fdput_pos(f); return retval; } SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) { int retval; struct fd f = fdget_pos(fd); loff_t offset; if (!f.file) return -EBADF; retval = -EINVAL; if (whence > SEEK_MAX) goto out_putf; offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, whence); retval = (int)offset; if (offset >= 0) { retval = -EFAULT; if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } out_putf: fdput_pos(f); return retval; } #endif int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { int mask = read_write == READ ? MAY_READ : MAY_WRITE; int ret; if (unlikely((ssize_t) count < 0)) return -EINVAL; if (ppos) { loff_t pos = *ppos; if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) return -EINVAL; } } ret = security_file_permission(file, mask); if (ret) return ret; return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = call_read_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } static int warn_unsupported(struct file *file, const char *op) { pr_warn_ratelimited( "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; /* * Also fail if ->read_iter and ->read are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->read_iter || file->f_op->read)) return warn_unsupported(file, "read"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) ret = new_sync_read(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = call_write_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { struct kiocb kiocb; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; /* * Also fail if ->write_iter and ->write are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->write_iter || file->f_op->write)) return warn_unsupported(file, "write"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = (void *)buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually * wants this _and_ can be built as a module. So we need to export * this symbol for autofs, even though it really isn't appropriate * for any other kernel modules. */ EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; file_start_write(file); ret = __kernel_write(file, buf, count, pos); file_end_write(file); return ret; } EXPORT_SYMBOL(kernel_write); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } /* file_ppos returns &file->f_pos or NULL if file is stream */ static inline loff_t *file_ppos(struct file *file) { return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_read(f.file, buf, count, ppos); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { return ksys_read(fd, buf, count); } ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(f.file, buf, count, ppos); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { return ksys_write(fd, buf, count); } ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) ret = vfs_read(f.file, buf, count, &pos); fdput(f); } return ret; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) ret = vfs_write(f.file, buf, count, &pos); fdput(f); } return ret; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, loff_t, pos) { return ksys_pwrite64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); ret = kiocb_set_rw_flags(&kiocb, flags); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) ret = call_read_iter(filp, &kiocb, iter); else ret = call_write_iter(filp, &kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } else { nr = filp->f_op->write(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } return ret; } ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = call_read_iter(file, iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iter_read); /* * Caller is responsible for calling kiocb_end_write() on completion * if async iocb was queued. */ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->write_iter) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; kiocb_start_write(iocb); ret = call_write_iter(file, iocb, iter); if (ret != -EIOCBQUEUED) kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!file->f_op->write_iter) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; file_start_write(file); ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) goto out; if (file->f_op->read_iter) ret = do_iter_readv_writev(file, &iter, pos, READ, flags); else ret = do_loop_readv_writev(file, &iter, pos, READ, flags); out: if (ret >= 0) fsnotify_access(file); kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) goto out; file_start_write(file); if (file->f_op->write_iter) ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); else ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); out: kfree(iov); return ret; } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_readv(f.file, vec, vlen, ppos, flags); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_writev(f.file, vec, vlen, ppos, flags); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) { #define HALF_LONG_BITS (BITS_PER_LONG / 2) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) ret = vfs_readv(f.file, vec, vlen, &pos, flags); fdput(f); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) ret = vfs_writev(f.file, vec, vlen, &pos, flags); fdput(f); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_readv(fd, vec, vlen, 0); } SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_writev(fd, vec, vlen, 0); } SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_preadv(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_pwritev(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on * in_compat_syscall(). */ #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; int fl; /* * Get input file, and verify that it is ok.. */ retval = -EBADF; in = fdget(in_fd); if (!in.file) goto out; if (!(in.file->f_mode & FMODE_READ)) goto fput_in; retval = -ESPIPE; if (!ppos) { pos = in.file->f_pos; } else { pos = *ppos; if (!(in.file->f_mode & FMODE_PREAD)) goto fput_in; } retval = rw_verify_area(READ, in.file, &pos, count); if (retval < 0) goto fput_in; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ retval = -EBADF; out = fdget(out_fd); if (!out.file) goto fput_in; if (!(out.file->f_mode & FMODE_WRITE)) goto fput_out; in_inode = file_inode(in.file); out_inode = file_inode(out.file); out_pos = out.file->f_pos; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) goto fput_out; count = max - pos; } fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. */ if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif opipe = get_pipe_info(out.file, true); if (!opipe) { retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); } else { if (out.file->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl); } if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(in.file); fsnotify_modify(out.file); out.file->f_pos = out_pos; if (ppos) *ppos = pos; else in.file->f_pos = pos; } inc_syscr(current); inc_syscw(current); if (pos > max) retval = -EOVERFLOW; fput_out: fdput(out); fput_in: fdput(in); out: return retval; } SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, compat_off_t __user *, offset, compat_size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, compat_loff_t __user *, offset, compat_size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif /* * Performs necessary checks before doing a file copy * * Can adjust amount of bytes to copy via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the copy should be allowed. */ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); uint64_t count = *req_count; loff_t size_in; int ret; ret = generic_file_rw_checks(file_in, file_out); if (ret) return ret; /* * We allow some filesystems to handle cross sb copy, but passing * a file of the wrong filesystem type to filesystem driver can result * in an attempt to dereference the wrong type of ->private_data, so * avoid doing that until we really have a good reason. * * nfs and cifs define several different file_system_type structures * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. */ if (flags & COPY_FILE_SPLICE) { /* cross sb splice is allowed */ } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { return -EXDEV; } /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EOVERFLOW; /* Shorten the copy to EOF */ size_in = i_size_read(inode_in); if (pos_in >= size_in) count = 0; else count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* Don't allow overlapped copying within the same file. */ if (inode_in == inode_out && pos_out + count > pos_in && pos_out < pos_in + count) return -EINVAL; *req_count = count; return 0; } /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. */ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, flags); if (unlikely(ret)) return ret; ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; ret = rw_verify_area(WRITE, file_out, &pos_out, len); if (unlikely(ret)) return ret; if (len == 0) return 0; file_start_write(file_out); /* * Cloning is supported by more file systems, so we implement copy on * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. */ if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; } else if (samesb) { /* Fallback to splice for same sb copy for backward compat */ splice = true; } file_end_write(file_out); if (!splice) goto done; /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in * case filesystem supports clone but rejected the clone request (e.g. * because it was not block aligned). * * In both cases, fall back to kernel copy so we are able to maintain a * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE * for server-side-copy between any two sb. * * In any case, we call do_splice_direct() and not splice_file_range(), * without file_start_write() held, to avoid possible deadlocks related * to splicing from input file, while file_start_write() is held on * the output file on a different sb. */ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, min_t(size_t, len, MAX_RW_COUNT), 0); done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } inc_syscr(current); inc_syscw(current); return ret; } EXPORT_SYMBOL(vfs_copy_file_range); SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { loff_t pos_in; loff_t pos_out; struct fd f_in; struct fd f_out; ssize_t ret = -EBADF; f_in = fdget(fd_in); if (!f_in.file) goto out2; f_out = fdget(fd_out); if (!f_out.file) goto out1; ret = -EFAULT; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) goto out; } else { pos_in = f_in.file->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) goto out; } else { pos_out = f_out.file->f_pos; } ret = -EINVAL; if (flags != 0) goto out; ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len, flags); if (ret > 0) { pos_in += ret; pos_out += ret; if (off_in) { if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) ret = -EFAULT; } else { f_in.file->f_pos = pos_in; } if (off_out) { if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) ret = -EFAULT; } else { f_out.file->f_pos = pos_out; } } out: fdput(f_out); out1: fdput(f_in); out2: return ret; } /* * Don't operate on ranges the page cache doesn't support, and don't exceed the * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; loff_t limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } *count = min(*count, limit - pos); } if (!(file->f_flags & O_LARGEFILE)) max_size = MAX_NON_LFS; if (unlikely(pos >= max_size)) return -EFBIG; *count = min(*count, max_size - pos); return 0; } /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; if (IS_SWAPFILE(inode)) return -ETXTBSY; if (!*count) return 0; if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !((iocb->ki_flags & IOCB_DIRECT) || (file->f_mode & FMODE_BUF_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); } EXPORT_SYMBOL(generic_write_checks_count); /* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { loff_t count = iov_iter_count(from); int ret; ret = generic_write_checks_count(iocb, &count); if (ret) return ret; iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. */ int generic_file_rw_checks(struct file *file_in, struct file *file_out) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); /* Don't copy dirs, pipes, sockets... */ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) return -EBADF; return 0; } |
2593 1938 1727 3525 3525 1319 1299 1299 1299 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 | /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/backing-dev.h * * low-level device information and state which is propagated up through * to high-level code. */ #ifndef _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H #include <linux/kernel.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/device.h> #include <linux/writeback.h> #include <linux/backing-dev-defs.h> #include <linux/slab.h> static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) { kref_get(&bdi->refcnt); return bdi; } struct backing_dev_info *bdi_get_by_id(u64 id); void bdi_put(struct backing_dev_info *bdi); __printf(2, 3) int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); __printf(2, 0) int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); struct backing_dev_info *bdi_alloc(int node_id); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); void wb_wait_for_completion(struct wb_completion *done); extern spinlock_t bdi_lock; extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { return test_bit(WB_has_dirty_io, &wb->state); } static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) { /* * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are * any dirty wbs. See wb_update_write_bandwidth(). */ return atomic_long_read(&bdi->tot_write_bandwidth); } static inline void wb_stat_mod(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { wb_stat_mod(wb, item, 1); } static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { wb_stat_mod(wb, item, -1); } static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_read_positive(&wb->stat[item]); } static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_sum_positive(&wb->stat[item]); } extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ static inline unsigned long wb_stat_error(void) { #ifdef CONFIG_SMP return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif } /* BDI ratio is expressed as part per 1000000 for finer granularity. */ #define BDI_RATIO_SCALE 10000 u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability * * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages * should contribute to accounting * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold */ #define BDI_CAP_WRITEBACK (1 << 0) #define BDI_CAP_WRITEBACK_ACCT (1 << 1) #define BDI_CAP_STRICTLIMIT (1 << 2) extern struct backing_dev_info noop_backing_dev_info; int bdi_init(struct backing_dev_info *bdi); /** * writeback_in_progress - determine whether there is writeback in progress * @wb: bdi_writeback of interest * * Determine whether there is writeback waiting to be handled against a * bdi_writeback. */ static inline bool writeback_in_progress(struct bdi_writeback *wb) { return test_bit(WB_writeback_running, &wb->state); } struct backing_dev_info *inode_to_bdi(struct inode *inode); static inline bool mapping_can_writeback(struct address_space *mapping) { return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct cgroup_subsys_state *css); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest * * Cgroup writeback requires support from the filesystem. Also, both memcg and * iocg have to be on the default hierarchy. Test whether all conditions are * met. * * Note that the test result may change dynamically on the same inode * depending on how memcg and iocg are configured. */ static inline bool inode_cgwb_enabled(struct inode *inode) { struct backing_dev_info *bdi = inode_to_bdi(inode); return cgroup_subsys_on_dfl(memory_cgrp_subsys) && cgroup_subsys_on_dfl(io_cgrp_subsys) && (bdi->capabilities & BDI_CAP_WRITEBACK) && (inode->i_sb->s_iflags & SB_I_CGROUPWB); } /** * wb_find_current - find wb for %current on a bdi * @bdi: bdi of interest * * Find the wb of @bdi which matches both the memcg and blkcg of %current. * Must be called under rcu_read_lock() which protects the returend wb. * NULL if not found. */ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; memcg_css = task_css(current, memory_cgrp_id); if (!memcg_css->parent) return &bdi->wb; wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); /* * %current's blkcg equals the effective blkcg of its memcg. No * need to use the relatively expensive cgroup_get_e_css(). */ if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) return wb; return NULL; } /** * wb_get_create_current - get or create wb for %current on a bdi * @bdi: bdi of interest * @gfp: allocation mask * * Equivalent to wb_get_create() on %current's memcg. This function is * called from a relatively hot path and optimizes the common cases using * wb_find_current(). */ static inline struct bdi_writeback * wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) { struct bdi_writeback *wb; rcu_read_lock(); wb = wb_find_current(bdi); if (wb && unlikely(!wb_tryget(wb))) wb = NULL; rcu_read_unlock(); if (unlikely(!wb)) { struct cgroup_subsys_state *memcg_css; memcg_css = task_get_css(current, memory_cgrp_id); wb = wb_get_create(bdi, memcg_css, gfp); css_put(memcg_css); } return wb; } /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest * * Returns the wb @inode is currently associated with. The caller must be * holding either @inode->i_lock, the i_pages lock, or the * associated wb's list_lock. */ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&inode->i_lock) && !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) && !lockdep_is_held(&inode->i_wb->list_lock))); #endif return inode->i_wb; } static inline struct bdi_writeback *inode_to_wb_wbc( struct inode *inode, struct writeback_control *wbc) { /* * If wbc does not have inode attached, it means cgroup writeback was * disabled when wbc started. Just use the default wb in that case. */ return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb; } /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode * @cookie: output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't * holding inode->i_lock, the i_pages lock or wb->list_lock. This * function determines the wb associated with @inode and ensures that the * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). * * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and * can't sleep during the transaction. IRQs may or may not be disabled on * return. */ static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { rcu_read_lock(); /* * Paired with store_release in inode_switch_wbs_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; if (unlikely(cookie->locked)) xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); /* * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages * lock. inode_to_wb() will bark. Deref directly. */ return inode->i_wb; } /** * unlocked_inode_to_wb_end - end inode wb access transaction * @inode: target inode * @cookie: @cookie from unlocked_inode_to_wb_begin() */ static inline void unlocked_inode_to_wb_end(struct inode *inode, struct wb_lock_cookie *cookie) { if (unlikely(cookie->locked)) xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags); rcu_read_unlock(); } #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) { return false; } static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { return &bdi->wb; } static inline struct bdi_writeback * wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) { return &bdi->wb; } static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; } static inline struct bdi_writeback *inode_to_wb_wbc( struct inode *inode, struct writeback_control *wbc) { return inode_to_wb(inode); } static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { return inode_to_wb(inode); } static inline void unlocked_inode_to_wb_end(struct inode *inode, struct wb_lock_cookie *cookie) { } static inline void wb_memcg_offline(struct mem_cgroup *memcg) { } static inline void wb_blkcg_offline(struct cgroup_subsys_state *css) { } #endif /* CONFIG_CGROUP_WRITEBACK */ const char *bdi_dev_name(struct backing_dev_info *bdi); #endif /* _LINUX_BACKING_DEV_H */ |
4 4 4 4 4 4 4 4 4 4 4 4 6 7 5 7 7 15 15 1 15 6 3 2 6 6 6 1 1 3 3 2 1 1 3 2 2 19 18 17 14 13 16 1 1 19 19 19 6 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 | /* * Copyright (C) 2017 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/mutex.h> #include <linux/rtnetlink.h> #include <net/pkt_cls.h> #include "netdevsim.h" #define pr_vlog(env, fmt, ...) \ bpf_verifier_log_write(env, "[netdevsim] " fmt, ##__VA_ARGS__) struct nsim_bpf_bound_prog { struct nsim_dev *nsim_dev; struct bpf_prog *prog; struct dentry *ddir; const char *state; bool is_loaded; struct list_head l; }; #define NSIM_BPF_MAX_KEYS 2 struct nsim_bpf_bound_map { struct netdevsim *ns; struct bpf_offloaded_map *map; struct mutex mutex; struct nsim_map_entry { void *key; void *value; } entry[NSIM_BPF_MAX_KEYS]; struct list_head l; }; static int nsim_bpf_string_show(struct seq_file *file, void *data) { const char **str = file->private; if (*str) seq_printf(file, "%s\n", *str); return 0; } DEFINE_SHOW_ATTRIBUTE(nsim_bpf_string); static int nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn) { struct nsim_bpf_bound_prog *state; int ret = 0; state = env->prog->aux->offload->dev_priv; if (state->nsim_dev->bpf_bind_verifier_delay && !insn_idx) msleep(state->nsim_dev->bpf_bind_verifier_delay); if (insn_idx == env->prog->len - 1) { pr_vlog(env, "Hello from netdevsim!\n"); if (!state->nsim_dev->bpf_bind_verifier_accept) ret = -EOPNOTSUPP; } return ret; } static int nsim_bpf_finalize(struct bpf_verifier_env *env) { return 0; } static bool nsim_xdp_offload_active(struct netdevsim *ns) { return ns->xdp_hw.prog; } static void nsim_prog_set_loaded(struct bpf_prog *prog, bool loaded) { struct nsim_bpf_bound_prog *state; if (!prog || !bpf_prog_is_offloaded(prog->aux)) return; state = prog->aux->offload->dev_priv; state->is_loaded = loaded; } static int nsim_bpf_offload(struct netdevsim *ns, struct bpf_prog *prog, bool oldprog) { nsim_prog_set_loaded(ns->bpf_offloaded, false); WARN(!!ns->bpf_offloaded != oldprog, "bad offload state, expected offload %sto be active", oldprog ? "" : "not "); ns->bpf_offloaded = prog; ns->bpf_offloaded_id = prog ? prog->aux->id : 0; nsim_prog_set_loaded(prog, true); return 0; } int nsim_bpf_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) { struct tc_cls_bpf_offload *cls_bpf = type_data; struct bpf_prog *prog = cls_bpf->prog; struct netdevsim *ns = cb_priv; struct bpf_prog *oldprog; if (type != TC_SETUP_CLSBPF) { NSIM_EA(cls_bpf->common.extack, "only offload of BPF classifiers supported"); return -EOPNOTSUPP; } if (!tc_cls_can_offload_and_chain0(ns->netdev, &cls_bpf->common)) return -EOPNOTSUPP; if (cls_bpf->common.protocol != htons(ETH_P_ALL)) { NSIM_EA(cls_bpf->common.extack, "only ETH_P_ALL supported as filter protocol"); return -EOPNOTSUPP; } if (!ns->bpf_tc_accept) { NSIM_EA(cls_bpf->common.extack, "netdevsim configured to reject BPF TC offload"); return -EOPNOTSUPP; } /* Note: progs without skip_sw will probably not be dev bound */ if (prog && !prog->aux->offload && !ns->bpf_tc_non_bound_accept) { NSIM_EA(cls_bpf->common.extack, "netdevsim configured to reject unbound programs"); return -EOPNOTSUPP; } if (cls_bpf->command != TC_CLSBPF_OFFLOAD) return -EOPNOTSUPP; oldprog = cls_bpf->oldprog; /* Don't remove if oldprog doesn't match driver's state */ if (ns->bpf_offloaded != oldprog) { oldprog = NULL; if (!cls_bpf->prog) return 0; if (ns->bpf_offloaded) { NSIM_EA(cls_bpf->common.extack, "driver and netdev offload states mismatch"); return -EBUSY; } } return nsim_bpf_offload(ns, cls_bpf->prog, oldprog); } int nsim_bpf_disable_tc(struct netdevsim *ns) { if (ns->bpf_offloaded && !nsim_xdp_offload_active(ns)) return -EBUSY; return 0; } static int nsim_xdp_offload_prog(struct netdevsim *ns, struct netdev_bpf *bpf) { if (!nsim_xdp_offload_active(ns) && !bpf->prog) return 0; if (!nsim_xdp_offload_active(ns) && bpf->prog && ns->bpf_offloaded) { NSIM_EA(bpf->extack, "TC program is already loaded"); return -EBUSY; } return nsim_bpf_offload(ns, bpf->prog, nsim_xdp_offload_active(ns)); } static int nsim_xdp_set_prog(struct netdevsim *ns, struct netdev_bpf *bpf, struct xdp_attachment_info *xdp) { int err; if (bpf->command == XDP_SETUP_PROG && !ns->bpf_xdpdrv_accept) { NSIM_EA(bpf->extack, "driver XDP disabled in DebugFS"); return -EOPNOTSUPP; } if (bpf->command == XDP_SETUP_PROG_HW && !ns->bpf_xdpoffload_accept) { NSIM_EA(bpf->extack, "XDP offload disabled in DebugFS"); return -EOPNOTSUPP; } if (bpf->command == XDP_SETUP_PROG_HW) { err = nsim_xdp_offload_prog(ns, bpf); if (err) return err; } xdp_attachment_setup(xdp, bpf); return 0; } static int nsim_bpf_create_prog(struct nsim_dev *nsim_dev, struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state; char name[16]; int ret; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return -ENOMEM; state->nsim_dev = nsim_dev; state->prog = prog; state->state = "verify"; /* Program id is not populated yet when we create the state. */ sprintf(name, "%u", nsim_dev->prog_id_gen++); state->ddir = debugfs_create_dir(name, nsim_dev->ddir_bpf_bound_progs); if (IS_ERR(state->ddir)) { ret = PTR_ERR(state->ddir); kfree(state); return ret; } debugfs_create_u32("id", 0400, state->ddir, &prog->aux->id); debugfs_create_file("state", 0400, state->ddir, &state->state, &nsim_bpf_string_fops); debugfs_create_bool("loaded", 0400, state->ddir, &state->is_loaded); list_add_tail(&state->l, &nsim_dev->bpf_bound_progs); prog->aux->offload->dev_priv = state; return 0; } static int nsim_bpf_verifier_prep(struct bpf_prog *prog) { struct nsim_dev *nsim_dev = bpf_offload_dev_priv(prog->aux->offload->offdev); if (!nsim_dev->bpf_bind_accept) return -EOPNOTSUPP; return nsim_bpf_create_prog(nsim_dev, prog); } static int nsim_bpf_translate(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state = prog->aux->offload->dev_priv; state->state = "xlated"; return 0; } static void nsim_bpf_destroy_prog(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state; state = prog->aux->offload->dev_priv; WARN(state->is_loaded, "offload state destroyed while program still bound"); debugfs_remove_recursive(state->ddir); list_del(&state->l); kfree(state); } static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = { .insn_hook = nsim_bpf_verify_insn, .finalize = nsim_bpf_finalize, .prepare = nsim_bpf_verifier_prep, .translate = nsim_bpf_translate, .destroy = nsim_bpf_destroy_prog, }; static int nsim_setup_prog_checks(struct netdevsim *ns, struct netdev_bpf *bpf) { if (bpf->prog && bpf->prog->aux->offload) { NSIM_EA(bpf->extack, "attempt to load offloaded prog to drv"); return -EINVAL; } if (ns->netdev->mtu > NSIM_XDP_MAX_MTU) { NSIM_EA(bpf->extack, "MTU too large w/ XDP enabled"); return -EINVAL; } return 0; } static int nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf) { struct nsim_bpf_bound_prog *state; if (!bpf->prog) return 0; if (!bpf_prog_is_offloaded(bpf->prog->aux)) { NSIM_EA(bpf->extack, "xdpoffload of non-bound program"); return -EINVAL; } state = bpf->prog->aux->offload->dev_priv; if (WARN_ON(strcmp(state->state, "xlated"))) { NSIM_EA(bpf->extack, "offloading program in bad state"); return -EINVAL; } return 0; } static bool nsim_map_key_match(struct bpf_map *map, struct nsim_map_entry *e, void *key) { return e->key && !memcmp(key, e->key, map->key_size); } static int nsim_map_key_find(struct bpf_offloaded_map *offmap, void *key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; unsigned int i; for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) if (nsim_map_key_match(&offmap->map, &nmap->entry[i], key)) return i; return -ENOENT; } static int nsim_map_alloc_elem(struct bpf_offloaded_map *offmap, unsigned int idx) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; nmap->entry[idx].key = kmalloc(offmap->map.key_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!nmap->entry[idx].key) return -ENOMEM; nmap->entry[idx].value = kmalloc(offmap->map.value_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!nmap->entry[idx].value) { kfree(nmap->entry[idx].key); nmap->entry[idx].key = NULL; return -ENOMEM; } return 0; } static int nsim_map_get_next_key(struct bpf_offloaded_map *offmap, void *key, void *next_key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx = -ENOENT; mutex_lock(&nmap->mutex); if (key) idx = nsim_map_key_find(offmap, key); if (idx == -ENOENT) idx = 0; else idx++; for (; idx < ARRAY_SIZE(nmap->entry); idx++) { if (nmap->entry[idx].key) { memcpy(next_key, nmap->entry[idx].key, offmap->map.key_size); break; } } mutex_unlock(&nmap->mutex); if (idx == ARRAY_SIZE(nmap->entry)) return -ENOENT; return 0; } static int nsim_map_lookup_elem(struct bpf_offloaded_map *offmap, void *key, void *value) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx >= 0) memcpy(value, nmap->entry[idx].value, offmap->map.value_size); mutex_unlock(&nmap->mutex); return idx < 0 ? idx : 0; } static int nsim_map_update_elem(struct bpf_offloaded_map *offmap, void *key, void *value, u64 flags) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx, err = 0; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx < 0 && flags == BPF_EXIST) { err = idx; goto exit_unlock; } if (idx >= 0 && flags == BPF_NOEXIST) { err = -EEXIST; goto exit_unlock; } if (idx < 0) { for (idx = 0; idx < ARRAY_SIZE(nmap->entry); idx++) if (!nmap->entry[idx].key) break; if (idx == ARRAY_SIZE(nmap->entry)) { err = -E2BIG; goto exit_unlock; } err = nsim_map_alloc_elem(offmap, idx); if (err) goto exit_unlock; } memcpy(nmap->entry[idx].key, key, offmap->map.key_size); memcpy(nmap->entry[idx].value, value, offmap->map.value_size); exit_unlock: mutex_unlock(&nmap->mutex); return err; } static int nsim_map_delete_elem(struct bpf_offloaded_map *offmap, void *key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx; if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) return -EINVAL; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx >= 0) { kfree(nmap->entry[idx].key); kfree(nmap->entry[idx].value); memset(&nmap->entry[idx], 0, sizeof(nmap->entry[idx])); } mutex_unlock(&nmap->mutex); return idx < 0 ? idx : 0; } static const struct bpf_map_dev_ops nsim_bpf_map_ops = { .map_get_next_key = nsim_map_get_next_key, .map_lookup_elem = nsim_map_lookup_elem, .map_update_elem = nsim_map_update_elem, .map_delete_elem = nsim_map_delete_elem, }; static int nsim_bpf_map_alloc(struct netdevsim *ns, struct bpf_offloaded_map *offmap) { struct nsim_bpf_bound_map *nmap; int i, err; if (WARN_ON(offmap->map.map_type != BPF_MAP_TYPE_ARRAY && offmap->map.map_type != BPF_MAP_TYPE_HASH)) return -EINVAL; if (offmap->map.max_entries > NSIM_BPF_MAX_KEYS) return -ENOMEM; if (offmap->map.map_flags) return -EINVAL; nmap = kzalloc(sizeof(*nmap), GFP_KERNEL_ACCOUNT); if (!nmap) return -ENOMEM; offmap->dev_priv = nmap; nmap->ns = ns; nmap->map = offmap; mutex_init(&nmap->mutex); if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) { for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { u32 *key; err = nsim_map_alloc_elem(offmap, i); if (err) goto err_free; key = nmap->entry[i].key; *key = i; memset(nmap->entry[i].value, 0, offmap->map.value_size); } } offmap->dev_ops = &nsim_bpf_map_ops; list_add_tail(&nmap->l, &ns->nsim_dev->bpf_bound_maps); return 0; err_free: while (--i >= 0) { kfree(nmap->entry[i].key); kfree(nmap->entry[i].value); } kfree(nmap); return err; } static void nsim_bpf_map_free(struct bpf_offloaded_map *offmap) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; unsigned int i; for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { kfree(nmap->entry[i].key); kfree(nmap->entry[i].value); } list_del_init(&nmap->l); mutex_destroy(&nmap->mutex); kfree(nmap); } int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) { struct netdevsim *ns = netdev_priv(dev); int err; ASSERT_RTNL(); switch (bpf->command) { case XDP_SETUP_PROG: err = nsim_setup_prog_checks(ns, bpf); if (err) return err; return nsim_xdp_set_prog(ns, bpf, &ns->xdp); case XDP_SETUP_PROG_HW: err = nsim_setup_prog_hw_checks(ns, bpf); if (err) return err; return nsim_xdp_set_prog(ns, bpf, &ns->xdp_hw); case BPF_OFFLOAD_MAP_ALLOC: if (!ns->bpf_map_accept) return -EOPNOTSUPP; return nsim_bpf_map_alloc(ns, bpf->offmap); case BPF_OFFLOAD_MAP_FREE: nsim_bpf_map_free(bpf->offmap); return 0; default: return -EINVAL; } } int nsim_bpf_dev_init(struct nsim_dev *nsim_dev) { int err; INIT_LIST_HEAD(&nsim_dev->bpf_bound_progs); INIT_LIST_HEAD(&nsim_dev->bpf_bound_maps); nsim_dev->ddir_bpf_bound_progs = debugfs_create_dir("bpf_bound_progs", nsim_dev->ddir); if (IS_ERR(nsim_dev->ddir_bpf_bound_progs)) return PTR_ERR(nsim_dev->ddir_bpf_bound_progs); nsim_dev->bpf_dev = bpf_offload_dev_create(&nsim_bpf_dev_ops, nsim_dev); err = PTR_ERR_OR_ZERO(nsim_dev->bpf_dev); if (err) return err; nsim_dev->bpf_bind_accept = true; debugfs_create_bool("bpf_bind_accept", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_accept); debugfs_create_u32("bpf_bind_verifier_delay", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_verifier_delay); nsim_dev->bpf_bind_verifier_accept = true; debugfs_create_bool("bpf_bind_verifier_accept", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_verifier_accept); return 0; } void nsim_bpf_dev_exit(struct nsim_dev *nsim_dev) { WARN_ON(!list_empty(&nsim_dev->bpf_bound_progs)); WARN_ON(!list_empty(&nsim_dev->bpf_bound_maps)); bpf_offload_dev_destroy(nsim_dev->bpf_dev); } int nsim_bpf_init(struct netdevsim *ns) { struct dentry *ddir = ns->nsim_dev_port->ddir; int err; err = bpf_offload_dev_netdev_register(ns->nsim_dev->bpf_dev, ns->netdev); if (err) return err; debugfs_create_u32("bpf_offloaded_id", 0400, ddir, &ns->bpf_offloaded_id); ns->bpf_tc_accept = true; debugfs_create_bool("bpf_tc_accept", 0600, ddir, &ns->bpf_tc_accept); debugfs_create_bool("bpf_tc_non_bound_accept", 0600, ddir, &ns->bpf_tc_non_bound_accept); ns->bpf_xdpdrv_accept = true; debugfs_create_bool("bpf_xdpdrv_accept", 0600, ddir, &ns->bpf_xdpdrv_accept); ns->bpf_xdpoffload_accept = true; debugfs_create_bool("bpf_xdpoffload_accept", 0600, ddir, &ns->bpf_xdpoffload_accept); ns->bpf_map_accept = true; debugfs_create_bool("bpf_map_accept", 0600, ddir, &ns->bpf_map_accept); return 0; } void nsim_bpf_uninit(struct netdevsim *ns) { WARN_ON(ns->xdp.prog); WARN_ON(ns->xdp_hw.prog); WARN_ON(ns->bpf_offloaded); bpf_offload_dev_netdev_unregister(ns->nsim_dev->bpf_dev, ns->netdev); } |
36 38 37 44 50 49 172 44 171 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_UDP_TUNNEL_H #define __NET_UDP_TUNNEL_H #include <net/ip_tunnels.h> #include <net/udp.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ipv6_stubs.h> #endif struct udp_port_cfg { u8 family; /* Used only for kernel-created sockets */ union { struct in_addr local_ip; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr local_ip6; #endif }; union { struct in_addr peer_ip; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr peer_ip6; #endif }; __be16 local_udp_port; __be16 peer_udp_port; int bind_ifindex; unsigned int use_udp_checksums:1, use_udp6_tx_checksums:1, use_udp6_rx_checksums:1, ipv6_v6only:1; }; int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp); #if IS_ENABLED(CONFIG_IPV6) int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp); #else static inline int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { return 0; } #endif static inline int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { if (cfg->family == AF_INET) return udp_sock_create4(net, cfg, sockp); if (cfg->family == AF_INET6) return udp_sock_create6(net, cfg, sockp); return -EPFNOSUPPORT; } typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk, struct sk_buff *skb); typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, struct list_head *head, struct sk_buff *skb); typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb, int nhoff); struct udp_tunnel_sock_cfg { void *sk_user_data; /* user data used by encap_rcv call back */ /* Used for setting up udp_sock fields, see udp.h for details */ __u8 encap_type; udp_tunnel_encap_rcv_t encap_rcv; udp_tunnel_encap_err_lookup_t encap_err_lookup; udp_tunnel_encap_err_rcv_t encap_err_rcv; udp_tunnel_encap_destroy_t encap_destroy; udp_tunnel_gro_receive_t gro_receive; udp_tunnel_gro_complete_t gro_complete; }; /* Setup the given (UDP) sock to receive UDP encapsulated packets */ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *sock_cfg); /* -- List of parsable UDP tunnel types -- * * Adding to this list will result in serious debate. The main issue is * that this list is essentially a list of workarounds for either poorly * designed tunnels, or poorly designed device offloads. * * The parsing supported via these types should really be used for Rx * traffic only as the network stack will have already inserted offsets for * the location of the headers in the skb. In addition any ports that are * pushed should be kept within the namespace without leaking to other * devices such as VFs or other ports on the same device. * * It is strongly encouraged to use CHECKSUM_COMPLETE for Rx to avoid the * need to use this for Rx checksum offload. It should not be necessary to * call this function to perform Tx offloads on outgoing traffic. */ enum udp_parsable_tunnel_type { UDP_TUNNEL_TYPE_VXLAN = BIT(0), /* RFC 7348 */ UDP_TUNNEL_TYPE_GENEVE = BIT(1), /* draft-ietf-nvo3-geneve */ UDP_TUNNEL_TYPE_VXLAN_GPE = BIT(2), /* draft-ietf-nvo3-vxlan-gpe */ }; struct udp_tunnel_info { unsigned short type; sa_family_t sa_family; __be16 port; u8 hw_priv; }; /* Notify network devices of offloadable types */ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, unsigned short type); void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, unsigned short type); void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type); void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type); static inline void udp_tunnel_get_rx_info(struct net_device *dev) { ASSERT_RTNL(); if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); } static inline void udp_tunnel_drop_rx_info(struct net_device *dev) { ASSERT_RTNL(); if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); } /* Transmit the skb using UDP encapsulation. */ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, bool xnet, bool nocheck); int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, __be16 src_port, __be16 dst_port, bool nocheck); void udp_tunnel_sock_release(struct socket *sock); struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, struct net_device *dev, struct net *net, int oif, __be32 *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 tos, struct dst_cache *dst_cache); struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, struct net_device *dev, struct net *net, struct socket *sock, int oif, struct in6_addr *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 dsfield, struct dst_cache *dst_cache); struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, __be16 flags, __be64 tunnel_id, int md_size); #ifdef CONFIG_INET static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) { int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; return iptunnel_handle_offloads(skb, type); } #endif static inline void udp_tunnel_encap_enable(struct sock *sk) { if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) return; #if IS_ENABLED(CONFIG_IPV6) if (READ_ONCE(sk->sk_family) == PF_INET6) ipv6_stub->udpv6_encap_enable(); #endif udp_encap_enable(); } #define UDP_TUNNEL_NIC_MAX_TABLES 4 enum udp_tunnel_nic_info_flags { /* Device callbacks may sleep */ UDP_TUNNEL_NIC_INFO_MAY_SLEEP = BIT(0), /* Device only supports offloads when it's open, all ports * will be removed before close and re-added after open. */ UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(1), /* Device supports only IPv4 tunnels */ UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(2), /* Device has hard-coded the IANA VXLAN port (4789) as VXLAN. * This port must not be counted towards n_entries of any table. * Driver will not receive any callback associated with port 4789. */ UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(3), }; struct udp_tunnel_nic; #define UDP_TUNNEL_NIC_MAX_SHARING_DEVICES (U16_MAX / 2) struct udp_tunnel_nic_shared { struct udp_tunnel_nic *udp_tunnel_nic_info; struct list_head devices; }; struct udp_tunnel_nic_shared_node { struct net_device *dev; struct list_head list; }; /** * struct udp_tunnel_nic_info - driver UDP tunnel offload information * @set_port: callback for adding a new port * @unset_port: callback for removing a port * @sync_table: callback for syncing the entire port table at once * @shared: reference to device global state (optional) * @flags: device flags from enum udp_tunnel_nic_info_flags * @tables: UDP port tables this device has * @tables.n_entries: number of entries in this table * @tables.tunnel_types: types of tunnels this table accepts * * Drivers are expected to provide either @set_port and @unset_port callbacks * or the @sync_table callback. Callbacks are invoked with rtnl lock held. * * Devices which (misguidedly) share the UDP tunnel port table across multiple * netdevs should allocate an instance of struct udp_tunnel_nic_shared and * point @shared at it. * There must never be more than %UDP_TUNNEL_NIC_MAX_SHARING_DEVICES devices * sharing a table. * * Known limitations: * - UDP tunnel port notifications are fundamentally best-effort - * it is likely the driver will both see skbs which use a UDP tunnel port, * while not being a tunneled skb, and tunnel skbs from other ports - * drivers should only use these ports for non-critical RX-side offloads, * e.g. the checksum offload; * - none of the devices care about the socket family at present, so we don't * track it. Please extend this code if you care. */ struct udp_tunnel_nic_info { /* one-by-one */ int (*set_port)(struct net_device *dev, unsigned int table, unsigned int entry, struct udp_tunnel_info *ti); int (*unset_port)(struct net_device *dev, unsigned int table, unsigned int entry, struct udp_tunnel_info *ti); /* all at once */ int (*sync_table)(struct net_device *dev, unsigned int table); struct udp_tunnel_nic_shared *shared; unsigned int flags; struct udp_tunnel_nic_table_info { unsigned int n_entries; unsigned int tunnel_types; } tables[UDP_TUNNEL_NIC_MAX_TABLES]; }; /* UDP tunnel module dependencies * * Tunnel drivers are expected to have a hard dependency on the udp_tunnel * module. NIC drivers are not, they just attach their * struct udp_tunnel_nic_info to the netdev and wait for callbacks to come. * Loading a tunnel driver will cause the udp_tunnel module to be loaded * and only then will all the required state structures be allocated. * Since we want a weak dependency from the drivers and the core to udp_tunnel * we call things through the following stubs. */ struct udp_tunnel_nic_ops { void (*get_port)(struct net_device *dev, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti); void (*set_port_priv)(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv); void (*add_port)(struct net_device *dev, struct udp_tunnel_info *ti); void (*del_port)(struct net_device *dev, struct udp_tunnel_info *ti); void (*reset_ntf)(struct net_device *dev); size_t (*dump_size)(struct net_device *dev, unsigned int table); int (*dump_write)(struct net_device *dev, unsigned int table, struct sk_buff *skb); }; #ifdef CONFIG_INET extern const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops; #else #define udp_tunnel_nic_ops ((struct udp_tunnel_nic_ops *)NULL) #endif static inline void udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti) { /* This helper is used from .sync_table, we indicate empty entries * by zero'ed @ti. Drivers which need to know the details of a port * when it gets deleted should use the .set_port / .unset_port * callbacks. * Zero out here, otherwise !CONFIG_INET causes uninitilized warnings. */ memset(ti, 0, sizeof(*ti)); if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->get_port(dev, table, idx, ti); } static inline void udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv); } static inline void udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) { if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->add_port(dev, ti); } static inline void udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) { if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->del_port(dev, ti); } /** * udp_tunnel_nic_reset_ntf() - device-originating reset notification * @dev: network interface device structure * * Called by the driver to inform the core that the entire UDP tunnel port * state has been lost, usually due to device reset. Core will assume device * forgot all the ports and issue .set_port and .sync_table callbacks as * necessary. * * This function must be called with rtnl lock held, and will issue all * the callbacks before returning. */ static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev) { if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->reset_ntf(dev); } static inline size_t udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { if (!udp_tunnel_nic_ops) return 0; return udp_tunnel_nic_ops->dump_size(dev, table); } static inline int udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { if (!udp_tunnel_nic_ops) return 0; return udp_tunnel_nic_ops->dump_write(dev, table, skb); } #endif |
45 10 2 1 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H #include <linux/bug.h> #include <linux/percpu.h> #include <uapi/asm/debugreg.h> #include <asm/cpufeature.h> DECLARE_PER_CPU(unsigned long, cpu_dr7); #ifndef CONFIG_PARAVIRT_XXL /* * These special macros can be used to get or set a debugging register */ #define get_debugreg(var, register) \ (var) = native_get_debugreg(register) #define set_debugreg(value, register) \ native_set_debugreg(register, value) #endif static __always_inline unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! */ switch (regno) { case 0: asm("mov %%db0, %0" :"=r" (val)); break; case 1: asm("mov %%db1, %0" :"=r" (val)); break; case 2: asm("mov %%db2, %0" :"=r" (val)); break; case 3: asm("mov %%db3, %0" :"=r" (val)); break; case 6: asm("mov %%db6, %0" :"=r" (val)); break; case 7: /* * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them * with other code. * * This is needed because a DR7 access can cause a #VC exception * when running under SEV-ES. Taking a #VC exception is not a * safe thing to do just anywhere in the entry code and * re-ordering might place the access into an unsafe location. * * This happened in the NMI handler, where the DR7 read was * re-ordered to happen before the call to sev_es_ist_enter(), * causing stack recursion. */ asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER); break; default: BUG(); } return val; } static __always_inline void native_set_debugreg(int regno, unsigned long value) { switch (regno) { case 0: asm("mov %0, %%db0" ::"r" (value)); break; case 1: asm("mov %0, %%db1" ::"r" (value)); break; case 2: asm("mov %0, %%db2" ::"r" (value)); break; case 3: asm("mov %0, %%db3" ::"r" (value)); break; case 6: asm("mov %0, %%db6" ::"r" (value)); break; case 7: /* * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them * with other code. * * While is didn't happen with a DR7 write (see the DR7 read * comment above which explains where it happened), add the * __FORCE_ORDER here too to avoid similar problems in the * future. */ asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER); break; default: BUG(); } } static inline void hw_breakpoint_disable(void) { /* Zero the control register for HW Breakpoint */ set_debugreg(0UL, 7); /* Zero-out the individual HW breakpoint address registers */ set_debugreg(0UL, 0); set_debugreg(0UL, 1); set_debugreg(0UL, 2); set_debugreg(0UL, 3); } static __always_inline bool hw_breakpoint_active(void) { return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } extern void hw_breakpoint_restore(void); static __always_inline unsigned long local_db_save(void) { unsigned long dr7; if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active()) return 0; get_debugreg(dr7, 7); dr7 &= ~0x400; /* architecturally set bit */ if (dr7) set_debugreg(0, 7); /* * Ensure the compiler doesn't lower the above statements into * the critical section; disabling breakpoints late would not * be good. */ barrier(); return dr7; } static __always_inline void local_db_restore(unsigned long dr7) { /* * Ensure the compiler doesn't raise this statement into * the critical section; enabling breakpoints early would * not be good. */ barrier(); if (dr7) set_debugreg(dr7, 7); } #ifdef CONFIG_CPU_SUP_AMD extern void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr); extern unsigned long amd_get_dr_addr_mask(unsigned int dr); #else static inline void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { } static inline unsigned long amd_get_dr_addr_mask(unsigned int dr) { return 0; } #endif #endif /* _ASM_X86_DEBUGREG_H */ |
54 54 54 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COOKIE_H #define __LINUX_COOKIE_H #include <linux/atomic.h> #include <linux/percpu.h> #include <asm/local.h> struct pcpu_gen_cookie { local_t nesting; u64 last; } __aligned(16); struct gen_cookie { struct pcpu_gen_cookie __percpu *local; atomic64_t forward_last ____cacheline_aligned_in_smp; atomic64_t reverse_last; }; #define COOKIE_LOCAL_BATCH 4096 #define DEFINE_COOKIE(name) \ static DEFINE_PER_CPU(struct pcpu_gen_cookie, __##name); \ static struct gen_cookie name = { \ .local = &__##name, \ .forward_last = ATOMIC64_INIT(0), \ .reverse_last = ATOMIC64_INIT(0), \ } static __always_inline u64 gen_cookie_next(struct gen_cookie *gc) { struct pcpu_gen_cookie *local = this_cpu_ptr(gc->local); u64 val; if (likely(local_inc_return(&local->nesting) == 1)) { val = local->last; if (__is_defined(CONFIG_SMP) && unlikely((val & (COOKIE_LOCAL_BATCH - 1)) == 0)) { s64 next = atomic64_add_return(COOKIE_LOCAL_BATCH, &gc->forward_last); val = next - COOKIE_LOCAL_BATCH; } local->last = ++val; } else { val = atomic64_dec_return(&gc->reverse_last); } local_dec(&local->nesting); return val; } #endif /* __LINUX_COOKIE_H */ |
14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ #include "gateway_client.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/udp.h> #include <net/sock.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "hard-interface.h" #include "log.h" #include "netlink.h" #include "originator.h" #include "routing.h" #include "soft-interface.h" #include "translation-table.h" /* These are the offsets of the "hw type" and "hw address length" in the dhcp * packet starting at the beginning of the dhcp header */ #define BATADV_DHCP_HTYPE_OFFSET 1 #define BATADV_DHCP_HLEN_OFFSET 2 /* Value of htype representing Ethernet */ #define BATADV_DHCP_HTYPE_ETHERNET 0x01 /* This is the offset of the "chaddr" field in the dhcp packet starting at the * beginning of the dhcp header */ #define BATADV_DHCP_CHADDR_OFFSET 28 /** * batadv_gw_node_release() - release gw_node from lists and queue for free * after rcu grace period * @ref: kref pointer of the gw_node */ void batadv_gw_node_release(struct kref *ref) { struct batadv_gw_node *gw_node; gw_node = container_of(ref, struct batadv_gw_node, refcount); batadv_orig_node_put(gw_node->orig_node); kfree_rcu(gw_node, rcu); } /** * batadv_gw_get_selected_gw_node() - Get currently selected gateway * @bat_priv: the bat priv with all the soft interface information * * Return: selected gateway (with increased refcnt), NULL on errors */ struct batadv_gw_node * batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node; rcu_read_lock(); gw_node = rcu_dereference(bat_priv->gw.curr_gw); if (!gw_node) goto out; if (!kref_get_unless_zero(&gw_node->refcount)) gw_node = NULL; out: rcu_read_unlock(); return gw_node; } /** * batadv_gw_get_selected_orig() - Get originator of currently selected gateway * @bat_priv: the bat priv with all the soft interface information * * Return: orig_node of selected gateway (with increased refcnt), NULL on errors */ struct batadv_orig_node * batadv_gw_get_selected_orig(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node; struct batadv_orig_node *orig_node = NULL; gw_node = batadv_gw_get_selected_gw_node(bat_priv); if (!gw_node) goto out; rcu_read_lock(); orig_node = gw_node->orig_node; if (!orig_node) goto unlock; if (!kref_get_unless_zero(&orig_node->refcount)) orig_node = NULL; unlock: rcu_read_unlock(); out: batadv_gw_node_put(gw_node); return orig_node; } static void batadv_gw_select(struct batadv_priv *bat_priv, struct batadv_gw_node *new_gw_node) { struct batadv_gw_node *curr_gw_node; spin_lock_bh(&bat_priv->gw.list_lock); if (new_gw_node) kref_get(&new_gw_node->refcount); curr_gw_node = rcu_replace_pointer(bat_priv->gw.curr_gw, new_gw_node, true); batadv_gw_node_put(curr_gw_node); spin_unlock_bh(&bat_priv->gw.list_lock); } /** * batadv_gw_reselect() - force a gateway reselection * @bat_priv: the bat priv with all the soft interface information * * Set a flag to remind the GW component to perform a new gateway reselection. * However this function does not ensure that the current gateway is going to be * deselected. The reselection mechanism may elect the same gateway once again. * * This means that invoking batadv_gw_reselect() does not guarantee a gateway * change and therefore a uevent is not necessarily expected. */ void batadv_gw_reselect(struct batadv_priv *bat_priv) { atomic_set(&bat_priv->gw.reselect, 1); } /** * batadv_gw_check_client_stop() - check if client mode has been switched off * @bat_priv: the bat priv with all the soft interface information * * This function assumes the caller has checked that the gw state *is actually * changing*. This function is not supposed to be called when there is no state * change. */ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) { struct batadv_gw_node *curr_gw; if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) return; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (!curr_gw) return; /* deselect the current gateway so that next time that client mode is * enabled a proper GW_ADD event can be sent */ batadv_gw_select(bat_priv, NULL); /* if batman-adv is switching the gw client mode off and a gateway was * already selected, send a DEL uevent */ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); batadv_gw_node_put(curr_gw); } /** * batadv_gw_election() - Elect the best gateway * @bat_priv: the bat priv with all the soft interface information */ void batadv_gw_election(struct batadv_priv *bat_priv) { struct batadv_gw_node *curr_gw = NULL; struct batadv_gw_node *next_gw = NULL; struct batadv_neigh_node *router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; char gw_addr[18] = { '\0' }; if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) goto out; if (!bat_priv->algo_ops->gw.get_best_gw_node) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (!batadv_atomic_dec_not_zero(&bat_priv->gw.reselect) && curr_gw) goto out; /* if gw.reselect is set to 1 it means that a previous call to * gw.is_eligible() said that we have a new best GW, therefore it can * now be picked from the list and selected */ next_gw = bat_priv->algo_ops->gw.get_best_gw_node(bat_priv); if (curr_gw == next_gw) goto out; if (next_gw) { sprintf(gw_addr, "%pM", next_gw->orig_node->orig); router = batadv_orig_router_get(next_gw->orig_node, BATADV_IF_DEFAULT); if (!router) { batadv_gw_reselect(bat_priv); goto out; } router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) { batadv_gw_reselect(bat_priv); goto out; } } if (curr_gw && !next_gw) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Removing selected gateway - no gateway in range\n"); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); } else if (!curr_gw && next_gw) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Adding route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n", next_gw->orig_node->orig, next_gw->bandwidth_down / 10, next_gw->bandwidth_down % 10, next_gw->bandwidth_up / 10, next_gw->bandwidth_up % 10, router_ifinfo->bat_iv.tq_avg); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_ADD, gw_addr); } else { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Changing route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n", next_gw->orig_node->orig, next_gw->bandwidth_down / 10, next_gw->bandwidth_down % 10, next_gw->bandwidth_up / 10, next_gw->bandwidth_up % 10, router_ifinfo->bat_iv.tq_avg); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_CHANGE, gw_addr); } batadv_gw_select(bat_priv, next_gw); out: batadv_gw_node_put(curr_gw); batadv_gw_node_put(next_gw); batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(router_ifinfo); } /** * batadv_gw_check_election() - Elect orig node as best gateway when eligible * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked */ void batadv_gw_check_election(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_orig_node *curr_gw_orig; /* abort immediately if the routing algorithm does not support gateway * election */ if (!bat_priv->algo_ops->gw.is_eligible) return; curr_gw_orig = batadv_gw_get_selected_orig(bat_priv); if (!curr_gw_orig) goto reselect; /* this node already is the gateway */ if (curr_gw_orig == orig_node) goto out; if (!bat_priv->algo_ops->gw.is_eligible(bat_priv, curr_gw_orig, orig_node)) goto out; reselect: batadv_gw_reselect(bat_priv); out: batadv_orig_node_put(curr_gw_orig); } /** * batadv_gw_node_add() - add gateway node to list of available gateways * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information * * Has to be called with the appropriate locks being acquired * (gw.list_lock). */ static void batadv_gw_node_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_gateway_data *gateway) { struct batadv_gw_node *gw_node; lockdep_assert_held(&bat_priv->gw.list_lock); if (gateway->bandwidth_down == 0) return; gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC); if (!gw_node) return; kref_init(&gw_node->refcount); INIT_HLIST_NODE(&gw_node->list); kref_get(&orig_node->refcount); gw_node->orig_node = orig_node; gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); kref_get(&gw_node->refcount); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list); bat_priv->gw.generation++; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n", orig_node->orig, ntohl(gateway->bandwidth_down) / 10, ntohl(gateway->bandwidth_down) % 10, ntohl(gateway->bandwidth_up) / 10, ntohl(gateway->bandwidth_up) % 10); /* don't return reference to new gw_node */ batadv_gw_node_put(gw_node); } /** * batadv_gw_node_get() - retrieve gateway node from list of available gateways * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * * Return: gateway node if found or NULL otherwise. */ struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_gw_node *gw_node_tmp, *gw_node = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(gw_node_tmp, &bat_priv->gw.gateway_list, list) { if (gw_node_tmp->orig_node != orig_node) continue; if (!kref_get_unless_zero(&gw_node_tmp->refcount)) continue; gw_node = gw_node_tmp; break; } rcu_read_unlock(); return gw_node; } /** * batadv_gw_node_update() - update list of available gateways with changed * bandwidth information * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information */ void batadv_gw_node_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_gateway_data *gateway) { struct batadv_gw_node *gw_node, *curr_gw = NULL; spin_lock_bh(&bat_priv->gw.list_lock); gw_node = batadv_gw_node_get(bat_priv, orig_node); if (!gw_node) { batadv_gw_node_add(bat_priv, orig_node, gateway); spin_unlock_bh(&bat_priv->gw.list_lock); goto out; } spin_unlock_bh(&bat_priv->gw.list_lock); if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) && gw_node->bandwidth_up == ntohl(gateway->bandwidth_up)) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Gateway bandwidth of originator %pM changed from %u.%u/%u.%u MBit to %u.%u/%u.%u MBit\n", orig_node->orig, gw_node->bandwidth_down / 10, gw_node->bandwidth_down % 10, gw_node->bandwidth_up / 10, gw_node->bandwidth_up % 10, ntohl(gateway->bandwidth_down) / 10, ntohl(gateway->bandwidth_down) % 10, ntohl(gateway->bandwidth_up) / 10, ntohl(gateway->bandwidth_up) % 10); gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); if (ntohl(gateway->bandwidth_down) == 0) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Gateway %pM removed from gateway list\n", orig_node->orig); /* Note: We don't need a NULL check here, since curr_gw never * gets dereferenced. */ spin_lock_bh(&bat_priv->gw.list_lock); if (!hlist_unhashed(&gw_node->list)) { hlist_del_init_rcu(&gw_node->list); batadv_gw_node_put(gw_node); bat_priv->gw.generation++; } spin_unlock_bh(&bat_priv->gw.list_lock); curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); batadv_gw_node_put(curr_gw); } out: batadv_gw_node_put(gw_node); } /** * batadv_gw_node_delete() - Remove orig_node from gateway list * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is currently in process of being removed */ void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_tvlv_gateway_data gateway; gateway.bandwidth_down = 0; gateway.bandwidth_up = 0; batadv_gw_node_update(bat_priv, orig_node, &gateway); } /** * batadv_gw_node_free() - Free gateway information from soft interface * @bat_priv: the bat priv with all the soft interface information */ void batadv_gw_node_free(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node; struct hlist_node *node_tmp; spin_lock_bh(&bat_priv->gw.list_lock); hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.gateway_list, list) { hlist_del_init_rcu(&gw_node->list); batadv_gw_node_put(gw_node); bat_priv->gw.generation++; } spin_unlock_bh(&bat_priv->gw.list_lock); } /** * batadv_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * * Return: Error code, or length of message */ int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; struct net *net = sock_net(cb->skb->sk); struct net_device *soft_iface; struct batadv_priv *bat_priv; int ifindex; int ret; ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); if (!ifindex) return -EINVAL; soft_iface = dev_get_by_index(net, ifindex); if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { ret = -ENODEV; goto out; } bat_priv = netdev_priv(soft_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } if (!bat_priv->algo_ops->gw.dump) { ret = -EOPNOTSUPP; goto out; } bat_priv->algo_ops->gw.dump(msg, cb, bat_priv); ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(soft_iface); return ret; } /** * batadv_gw_dhcp_recipient_get() - check if a packet is a DHCP message * @skb: the packet to check * @header_len: a pointer to the batman-adv header size * @chaddr: buffer where the client address will be stored. Valid * only if the function returns BATADV_DHCP_TO_CLIENT * * This function may re-allocate the data buffer of the skb passed as argument. * * Return: * - BATADV_DHCP_NO if the packet is not a dhcp message or if there was an error * while parsing it * - BATADV_DHCP_TO_SERVER if this is a message going to the DHCP server * - BATADV_DHCP_TO_CLIENT if this is a message going to a DHCP client */ enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, u8 *chaddr) { enum batadv_dhcp_recipient ret = BATADV_DHCP_NO; struct ethhdr *ethhdr; struct iphdr *iphdr; struct ipv6hdr *ipv6hdr; struct udphdr *udphdr; struct vlan_ethhdr *vhdr; int chaddr_offset; __be16 proto; u8 *p; /* check for ethernet header */ if (!pskb_may_pull(skb, *header_len + ETH_HLEN)) return BATADV_DHCP_NO; ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; *header_len += ETH_HLEN; /* check for initial vlan header */ if (proto == htons(ETH_P_8021Q)) { if (!pskb_may_pull(skb, *header_len + VLAN_HLEN)) return BATADV_DHCP_NO; vhdr = vlan_eth_hdr(skb); proto = vhdr->h_vlan_encapsulated_proto; *header_len += VLAN_HLEN; } /* check for ip header */ switch (proto) { case htons(ETH_P_IP): if (!pskb_may_pull(skb, *header_len + sizeof(*iphdr))) return BATADV_DHCP_NO; iphdr = (struct iphdr *)(skb->data + *header_len); *header_len += iphdr->ihl * 4; /* check for udp header */ if (iphdr->protocol != IPPROTO_UDP) return BATADV_DHCP_NO; break; case htons(ETH_P_IPV6): if (!pskb_may_pull(skb, *header_len + sizeof(*ipv6hdr))) return BATADV_DHCP_NO; ipv6hdr = (struct ipv6hdr *)(skb->data + *header_len); *header_len += sizeof(*ipv6hdr); /* check for udp header */ if (ipv6hdr->nexthdr != IPPROTO_UDP) return BATADV_DHCP_NO; break; default: return BATADV_DHCP_NO; } if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr))) return BATADV_DHCP_NO; udphdr = (struct udphdr *)(skb->data + *header_len); *header_len += sizeof(*udphdr); /* check for bootp port */ switch (proto) { case htons(ETH_P_IP): if (udphdr->dest == htons(67)) ret = BATADV_DHCP_TO_SERVER; else if (udphdr->source == htons(67)) ret = BATADV_DHCP_TO_CLIENT; break; case htons(ETH_P_IPV6): if (udphdr->dest == htons(547)) ret = BATADV_DHCP_TO_SERVER; else if (udphdr->source == htons(547)) ret = BATADV_DHCP_TO_CLIENT; break; } chaddr_offset = *header_len + BATADV_DHCP_CHADDR_OFFSET; /* store the client address if the message is going to a client */ if (ret == BATADV_DHCP_TO_CLIENT) { if (!pskb_may_pull(skb, chaddr_offset + ETH_ALEN)) return BATADV_DHCP_NO; /* check if the DHCP packet carries an Ethernet DHCP */ p = skb->data + *header_len + BATADV_DHCP_HTYPE_OFFSET; if (*p != BATADV_DHCP_HTYPE_ETHERNET) return BATADV_DHCP_NO; /* check if the DHCP packet carries a valid Ethernet address */ p = skb->data + *header_len + BATADV_DHCP_HLEN_OFFSET; if (*p != ETH_ALEN) return BATADV_DHCP_NO; ether_addr_copy(chaddr, skb->data + chaddr_offset); } return ret; } /** * batadv_gw_out_of_range() - check if the dhcp request destination is the best * gateway * @bat_priv: the bat priv with all the soft interface information * @skb: the outgoing packet * * Check if the skb is a DHCP request and if it is sent to the current best GW * server. Due to topology changes it may be the case that the GW server * previously selected is not the best one anymore. * * This call might reallocate skb data. * Must be invoked only when the DHCP packet is going TO a DHCP SERVER. * * Return: true if the packet destination is unicast and it is not the best gw, * false otherwise. */ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_neigh_node *neigh_curr = NULL; struct batadv_neigh_node *neigh_old = NULL; struct batadv_orig_node *orig_dst_node = NULL; struct batadv_gw_node *gw_node = NULL; struct batadv_gw_node *curr_gw = NULL; struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo; struct ethhdr *ethhdr = (struct ethhdr *)skb->data; bool out_of_range = false; u8 curr_tq_avg; unsigned short vid; vid = batadv_get_vid(skb, 0); if (is_multicast_ether_addr(ethhdr->h_dest)) goto out; orig_dst_node = batadv_transtable_search(bat_priv, ethhdr->h_source, ethhdr->h_dest, vid); if (!orig_dst_node) goto out; gw_node = batadv_gw_node_get(bat_priv, orig_dst_node); if (!gw_node) goto out; switch (atomic_read(&bat_priv->gw.mode)) { case BATADV_GW_MODE_SERVER: /* If we are a GW then we are our best GW. We can artificially * set the tq towards ourself as the maximum value */ curr_tq_avg = BATADV_TQ_MAX_VALUE; break; case BATADV_GW_MODE_CLIENT: curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (!curr_gw) goto out; /* packet is going to our gateway */ if (curr_gw->orig_node == orig_dst_node) goto out; /* If the dhcp packet has been sent to a different gw, * we have to evaluate whether the old gw is still * reliable enough */ neigh_curr = batadv_find_router(bat_priv, curr_gw->orig_node, NULL); if (!neigh_curr) goto out; curr_ifinfo = batadv_neigh_ifinfo_get(neigh_curr, BATADV_IF_DEFAULT); if (!curr_ifinfo) goto out; curr_tq_avg = curr_ifinfo->bat_iv.tq_avg; batadv_neigh_ifinfo_put(curr_ifinfo); break; case BATADV_GW_MODE_OFF: default: goto out; } neigh_old = batadv_find_router(bat_priv, orig_dst_node, NULL); if (!neigh_old) goto out; old_ifinfo = batadv_neigh_ifinfo_get(neigh_old, BATADV_IF_DEFAULT); if (!old_ifinfo) goto out; if ((curr_tq_avg - old_ifinfo->bat_iv.tq_avg) > BATADV_GW_THRESHOLD) out_of_range = true; batadv_neigh_ifinfo_put(old_ifinfo); out: batadv_orig_node_put(orig_dst_node); batadv_gw_node_put(curr_gw); batadv_gw_node_put(gw_node); batadv_neigh_node_put(neigh_old); batadv_neigh_node_put(neigh_curr); return out_of_range; } |
2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 | // SPDX-License-Identifier: GPL-2.0-only /* * IEEE802154.4 socket interface * * Copyright 2007, 2008 Siemens AG * * Written by: * Sergey Lapin <slapin@ossfans.org> * Maxim Gorbachyov <maxim.gorbachev@siemens.com> */ #include <linux/net.h> #include <linux/capability.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/if.h> #include <linux/termios.h> /* For TIOCOUTQ/INQ */ #include <linux/list.h> #include <linux/slab.h> #include <linux/socket.h> #include <net/datalink.h> #include <net/psnap.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/route.h> #include <net/af_ieee802154.h> #include <net/ieee802154_netdev.h> /* Utility function for families */ static struct net_device* ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) { struct net_device *dev = NULL; struct net_device *tmp; __le16 pan_id, short_addr; u8 hwaddr[IEEE802154_ADDR_LEN]; switch (addr->mode) { case IEEE802154_ADDR_LONG: ieee802154_devaddr_to_raw(hwaddr, addr->extended_addr); rcu_read_lock(); dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, hwaddr); dev_hold(dev); rcu_read_unlock(); break; case IEEE802154_ADDR_SHORT: if (addr->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST) || addr->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || addr->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) break; rtnl_lock(); for_each_netdev(net, tmp) { if (tmp->type != ARPHRD_IEEE802154) continue; pan_id = tmp->ieee802154_ptr->pan_id; short_addr = tmp->ieee802154_ptr->short_addr; if (pan_id == addr->pan_id && short_addr == addr->short_addr) { dev = tmp; dev_hold(dev); break; } } rtnl_unlock(); break; default: pr_warn("Unsupported ieee802154 address type: %d\n", addr->mode); break; } return dev; } static int ieee802154_sock_release(struct socket *sock) { struct sock *sk = sock->sk; if (sk) { sock->sk = NULL; sk->sk_prot->close(sk, 0); } return 0; } static int ieee802154_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; return sk->sk_prot->sendmsg(sk, msg, len); } static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; if (sk->sk_prot->bind) return sk->sk_prot->bind(sk, uaddr, addr_len); return sock_no_bind(sock, uaddr, addr_len); } static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); return sk->sk_prot->connect(sk, uaddr, addr_len); } static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, unsigned int cmd) { struct ifreq ifr; int ret = -ENOIOCTLCMD; struct net_device *dev; if (get_user_ifreq(&ifr, NULL, arg)) return -EFAULT; ifr.ifr_name[IFNAMSIZ-1] = 0; dev_load(sock_net(sk), ifr.ifr_name); dev = dev_get_by_name(sock_net(sk), ifr.ifr_name); if (!dev) return -ENODEV; if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl) ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd); if (!ret && put_user_ifreq(&ifr, arg)) ret = -EFAULT; dev_put(dev); return ret; } static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; switch (cmd) { case SIOCGIFADDR: case SIOCSIFADDR: return ieee802154_dev_ioctl(sk, (struct ifreq __user *)arg, cmd); default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; return sk_ioctl(sk, cmd, (void __user *)arg); } } /* RAW Sockets (802.15.4 created in userspace) */ static HLIST_HEAD(raw_head); static DEFINE_RWLOCK(raw_lock); static int raw_hash(struct sock *sk) { write_lock_bh(&raw_lock); sk_add_node(sk, &raw_head); write_unlock_bh(&raw_lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } static void raw_unhash(struct sock *sk) { write_lock_bh(&raw_lock); if (sk_del_node_init(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&raw_lock); } static void raw_close(struct sock *sk, long timeout) { sk_common_release(sk); } static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) { struct ieee802154_addr addr; struct sockaddr_ieee802154 *uaddr = (struct sockaddr_ieee802154 *)_uaddr; int err = 0; struct net_device *dev = NULL; err = ieee802154_sockaddr_check_size(uaddr, len); if (err < 0) return err; uaddr = (struct sockaddr_ieee802154 *)_uaddr; if (uaddr->family != AF_IEEE802154) return -EINVAL; lock_sock(sk); ieee802154_addr_from_sa(&addr, &uaddr->addr); dev = ieee802154_get_dev(sock_net(sk), &addr); if (!dev) { err = -ENODEV; goto out; } sk->sk_bound_dev_if = dev->ifindex; sk_dst_reset(sk); dev_put(dev); out: release_sock(sk); return err; } static int raw_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { return -ENOTSUPP; } static int raw_disconnect(struct sock *sk, int flags) { return 0; } static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; struct sk_buff *skb; int hlen, tlen; int err; if (msg->msg_flags & MSG_OOB) { pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags); return -EOPNOTSUPP; } lock_sock(sk); if (!sk->sk_bound_dev_if) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); else dev = dev_get_by_index(sock_net(sk), sk->sk_bound_dev_if); release_sock(sk); if (!dev) { pr_debug("no dev\n"); err = -ENXIO; goto out; } mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { pr_debug("size = %zu, mtu = %u\n", size, mtu); err = -EMSGSIZE; goto out_dev; } if (!size) { err = 0; goto out_dev; } hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(sk, hlen + tlen + size, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto out_dev; skb_reserve(skb, hlen); skb_reset_mac_header(skb); skb_reset_network_header(skb); err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto out_skb; skb->dev = dev; skb->protocol = htons(ETH_P_IEEE802154); err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); dev_put(dev); return err ?: size; out_skb: kfree_skb(skb); out_dev: dev_put(dev); out: return err; } static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_cmsgs(msg, sk, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } return NET_RX_SUCCESS; } static void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb) { struct sock *sk; read_lock(&raw_lock); sk_for_each(sk, &raw_head) { bh_lock_sock(sk); if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) { struct sk_buff *clone; clone = skb_clone(skb, GFP_ATOMIC); if (clone) raw_rcv_skb(sk, clone); } bh_unlock_sock(sk); } read_unlock(&raw_lock); } static int raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { return -EOPNOTSUPP; } static int raw_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { return -EOPNOTSUPP; } static struct proto ieee802154_raw_prot = { .name = "IEEE-802.15.4-RAW", .owner = THIS_MODULE, .obj_size = sizeof(struct sock), .close = raw_close, .bind = raw_bind, .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .hash = raw_hash, .unhash = raw_unhash, .connect = raw_connect, .disconnect = raw_disconnect, .getsockopt = raw_getsockopt, .setsockopt = raw_setsockopt, }; static const struct proto_ops ieee802154_raw_ops = { .family = PF_IEEE802154, .owner = THIS_MODULE, .release = ieee802154_sock_release, .bind = ieee802154_sock_bind, .connect = ieee802154_sock_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = ieee802154_sock_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; /* DGRAM Sockets (802.15.4 dataframes) */ static HLIST_HEAD(dgram_head); static DEFINE_RWLOCK(dgram_lock); struct dgram_sock { struct sock sk; struct ieee802154_addr src_addr; struct ieee802154_addr dst_addr; unsigned int bound:1; unsigned int connected:1; unsigned int want_ack:1; unsigned int want_lqi:1; unsigned int secen:1; unsigned int secen_override:1; unsigned int seclevel:3; unsigned int seclevel_override:1; }; static inline struct dgram_sock *dgram_sk(const struct sock *sk) { return container_of(sk, struct dgram_sock, sk); } static int dgram_hash(struct sock *sk) { write_lock_bh(&dgram_lock); sk_add_node(sk, &dgram_head); write_unlock_bh(&dgram_lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } static void dgram_unhash(struct sock *sk) { write_lock_bh(&dgram_lock); if (sk_del_node_init(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&dgram_lock); } static int dgram_init(struct sock *sk) { struct dgram_sock *ro = dgram_sk(sk); ro->want_ack = 1; ro->want_lqi = 0; return 0; } static void dgram_close(struct sock *sk, long timeout) { sk_common_release(sk); } static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; struct ieee802154_addr haddr; struct dgram_sock *ro = dgram_sk(sk); int err = -EINVAL; struct net_device *dev; lock_sock(sk); ro->bound = 0; err = ieee802154_sockaddr_check_size(addr, len); if (err < 0) goto out; if (addr->family != AF_IEEE802154) { err = -EINVAL; goto out; } ieee802154_addr_from_sa(&haddr, &addr->addr); dev = ieee802154_get_dev(sock_net(sk), &haddr); if (!dev) { err = -ENODEV; goto out; } if (dev->type != ARPHRD_IEEE802154) { err = -ENODEV; goto out_put; } ro->src_addr = haddr; ro->bound = 1; err = 0; out_put: dev_put(dev); out: release_sock(sk); return err; } static int dgram_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { struct sk_buff *skb; *karg = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) { /* We will only return the amount * of this packet since that is all * that will be read. */ *karg = skb->len - ieee802154_hdr_length(skb); } spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; } } return -ENOIOCTLCMD; } /* FIXME: autobind */ static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; struct dgram_sock *ro = dgram_sk(sk); int err = 0; err = ieee802154_sockaddr_check_size(addr, len); if (err < 0) return err; if (addr->family != AF_IEEE802154) return -EINVAL; lock_sock(sk); if (!ro->bound) { err = -ENETUNREACH; goto out; } ieee802154_addr_from_sa(&ro->dst_addr, &addr->addr); ro->connected = 1; out: release_sock(sk); return err; } static int dgram_disconnect(struct sock *sk, int flags) { struct dgram_sock *ro = dgram_sk(sk); lock_sock(sk); ro->connected = 0; release_sock(sk); return 0; } static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; struct sk_buff *skb; struct ieee802154_mac_cb *cb; struct dgram_sock *ro = dgram_sk(sk); struct ieee802154_addr dst_addr; DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name); int hlen, tlen; int err; if (msg->msg_flags & MSG_OOB) { pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags); return -EOPNOTSUPP; } if (msg->msg_name) { if (ro->connected) return -EISCONN; if (msg->msg_namelen < IEEE802154_MIN_NAMELEN) return -EINVAL; err = ieee802154_sockaddr_check_size(daddr, msg->msg_namelen); if (err < 0) return err; ieee802154_addr_from_sa(&dst_addr, &daddr->addr); } else { if (!ro->connected) return -EDESTADDRREQ; dst_addr = ro->dst_addr; } if (!ro->bound) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); else dev = ieee802154_get_dev(sock_net(sk), &ro->src_addr); if (!dev) { pr_debug("no dev\n"); err = -ENXIO; goto out; } mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { pr_debug("size = %zu, mtu = %u\n", size, mtu); err = -EMSGSIZE; goto out_dev; } hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(sk, hlen + tlen + size, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto out_dev; skb_reserve(skb, hlen); skb_reset_network_header(skb); cb = mac_cb_init(skb); cb->type = IEEE802154_FC_TYPE_DATA; cb->ackreq = ro->want_ack; cb->secen = ro->secen; cb->secen_override = ro->secen_override; cb->seclevel = ro->seclevel; cb->seclevel_override = ro->seclevel_override; err = wpan_dev_hard_header(skb, dev, &dst_addr, ro->bound ? &ro->src_addr : NULL, size); if (err < 0) goto out_skb; err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto out_skb; skb->dev = dev; skb->protocol = htons(ETH_P_IEEE802154); err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); dev_put(dev); return err ?: size; out_skb: kfree_skb(skb); out_dev: dev_put(dev); out: return err; } static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; struct dgram_sock *ro = dgram_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } /* FIXME: skip headers if necessary ?! */ err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_cmsgs(msg, sk, skb); if (saddr) { /* Clear the implicit padding in struct sockaddr_ieee802154 * (16 bits between 'family' and 'addr') and in struct * ieee802154_addr_sa (16 bits at the end of the structure). */ memset(saddr, 0, sizeof(*saddr)); saddr->family = AF_IEEE802154; ieee802154_addr_to_sa(&saddr->addr, &mac_cb(skb)->source); *addr_len = sizeof(*saddr); } if (ro->want_lqi) { err = put_cmsg(msg, SOL_IEEE802154, WPAN_WANTLQI, sizeof(uint8_t), &(mac_cb(skb)->lqi)); if (err) goto done; } if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int dgram_rcv_skb(struct sock *sk, struct sk_buff *skb) { skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } return NET_RX_SUCCESS; } static inline bool ieee802154_match_sock(__le64 hw_addr, __le16 pan_id, __le16 short_addr, struct dgram_sock *ro) { if (!ro->bound) return true; if (ro->src_addr.mode == IEEE802154_ADDR_LONG && hw_addr == ro->src_addr.extended_addr) return true; if (ro->src_addr.mode == IEEE802154_ADDR_SHORT && pan_id == ro->src_addr.pan_id && short_addr == ro->src_addr.short_addr) return true; return false; } static int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) { struct sock *sk, *prev = NULL; int ret = NET_RX_SUCCESS; __le16 pan_id, short_addr; __le64 hw_addr; /* Data frame processing */ BUG_ON(dev->type != ARPHRD_IEEE802154); pan_id = dev->ieee802154_ptr->pan_id; short_addr = dev->ieee802154_ptr->short_addr; hw_addr = dev->ieee802154_ptr->extended_addr; read_lock(&dgram_lock); sk_for_each(sk, &dgram_head) { if (ieee802154_match_sock(hw_addr, pan_id, short_addr, dgram_sk(sk))) { if (prev) { struct sk_buff *clone; clone = skb_clone(skb, GFP_ATOMIC); if (clone) dgram_rcv_skb(prev, clone); } prev = sk; } } if (prev) { dgram_rcv_skb(prev, skb); } else { kfree_skb(skb); ret = NET_RX_DROP; } read_unlock(&dgram_lock); return ret; } static int dgram_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct dgram_sock *ro = dgram_sk(sk); int val, len; if (level != SOL_IEEE802154) return -EOPNOTSUPP; if (get_user(len, optlen)) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case WPAN_WANTACK: val = ro->want_ack; break; case WPAN_WANTLQI: val = ro->want_lqi; break; case WPAN_SECURITY: if (!ro->secen_override) val = WPAN_SECURITY_DEFAULT; else if (ro->secen) val = WPAN_SECURITY_ON; else val = WPAN_SECURITY_OFF; break; case WPAN_SECURITY_LEVEL: if (!ro->seclevel_override) val = WPAN_SECURITY_LEVEL_DEFAULT; else val = ro->seclevel; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int dgram_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct dgram_sock *ro = dgram_sk(sk); struct net *net = sock_net(sk); int val; int err = 0; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); switch (optname) { case WPAN_WANTACK: ro->want_ack = !!val; break; case WPAN_WANTLQI: ro->want_lqi = !!val; break; case WPAN_SECURITY: if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; break; } switch (val) { case WPAN_SECURITY_DEFAULT: ro->secen_override = 0; break; case WPAN_SECURITY_ON: ro->secen_override = 1; ro->secen = 1; break; case WPAN_SECURITY_OFF: ro->secen_override = 1; ro->secen = 0; break; default: err = -EINVAL; break; } break; case WPAN_SECURITY_LEVEL: if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; break; } if (val < WPAN_SECURITY_LEVEL_DEFAULT || val > IEEE802154_SCF_SECLEVEL_ENC_MIC128) { err = -EINVAL; } else if (val == WPAN_SECURITY_LEVEL_DEFAULT) { ro->seclevel_override = 0; } else { ro->seclevel_override = 1; ro->seclevel = val; } break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static struct proto ieee802154_dgram_prot = { .name = "IEEE-802.15.4-MAC", .owner = THIS_MODULE, .obj_size = sizeof(struct dgram_sock), .init = dgram_init, .close = dgram_close, .bind = dgram_bind, .sendmsg = dgram_sendmsg, .recvmsg = dgram_recvmsg, .hash = dgram_hash, .unhash = dgram_unhash, .connect = dgram_connect, .disconnect = dgram_disconnect, .ioctl = dgram_ioctl, .getsockopt = dgram_getsockopt, .setsockopt = dgram_setsockopt, }; static const struct proto_ops ieee802154_dgram_ops = { .family = PF_IEEE802154, .owner = THIS_MODULE, .release = ieee802154_sock_release, .bind = ieee802154_sock_bind, .connect = ieee802154_sock_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = ieee802154_sock_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; static void ieee802154_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); } /* Create a socket. Initialise the socket, blank the addresses * set the state. */ static int ieee802154_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; int rc; struct proto *proto; const struct proto_ops *ops; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; switch (sock->type) { case SOCK_RAW: rc = -EPERM; if (!capable(CAP_NET_RAW)) goto out; proto = &ieee802154_raw_prot; ops = &ieee802154_raw_ops; break; case SOCK_DGRAM: proto = &ieee802154_dgram_prot; ops = &ieee802154_dgram_ops; break; default: rc = -ESOCKTNOSUPPORT; goto out; } rc = -ENOMEM; sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern); if (!sk) goto out; rc = 0; sock->ops = ops; sock_init_data(sock, sk); sk->sk_destruct = ieee802154_sock_destruct; sk->sk_family = PF_IEEE802154; /* Checksums on by default */ sock_set_flag(sk, SOCK_ZAPPED); if (sk->sk_prot->hash) { rc = sk->sk_prot->hash(sk); if (rc) { sk_common_release(sk); goto out; } } if (sk->sk_prot->init) { rc = sk->sk_prot->init(sk); if (rc) sk_common_release(sk); } out: return rc; } static const struct net_proto_family ieee802154_family_ops = { .family = PF_IEEE802154, .create = ieee802154_create, .owner = THIS_MODULE, }; static int ieee802154_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (!netif_running(dev)) goto drop; pr_debug("got frame, type %d, dev %p\n", dev->type, dev); #ifdef DEBUG print_hex_dump_bytes("ieee802154_rcv ", DUMP_PREFIX_NONE, skb->data, skb->len); #endif if (!net_eq(dev_net(dev), &init_net)) goto drop; ieee802154_raw_deliver(dev, skb); if (dev->type != ARPHRD_IEEE802154) goto drop; if (skb->pkt_type != PACKET_OTHERHOST) return ieee802154_dgram_deliver(dev, skb); drop: kfree_skb(skb); return NET_RX_DROP; } static struct packet_type ieee802154_packet_type = { .type = htons(ETH_P_IEEE802154), .func = ieee802154_rcv, }; static int __init af_ieee802154_init(void) { int rc; rc = proto_register(&ieee802154_raw_prot, 1); if (rc) goto out; rc = proto_register(&ieee802154_dgram_prot, 1); if (rc) goto err_dgram; /* Tell SOCKET that we are alive */ rc = sock_register(&ieee802154_family_ops); if (rc) goto err_sock; dev_add_pack(&ieee802154_packet_type); rc = 0; goto out; err_sock: proto_unregister(&ieee802154_dgram_prot); err_dgram: proto_unregister(&ieee802154_raw_prot); out: return rc; } static void __exit af_ieee802154_remove(void) { dev_remove_pack(&ieee802154_packet_type); sock_unregister(PF_IEEE802154); proto_unregister(&ieee802154_dgram_prot); proto_unregister(&ieee802154_raw_prot); } module_init(af_ieee802154_init); module_exit(af_ieee802154_remove); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_IEEE802154); |
53 50 48 46 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_RTNH_H #define __NET_RTNH_H #include <linux/rtnetlink.h> #include <net/netlink.h> static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining) { return remaining >= (int)sizeof(*rtnh) && rtnh->rtnh_len >= sizeof(*rtnh) && rtnh->rtnh_len <= remaining; } static inline struct rtnexthop *rtnh_next(const struct rtnexthop *rtnh, int *remaining) { int totlen = NLA_ALIGN(rtnh->rtnh_len); *remaining -= totlen; return (struct rtnexthop *) ((char *) rtnh + totlen); } static inline struct nlattr *rtnh_attrs(const struct rtnexthop *rtnh) { return (struct nlattr *) ((char *) rtnh + NLA_ALIGN(sizeof(*rtnh))); } static inline int rtnh_attrlen(const struct rtnexthop *rtnh) { return rtnh->rtnh_len - NLA_ALIGN(sizeof(*rtnh)); } #endif |
272 168 166 82 82 135 332 332 332 17 332 332 332 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 | // SPDX-License-Identifier: GPL-2.0-only /* * TCP CUBIC: Binary Increase Congestion control for TCP v2.3 * Home page: * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC * This is from the implementation of CUBIC TCP in * Sangtae Ha, Injong Rhee and Lisong Xu, * "CUBIC: A New TCP-Friendly High-Speed TCP Variant" * in ACM SIGOPS Operating System Review, July 2008. * Available from: * http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf * * CUBIC integrates a new slow start algorithm, called HyStart. * The details of HyStart are presented in * Sangtae Ha and Injong Rhee, * "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008. * Available from: * http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf * * All testing results are available from: * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing * * Unless CUBIC is enabled and congestion window is large * this behaves the same as the original Reno. */ #include <linux/mm.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/module.h> #include <linux/math64.h> #include <net/tcp.h> #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta */ #define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ /* Two methods of hybrid slow start */ #define HYSTART_ACK_TRAIN 0x1 #define HYSTART_DELAY 0x2 /* Number of delay samples for detecting the increase of delay */ #define HYSTART_MIN_SAMPLES 8 #define HYSTART_DELAY_MIN (4000U) /* 4 ms */ #define HYSTART_DELAY_MAX (16000U) /* 16 ms */ #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) static int fast_convergence __read_mostly = 1; static int beta __read_mostly = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */ static int initial_ssthresh __read_mostly; static int bic_scale __read_mostly = 41; static int tcp_friendliness __read_mostly = 1; static int hystart __read_mostly = 1; static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; static int hystart_low_window __read_mostly = 16; static int hystart_ack_delta_us __read_mostly = 2000; static u32 cube_rtt_scale __read_mostly; static u32 beta_scale __read_mostly; static u64 cube_factor __read_mostly; /* Note parameters that are used for precomputing scale factors are read-only */ module_param(fast_convergence, int, 0644); MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); module_param(beta, int, 0644); MODULE_PARM_DESC(beta, "beta for multiplicative increase"); module_param(initial_ssthresh, int, 0644); MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); module_param(bic_scale, int, 0444); MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)"); module_param(tcp_friendliness, int, 0644); MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); module_param(hystart, int, 0644); MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm"); module_param(hystart_detect, int, 0644); MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); module_param(hystart_ack_delta_us, int, 0644); MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)"); /* BIC TCP Parameters */ struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ u32 last_max_cwnd; /* last maximum snd_cwnd */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ u32 bic_origin_point;/* origin point of bic function */ u32 bic_K; /* time to origin point from the beginning of the current epoch */ u32 delay_min; /* min delay (usec) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ u16 unused; u8 sample_cnt; /* number of samples to decide curr_rtt */ u8 found; /* the exit point is found? */ u32 round_start; /* beginning of each round */ u32 end_seq; /* end_seq of the round */ u32 last_ack; /* last time when the ACK spacing is close */ u32 curr_rtt; /* the minimum rtt of current round */ }; static inline void bictcp_reset(struct bictcp *ca) { memset(ca, 0, offsetof(struct bictcp, unused)); ca->found = 0; } static inline u32 bictcp_clock_us(const struct sock *sk) { return tcp_sk(sk)->tcp_mstamp; } static inline void bictcp_hystart_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); ca->round_start = ca->last_ack = bictcp_clock_us(sk); ca->end_seq = tp->snd_nxt; ca->curr_rtt = ~0U; ca->sample_cnt = 0; } __bpf_kfunc static void cubictcp_init(struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); if (hystart) bictcp_hystart_reset(sk); if (!hystart && initial_ssthresh) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } __bpf_kfunc static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { struct bictcp *ca = inet_csk_ca(sk); u32 now = tcp_jiffies32; s32 delta; delta = now - tcp_sk(sk)->lsndtime; /* We were application limited (idle) for a while. * Shift epoch_start to keep cwnd growth to cubic curve. */ if (ca->epoch_start && delta > 0) { ca->epoch_start += delta; if (after(ca->epoch_start, now)) ca->epoch_start = now; } return; } } /* calculate the cubic root of x using a table lookup followed by one * Newton-Raphson iteration. * Avg err ~= 0.195% */ static u32 cubic_root(u64 a) { u32 x, b, shift; /* * cbrt(x) MSB values for x MSB values in [0..63]. * Precomputed then refined by hand - Willy Tarreau * * For x in [0..63], * v = cbrt(x << 18) - 1 * cbrt(x) = (v[x] + 10) >> 6 */ static const u8 v[] = { /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118, /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156, /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179, /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199, /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215, /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229, /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242, /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254, }; b = fls64(a); if (b < 7) { /* a in [0..63] */ return ((u32)v[(u32)a] + 35) >> 6; } b = ((b * 84) >> 8) - 1; shift = (a >> (b * 3)); x = ((u32)(((u32)v[shift] + 10) << b)) >> 6; /* * Newton-Raphson iteration * 2 * x = ( 2 * x + a / x ) / 3 * k+1 k k */ x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1))); x = ((x * 341) >> 10); return x; } /* * Compute congestion window to use. */ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) { u32 delta, bic_target, max_cnt; u64 offs, t; ca->ack_cnt += acked; /* count the number of ACKed packets */ if (ca->last_cwnd == cwnd && (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) return; /* The CUBIC function can update ca->cnt at most once per jiffy. * On all cwnd reduction events, ca->epoch_start is set to 0, * which will force a recalculation of ca->cnt. */ if (ca->epoch_start && tcp_jiffies32 == ca->last_time) goto tcp_friendliness; ca->last_cwnd = cwnd; ca->last_time = tcp_jiffies32; if (ca->epoch_start == 0) { ca->epoch_start = tcp_jiffies32; /* record beginning */ ca->ack_cnt = acked; /* start counting */ ca->tcp_cwnd = cwnd; /* syn with cubic */ if (ca->last_max_cwnd <= cwnd) { ca->bic_K = 0; ca->bic_origin_point = cwnd; } else { /* Compute new K based on * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) */ ca->bic_K = cubic_root(cube_factor * (ca->last_max_cwnd - cwnd)); ca->bic_origin_point = ca->last_max_cwnd; } } /* cubic function - calc*/ /* calculate c * time^3 / rtt, * while considering overflow in calculation of time^3 * (so time^3 is done by using 64 bit) * and without the support of division of 64bit numbers * (so all divisions are done by using 32 bit) * also NOTE the unit of those veriables * time = (t - K) / 2^bictcp_HZ * c = bic_scale >> 10 * rtt = (srtt >> 3) / HZ * !!! The following code does not have overflow problems, * if the cwnd < 1 million packets !!! */ t = (s32)(tcp_jiffies32 - ca->epoch_start); t += usecs_to_jiffies(ca->delay_min); /* change the unit from HZ to bictcp_HZ */ t <<= BICTCP_HZ; do_div(t, HZ); if (t < ca->bic_K) /* t - K */ offs = ca->bic_K - t; else offs = t - ca->bic_K; /* c/rtt * (t-K)^3 */ delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); if (t < ca->bic_K) /* below origin*/ bic_target = ca->bic_origin_point - delta; else /* above origin*/ bic_target = ca->bic_origin_point + delta; /* cubic function - calc bictcp_cnt*/ if (bic_target > cwnd) { ca->cnt = cwnd / (bic_target - cwnd); } else { ca->cnt = 100 * cwnd; /* very small increment*/ } /* * The initial growth of cubic function may be too conservative * when the available bandwidth is still unknown. */ if (ca->last_max_cwnd == 0 && ca->cnt > 20) ca->cnt = 20; /* increase cwnd 5% per RTT */ tcp_friendliness: /* TCP Friendly */ if (tcp_friendliness) { u32 scale = beta_scale; delta = (cwnd * scale) >> 3; while (ca->ack_cnt > delta) { /* update tcp cwnd */ ca->ack_cnt -= delta; ca->tcp_cwnd++; } if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */ delta = ca->tcp_cwnd - cwnd; max_cnt = cwnd / delta; if (ca->cnt > max_cnt) ca->cnt = max_cnt; } } /* The maximum rate of cwnd increase CUBIC allows is 1 packet per * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT. */ ca->cnt = max(ca->cnt, 2U); } __bpf_kfunc static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); if (!tcp_is_cwnd_limited(sk)) return; if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } bictcp_update(ca, tcp_snd_cwnd(tp), acked); tcp_cong_avoid_ai(tp, ca->cnt, acked); } __bpf_kfunc static u32 cubictcp_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); ca->epoch_start = 0; /* end of epoch */ /* Wmax and fast convergence */ if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence) ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta)) / (2 * BICTCP_BETA_SCALE); else ca->last_max_cwnd = tcp_snd_cwnd(tp); return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U); } __bpf_kfunc static void cubictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); bictcp_hystart_reset(sk); } } /* Account for TSO/GRO delays. * Otherwise short RTT flows could get too small ssthresh, since during * slow start we begin with small TSO packets and ca->delay_min would * not account for long aggregation delay when TSO packets get bigger. * Ideally even with a very small RTT we would like to have at least one * TSO packet being sent and received by GRO, and another one in qdisc layer. * We apply another 100% factor because @rate is doubled at this point. * We cap the cushion to 1ms. */ static u32 hystart_ack_delay(const struct sock *sk) { unsigned long rate; rate = READ_ONCE(sk->sk_pacing_rate); if (!rate) return 0; return min_t(u64, USEC_PER_MSEC, div64_ul((u64)sk->sk_gso_max_size * 4 * USEC_PER_SEC, rate)); } static void hystart_update(struct sock *sk, u32 delay) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); u32 threshold; if (after(tp->snd_una, ca->end_seq)) bictcp_hystart_reset(sk); if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock_us(sk); /* first detection parameter - ack-train detection */ if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) { ca->last_ack = now; threshold = ca->delay_min + hystart_ack_delay(sk); /* Hystart ack train triggers if we get ack past * ca->delay_min/2. * Pacing might have delayed packets up to RTT/2 * during slow start. */ if (sk->sk_pacing_status == SK_PACING_NONE) threshold >>= 1; if ((s32)(now - ca->round_start) > threshold) { ca->found = 1; pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n", now - ca->round_start, threshold, ca->delay_min, hystart_ack_delay(sk), tcp_snd_cwnd(tp)); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINDETECT); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINCWND, tcp_snd_cwnd(tp)); tp->snd_ssthresh = tcp_snd_cwnd(tp); } } } if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ if (ca->curr_rtt > delay) ca->curr_rtt = delay; if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { ca->found = 1; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYDETECT); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYCWND, tcp_snd_cwnd(tp)); tp->snd_ssthresh = tcp_snd_cwnd(tp); } } } } __bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); u32 delay; /* Some calls are for duplicates without timetamps */ if (sample->rtt_us < 0) return; /* Discard delay samples right after fast recovery */ if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) return; delay = sample->rtt_us; if (delay == 0) delay = 1; /* first time call or link delay decreases */ if (ca->delay_min == 0 || ca->delay_min > delay) ca->delay_min = delay; /* hystart triggers when cwnd is larger than some threshold */ if (!ca->found && tcp_in_slow_start(tp) && hystart && tcp_snd_cwnd(tp) >= hystart_low_window) hystart_update(sk, delay); } static struct tcp_congestion_ops cubictcp __read_mostly = { .init = cubictcp_init, .ssthresh = cubictcp_recalc_ssthresh, .cong_avoid = cubictcp_cong_avoid, .set_state = cubictcp_state, .undo_cwnd = tcp_reno_undo_cwnd, .cwnd_event = cubictcp_cwnd_event, .pkts_acked = cubictcp_acked, .owner = THIS_MODULE, .name = "cubic", }; BTF_SET8_START(tcp_cubic_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, cubictcp_init) BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh) BTF_ID_FLAGS(func, cubictcp_cong_avoid) BTF_ID_FLAGS(func, cubictcp_state) BTF_ID_FLAGS(func, cubictcp_cwnd_event) BTF_ID_FLAGS(func, cubictcp_acked) #endif #endif BTF_SET8_END(tcp_cubic_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { .owner = THIS_MODULE, .set = &tcp_cubic_check_kfunc_ids, }; static int __init cubictcp_register(void) { int ret; BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); /* Precompute a bunch of the scaling factors that are used per-packet * based on SRTT of 100ms */ beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3 / (BICTCP_BETA_SCALE - beta); cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 * so K = cubic_root( (wmax-cwnd)*rtt/c ) * the unit of K is bictcp_HZ=2^10, not HZ * * c = bic_scale >> 10 * rtt = 100ms * * the following code has been designed and tested for * cwnd < 1 million packets * RTT < 100 seconds * HZ < 1,000,00 (corresponding to 10 nano-second) */ /* 1/c * 2^2*bictcp_HZ * srtt */ cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */ /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set); if (ret < 0) return ret; return tcp_register_congestion_control(&cubictcp); } static void __exit cubictcp_unregister(void) { tcp_unregister_congestion_control(&cubictcp); } module_init(cubictcp_register); module_exit(cubictcp_unregister); MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("CUBIC TCP"); MODULE_VERSION("2.3"); |
7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | // SPDX-License-Identifier: GPL-2.0-only /* * crc16.c */ #include <linux/types.h> #include <linux/module.h> #include <linux/crc16.h> /** CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */ u16 const crc16_table[256] = { 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 }; EXPORT_SYMBOL(crc16_table); /** * crc16 - compute the CRC-16 for the data buffer * @crc: previous CRC value * @buffer: data pointer * @len: number of bytes in the buffer * * Returns the updated CRC value. */ u16 crc16(u16 crc, u8 const *buffer, size_t len) { while (len--) crc = crc16_byte(crc, *buffer++); return crc; } EXPORT_SYMBOL(crc16); MODULE_DESCRIPTION("CRC16 calculations"); MODULE_LICENSE("GPL"); |
1255 63 1191 835 63 771 1255 772 1255 122 122 122 122 122 122 122 122 122 122 32 58 58 58 58 122 122 122 122 122 5 10 10 58 53 54 122 10 122 58 122 122 122 121 112 112 1072 1070 1069 1070 1072 1072 609 609 114 773 265 3 773 115 115 1 114 2 1 1 1 1 772 770 768 770 769 771 1269 1267 1266 1267 1267 3 53 63 62 61 59 58 56 55 2 53 54 2 53 2 1 5 1 53 54 63 1283 1243 43 1284 4 30 1 766 17 765 8 8 7 7 24 938 8 7 2 500 5 939 940 939 1 938 939 938 939 13 3 936 129 84 936 937 931 500 2 500 498 499 506 509 3 3 2 1 1 1 2 216 2 1 274 273 271 269 131 35 140 268 268 8 6 5 4 3 262 13 261 259 12 12 12 12 12 258 134 259 122 137 216 218 216 217 94 219 9 220 227 232 62 61 60 60 58 1 57 4 56 56 46 46 45 47 52 18 17 16 14 13 12 11 9 1 8 7 1 7 7 7 7 7 7 7 7 7 11 18 31 30 29 27 26 25 24 3 23 23 2 23 31 2 1 1 7 7 6 5 1 4 2 2 2 2 4 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/vmalloc.h> #include <linux/etherdevice.h> #include <linux/filter.h> #include <linux/rcupdate_trace.h> #include <linux/sched/signal.h> #include <net/bpf_sk_storage.h> #include <net/sock.h> #include <net/tcp.h> #include <net/net_namespace.h> #include <net/page_pool/helpers.h> #include <linux/error-injection.h> #include <linux/smp.h> #include <linux/sock_diag.h> #include <linux/netfilter.h> #include <net/netdev_rx_queue.h> #include <net/xdp.h> #include <net/netfilter/nf_bpf_link.h> #define CREATE_TRACE_POINTS #include <trace/events/bpf_test_run.h> struct bpf_test_timer { enum { NO_PREEMPT, NO_MIGRATE } mode; u32 i; u64 time_start, time_spent; }; static void bpf_test_timer_enter(struct bpf_test_timer *t) __acquires(rcu) { rcu_read_lock(); if (t->mode == NO_PREEMPT) preempt_disable(); else migrate_disable(); t->time_start = ktime_get_ns(); } static void bpf_test_timer_leave(struct bpf_test_timer *t) __releases(rcu) { t->time_start = 0; if (t->mode == NO_PREEMPT) preempt_enable(); else migrate_enable(); rcu_read_unlock(); } static bool bpf_test_timer_continue(struct bpf_test_timer *t, int iterations, u32 repeat, int *err, u32 *duration) __must_hold(rcu) { t->i += iterations; if (t->i >= repeat) { /* We're done. */ t->time_spent += ktime_get_ns() - t->time_start; do_div(t->time_spent, t->i); *duration = t->time_spent > U32_MAX ? U32_MAX : (u32)t->time_spent; *err = 0; goto reset; } if (signal_pending(current)) { /* During iteration: we've been cancelled, abort. */ *err = -EINTR; goto reset; } if (need_resched()) { /* During iteration: we need to reschedule between runs. */ t->time_spent += ktime_get_ns() - t->time_start; bpf_test_timer_leave(t); cond_resched(); bpf_test_timer_enter(t); } /* Do another round. */ return true; reset: t->i = 0; return false; } /* We put this struct at the head of each page with a context and frame * initialised when the page is allocated, so we don't have to do this on each * repetition of the test run. */ struct xdp_page_head { struct xdp_buff orig_ctx; struct xdp_buff ctx; union { /* ::data_hard_start starts here */ DECLARE_FLEX_ARRAY(struct xdp_frame, frame); DECLARE_FLEX_ARRAY(u8, data); }; }; struct xdp_test_data { struct xdp_buff *orig_ctx; struct xdp_rxq_info rxq; struct net_device *dev; struct page_pool *pp; struct xdp_frame **frames; struct sk_buff **skbs; struct xdp_mem_info mem; u32 batch_size; u32 frame_cnt; }; /* tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c:%MAX_PKT_SIZE * must be updated accordingly this gets changed, otherwise BPF selftests * will fail. */ #define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head)) #define TEST_XDP_MAX_BATCH 256 static void xdp_test_run_init_page(struct page *page, void *arg) { struct xdp_page_head *head = phys_to_virt(page_to_phys(page)); struct xdp_buff *new_ctx, *orig_ctx; u32 headroom = XDP_PACKET_HEADROOM; struct xdp_test_data *xdp = arg; size_t frm_len, meta_len; struct xdp_frame *frm; void *data; orig_ctx = xdp->orig_ctx; frm_len = orig_ctx->data_end - orig_ctx->data_meta; meta_len = orig_ctx->data - orig_ctx->data_meta; headroom -= meta_len; new_ctx = &head->ctx; frm = head->frame; data = head->data; memcpy(data + headroom, orig_ctx->data_meta, frm_len); xdp_init_buff(new_ctx, TEST_XDP_FRAME_SIZE, &xdp->rxq); xdp_prepare_buff(new_ctx, data, headroom, frm_len, true); new_ctx->data = new_ctx->data_meta + meta_len; xdp_update_frame_from_buff(new_ctx, frm); frm->mem = new_ctx->rxq->mem; memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx)); } static int xdp_test_run_setup(struct xdp_test_data *xdp, struct xdp_buff *orig_ctx) { struct page_pool *pp; int err = -ENOMEM; struct page_pool_params pp_params = { .order = 0, .flags = 0, .pool_size = xdp->batch_size, .nid = NUMA_NO_NODE, .init_callback = xdp_test_run_init_page, .init_arg = xdp, }; xdp->frames = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); if (!xdp->frames) return -ENOMEM; xdp->skbs = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); if (!xdp->skbs) goto err_skbs; pp = page_pool_create(&pp_params); if (IS_ERR(pp)) { err = PTR_ERR(pp); goto err_pp; } /* will copy 'mem.id' into pp->xdp_mem_id */ err = xdp_reg_mem_model(&xdp->mem, MEM_TYPE_PAGE_POOL, pp); if (err) goto err_mmodel; xdp->pp = pp; /* We create a 'fake' RXQ referencing the original dev, but with an * xdp_mem_info pointing to our page_pool */ xdp_rxq_info_reg(&xdp->rxq, orig_ctx->rxq->dev, 0, 0); xdp->rxq.mem.type = MEM_TYPE_PAGE_POOL; xdp->rxq.mem.id = pp->xdp_mem_id; xdp->dev = orig_ctx->rxq->dev; xdp->orig_ctx = orig_ctx; return 0; err_mmodel: page_pool_destroy(pp); err_pp: kvfree(xdp->skbs); err_skbs: kvfree(xdp->frames); return err; } static void xdp_test_run_teardown(struct xdp_test_data *xdp) { xdp_unreg_mem_model(&xdp->mem); page_pool_destroy(xdp->pp); kfree(xdp->frames); kfree(xdp->skbs); } static bool frame_was_changed(const struct xdp_page_head *head) { /* xdp_scrub_frame() zeroes the data pointer, flags is the last field, * i.e. has the highest chances to be overwritten. If those two are * untouched, it's most likely safe to skip the context reset. */ return head->frame->data != head->orig_ctx.data || head->frame->flags != head->orig_ctx.flags; } static bool ctx_was_changed(struct xdp_page_head *head) { return head->orig_ctx.data != head->ctx.data || head->orig_ctx.data_meta != head->ctx.data_meta || head->orig_ctx.data_end != head->ctx.data_end; } static void reset_ctx(struct xdp_page_head *head) { if (likely(!frame_was_changed(head) && !ctx_was_changed(head))) return; head->ctx.data = head->orig_ctx.data; head->ctx.data_meta = head->orig_ctx.data_meta; head->ctx.data_end = head->orig_ctx.data_end; xdp_update_frame_from_buff(&head->ctx, head->frame); } static int xdp_recv_frames(struct xdp_frame **frames, int nframes, struct sk_buff **skbs, struct net_device *dev) { gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; int i, n; LIST_HEAD(list); n = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, (void **)skbs); if (unlikely(n == 0)) { for (i = 0; i < nframes; i++) xdp_return_frame(frames[i]); return -ENOMEM; } for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; skb = __xdp_build_skb_from_frame(xdpf, skb, dev); if (!skb) { xdp_return_frame(xdpf); continue; } list_add_tail(&skb->list, &list); } netif_receive_skb_list(&list); return 0; } static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, u32 repeat) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); int err = 0, act, ret, i, nframes = 0, batch_sz; struct xdp_frame **frames = xdp->frames; struct xdp_page_head *head; struct xdp_frame *frm; bool redirect = false; struct xdp_buff *ctx; struct page *page; batch_sz = min_t(u32, repeat, xdp->batch_size); local_bh_disable(); xdp_set_return_frame_no_direct(); for (i = 0; i < batch_sz; i++) { page = page_pool_dev_alloc_pages(xdp->pp); if (!page) { err = -ENOMEM; goto out; } head = phys_to_virt(page_to_phys(page)); reset_ctx(head); ctx = &head->ctx; frm = head->frame; xdp->frame_cnt++; act = bpf_prog_run_xdp(prog, ctx); /* if program changed pkt bounds we need to update the xdp_frame */ if (unlikely(ctx_was_changed(head))) { ret = xdp_update_frame_from_buff(ctx, frm); if (ret) { xdp_return_buff(ctx); continue; } } switch (act) { case XDP_TX: /* we can't do a real XDP_TX since we're not in the * driver, so turn it into a REDIRECT back to the same * index */ ri->tgt_index = xdp->dev->ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; fallthrough; case XDP_REDIRECT: redirect = true; ret = xdp_do_redirect_frame(xdp->dev, ctx, frm, prog); if (ret) xdp_return_buff(ctx); break; case XDP_PASS: frames[nframes++] = frm; break; default: bpf_warn_invalid_xdp_action(NULL, prog, act); fallthrough; case XDP_DROP: xdp_return_buff(ctx); break; } } out: if (redirect) xdp_do_flush(); if (nframes) { ret = xdp_recv_frames(frames, nframes, xdp->skbs, xdp->dev); if (ret) err = ret; } xdp_clear_return_frame_no_direct(); local_bh_enable(); return err; } static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx, u32 repeat, u32 batch_size, u32 *time) { struct xdp_test_data xdp = { .batch_size = batch_size }; struct bpf_test_timer t = { .mode = NO_MIGRATE }; int ret; if (!repeat) repeat = 1; ret = xdp_test_run_setup(&xdp, ctx); if (ret) return ret; bpf_test_timer_enter(&t); do { xdp.frame_cnt = 0; ret = xdp_test_run_batch(&xdp, prog, repeat - t.i); if (unlikely(ret < 0)) break; } while (bpf_test_timer_continue(&t, xdp.frame_cnt, repeat, &ret, time)); bpf_test_timer_leave(&t); xdp_test_run_teardown(&xdp); return ret; } static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { struct bpf_prog_array_item item = {.prog = prog}; struct bpf_run_ctx *old_ctx; struct bpf_cg_run_ctx run_ctx; struct bpf_test_timer t = { NO_MIGRATE }; enum bpf_cgroup_storage_type stype; int ret; for_each_cgroup_storage_type(stype) { item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(item.cgroup_storage[stype])) { item.cgroup_storage[stype] = NULL; for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(item.cgroup_storage[stype]); return -ENOMEM; } } if (!repeat) repeat = 1; bpf_test_timer_enter(&t); old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); do { run_ctx.prog_item = &item; local_bh_disable(); if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = bpf_prog_run(prog, ctx); local_bh_enable(); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time)); bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(item.cgroup_storage[stype]); return ret; } static int bpf_test_finish(const union bpf_attr *kattr, union bpf_attr __user *uattr, const void *data, struct skb_shared_info *sinfo, u32 size, u32 retval, u32 duration) { void __user *data_out = u64_to_user_ptr(kattr->test.data_out); int err = -EFAULT; u32 copy_size = size; /* Clamp copy if the user has provided a size hint, but copy the full * buffer if not to retain old behaviour. */ if (kattr->test.data_size_out && copy_size > kattr->test.data_size_out) { copy_size = kattr->test.data_size_out; err = -ENOSPC; } if (data_out) { int len = sinfo ? copy_size - sinfo->xdp_frags_size : copy_size; if (len < 0) { err = -ENOSPC; goto out; } if (copy_to_user(data_out, data, len)) goto out; if (sinfo) { int i, offset = len; u32 data_len; for (i = 0; i < sinfo->nr_frags; i++) { skb_frag_t *frag = &sinfo->frags[i]; if (offset >= copy_size) { err = -ENOSPC; break; } data_len = min_t(u32, copy_size - offset, skb_frag_size(frag)); if (copy_to_user(data_out + offset, skb_frag_address(frag), data_len)) goto out; offset += data_len; } } } if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size))) goto out; if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) goto out; if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration))) goto out; if (err != -ENOSPC) err = 0; out: trace_bpf_test_finish(&err); return err; } /* Integer types of various sizes and pointer combinations cover variety of * architecture dependent calling conventions. 7+ can be supported in the * future. */ __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_fentry_test1(int a) { return a + 1; } EXPORT_SYMBOL_GPL(bpf_fentry_test1); int noinline bpf_fentry_test2(int a, u64 b) { return a + b; } int noinline bpf_fentry_test3(char a, int b, u64 c) { return a + b + c; } int noinline bpf_fentry_test4(void *a, char b, int c, u64 d) { return (long)a + b + c + d; } int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) { return a + (long)b + c + d + e; } int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) { return a + (long)b + c + d + (long)e + f; } struct bpf_fentry_test_t { struct bpf_fentry_test_t *a; }; int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) { asm volatile ("": "+r"(arg)); return (long)arg; } int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg) { return (long)arg->a; } __bpf_kfunc u32 bpf_fentry_test9(u32 *a) { return *a; } void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) { } __bpf_kfunc int bpf_modify_return_test(int a, int *b) { *b += 1; return a + *b; } __bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, void *e, char f, int g) { *b += 1; return a + *b + c + d + (long)e + f + g; } int noinline bpf_fentry_shadow_test(int a) { return a + 1; } struct prog_test_member1 { int a; }; struct prog_test_member { struct prog_test_member1 m; int c; }; struct prog_test_ref_kfunc { int a; int b; struct prog_test_member memb; struct prog_test_ref_kfunc *next; refcount_t cnt; }; __bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) { refcount_dec(&p->cnt); } __bpf_kfunc void bpf_kfunc_call_test_release_dtor(void *p) { bpf_kfunc_call_test_release(p); } CFI_NOSEAL(bpf_kfunc_call_test_release_dtor); __bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p) { } __bpf_kfunc void bpf_kfunc_call_memb_release_dtor(void *p) { } CFI_NOSEAL(bpf_kfunc_call_memb_release_dtor); __bpf_kfunc_end_defs(); BTF_SET8_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) BTF_ID_FLAGS(func, bpf_modify_return_test2) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_SET8_END(bpf_test_modify_return_ids) static const struct btf_kfunc_id_set bpf_test_modify_return_set = { .owner = THIS_MODULE, .set = &bpf_test_modify_return_ids, }; BTF_SET8_START(test_sk_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE) BTF_SET8_END(test_sk_check_kfunc_ids) static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, u32 size, u32 headroom, u32 tailroom) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); void *data; if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); if (user_size > size) return ERR_PTR(-EMSGSIZE); size = SKB_DATA_ALIGN(size); data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); if (copy_from_user(data + headroom, data_in, user_size)) { kfree(data); return ERR_PTR(-EFAULT); } return data; } int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct bpf_fentry_test_t arg = {}; u16 side_effect = 0, ret = 0; int b = 2, err = -EFAULT; u32 retval = 0; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 || bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 || bpf_fentry_test8(&arg) != 0 || bpf_fentry_test9(&retval) != 0) goto out; break; case BPF_MODIFY_RETURN: ret = bpf_modify_return_test(1, &b); if (b != 2) side_effect++; b = 2; ret += bpf_modify_return_test2(1, &b, 3, 4, (void *)5, 6, 7); if (b != 2) side_effect++; break; default: goto out; } retval = ((u32)side_effect << 16) | ret; if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) goto out; err = 0; out: trace_bpf_test_finish(&err); return err; } struct bpf_raw_tp_test_run_info { struct bpf_prog *prog; void *ctx; u32 retval; }; static void __bpf_prog_test_run_raw_tp(void *data) { struct bpf_raw_tp_test_run_info *info = data; rcu_read_lock(); info->retval = bpf_prog_run(info->prog, info->ctx); rcu_read_unlock(); } int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); __u32 ctx_size_in = kattr->test.ctx_size_in; struct bpf_raw_tp_test_run_info info; int cpu = kattr->test.cpu, err = 0; int current_cpu; /* doesn't support data_in/out, ctx_out, duration, or repeat */ if (kattr->test.data_in || kattr->test.data_out || kattr->test.ctx_out || kattr->test.duration || kattr->test.repeat || kattr->test.batch_size) return -EINVAL; if (ctx_size_in < prog->aux->max_ctx_offset || ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64)) return -EINVAL; if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) return -EINVAL; if (ctx_size_in) { info.ctx = memdup_user(ctx_in, ctx_size_in); if (IS_ERR(info.ctx)) return PTR_ERR(info.ctx); } else { info.ctx = NULL; } info.prog = prog; current_cpu = get_cpu(); if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 || cpu == current_cpu) { __bpf_prog_test_run_raw_tp(&info); } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { /* smp_call_function_single() also checks cpu_online() * after csd_lock(). However, since cpu is from user * space, let's do an extra quick check to filter out * invalid value before smp_call_function_single(). */ err = -ENXIO; } else { err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp, &info, 1); } put_cpu(); if (!err && copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32))) err = -EFAULT; kfree(info.ctx); return err; } static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) { void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); u32 size = kattr->test.ctx_size_in; void *data; int err; if (!data_in && !data_out) return NULL; data = kzalloc(max_size, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); if (data_in) { err = bpf_check_uarg_tail_zero(USER_BPFPTR(data_in), max_size, size); if (err) { kfree(data); return ERR_PTR(err); } size = min_t(u32, max_size, size); if (copy_from_user(data, data_in, size)) { kfree(data); return ERR_PTR(-EFAULT); } } return data; } static int bpf_ctx_finish(const union bpf_attr *kattr, union bpf_attr __user *uattr, const void *data, u32 size) { void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); int err = -EFAULT; u32 copy_size = size; if (!data || !data_out) return 0; if (copy_size > kattr->test.ctx_size_out) { copy_size = kattr->test.ctx_size_out; err = -ENOSPC; } if (copy_to_user(data_out, data, copy_size)) goto out; if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size))) goto out; if (err != -ENOSPC) err = 0; out: return err; } /** * range_is_zero - test whether buffer is initialized * @buf: buffer to check * @from: check from this position * @to: check up until (excluding) this position * * This function returns true if the there is a non-zero byte * in the buf in the range [from,to). */ static inline bool range_is_zero(void *buf, size_t from, size_t to) { return !memchr_inv((u8 *)buf + from, 0, to - from); } static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; if (!__skb) return 0; /* make sure the fields we don't use are zeroed */ if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark))) return -EINVAL; /* mark is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark), offsetof(struct __sk_buff, priority))) return -EINVAL; /* priority is allowed */ /* ingress_ifindex is allowed */ /* ifindex is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, ifindex), offsetof(struct __sk_buff, cb))) return -EINVAL; /* cb is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), offsetof(struct __sk_buff, tstamp))) return -EINVAL; /* tstamp is allowed */ /* wire_len is allowed */ /* gso_segs is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), offsetof(struct __sk_buff, gso_size))) return -EINVAL; /* gso_size is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_size), offsetof(struct __sk_buff, hwtstamp))) return -EINVAL; /* hwtstamp is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, hwtstamp), sizeof(struct __sk_buff))) return -EINVAL; skb->mark = __skb->mark; skb->priority = __skb->priority; skb->skb_iif = __skb->ingress_ifindex; skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); if (__skb->wire_len == 0) { cb->pkt_len = skb->len; } else { if (__skb->wire_len < skb->len || __skb->wire_len > GSO_LEGACY_MAX_SIZE) return -EINVAL; cb->pkt_len = __skb->wire_len; } if (__skb->gso_segs > GSO_MAX_SEGS) return -EINVAL; skb_shinfo(skb)->gso_segs = __skb->gso_segs; skb_shinfo(skb)->gso_size = __skb->gso_size; skb_shinfo(skb)->hwtstamps.hwtstamp = __skb->hwtstamp; return 0; } static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; if (!__skb) return; __skb->mark = skb->mark; __skb->priority = skb->priority; __skb->ingress_ifindex = skb->skb_iif; __skb->ifindex = skb->dev->ifindex; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); __skb->wire_len = cb->pkt_len; __skb->gso_segs = skb_shinfo(skb)->gso_segs; __skb->hwtstamp = skb_shinfo(skb)->hwtstamps.hwtstamp; } static struct proto bpf_dummy_proto = { .name = "bpf_dummy", .owner = THIS_MODULE, .obj_size = sizeof(struct sock), }; int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool is_l2 = false, is_direct_pkt_access = false; struct net *net = current->nsproxy->net_ns; struct net_device *dev = net->loopback_dev; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct __sk_buff *ctx = NULL; u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; struct sock *sk; void *data; int ret; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) return PTR_ERR(data); ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); if (IS_ERR(ctx)) { kfree(data); return PTR_ERR(ctx); } switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: is_l2 = true; fallthrough; case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: is_direct_pkt_access = true; break; default: break; } sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); if (!sk) { kfree(data); kfree(ctx); return -ENOMEM; } sock_init_data(NULL, sk); skb = slab_build_skb(data); if (!skb) { kfree(data); kfree(ctx); sk_free(sk); return -ENOMEM; } skb->sk = sk; skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); if (ctx && ctx->ifindex > 1) { dev = dev_get_by_index(net, ctx->ifindex); if (!dev) { ret = -ENODEV; goto out; } } skb->protocol = eth_type_trans(skb, dev); skb_reset_network_header(skb); switch (skb->protocol) { case htons(ETH_P_IP): sk->sk_family = AF_INET; if (sizeof(struct iphdr) <= skb_headlen(skb)) { sk->sk_rcv_saddr = ip_hdr(skb)->saddr; sk->sk_daddr = ip_hdr(skb)->daddr; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): sk->sk_family = AF_INET6; if (sizeof(struct ipv6hdr) <= skb_headlen(skb)) { sk->sk_v6_rcv_saddr = ipv6_hdr(skb)->saddr; sk->sk_v6_daddr = ipv6_hdr(skb)->daddr; } break; #endif default: break; } if (is_l2) __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); ret = convert___skb_to_skb(skb, ctx); if (ret) goto out; ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false); if (ret) goto out; if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { ret = -ENOMEM; goto out; } } memset(__skb_push(skb, hh_len), 0, hh_len); } convert_skb_to___skb(skb, ctx); size = skb->len; /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) size = skb_headlen(skb); ret = bpf_test_finish(kattr, uattr, skb->data, NULL, size, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct __sk_buff)); out: if (dev && dev != net->loopback_dev) dev_put(dev); kfree_skb(skb); sk_free(sk); kfree(ctx); return ret; } static int xdp_convert_md_to_buff(struct xdp_md *xdp_md, struct xdp_buff *xdp) { unsigned int ingress_ifindex, rx_queue_index; struct netdev_rx_queue *rxqueue; struct net_device *device; if (!xdp_md) return 0; if (xdp_md->egress_ifindex != 0) return -EINVAL; ingress_ifindex = xdp_md->ingress_ifindex; rx_queue_index = xdp_md->rx_queue_index; if (!ingress_ifindex && rx_queue_index) return -EINVAL; if (ingress_ifindex) { device = dev_get_by_index(current->nsproxy->net_ns, ingress_ifindex); if (!device) return -ENODEV; if (rx_queue_index >= device->real_num_rx_queues) goto free_dev; rxqueue = __netif_get_rx_queue(device, rx_queue_index); if (!xdp_rxq_info_is_reg(&rxqueue->xdp_rxq)) goto free_dev; xdp->rxq = &rxqueue->xdp_rxq; /* The device is now tracked in the xdp->rxq for later * dev_put() */ } xdp->data = xdp->data_meta + xdp_md->data; return 0; free_dev: dev_put(device); return -EINVAL; } static void xdp_convert_buff_to_md(struct xdp_buff *xdp, struct xdp_md *xdp_md) { if (!xdp_md) return; xdp_md->data = xdp->data - xdp->data_meta; xdp_md->data_end = xdp->data_end - xdp->data_meta; if (xdp_md->ingress_ifindex) dev_put(xdp->rxq->dev); } int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); u32 batch_size = kattr->test.batch_size; u32 retval = 0, duration, max_data_sz; u32 size = kattr->test.data_size_in; u32 headroom = XDP_PACKET_HEADROOM; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; struct skb_shared_info *sinfo; struct xdp_buff xdp = {}; int i, ret = -EINVAL; struct xdp_md *ctx; void *data; if (prog->expected_attach_type == BPF_XDP_DEVMAP || prog->expected_attach_type == BPF_XDP_CPUMAP) return -EINVAL; if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES) return -EINVAL; if (bpf_prog_is_dev_bound(prog->aux)) return -EINVAL; if (do_live) { if (!batch_size) batch_size = NAPI_POLL_WEIGHT; else if (batch_size > TEST_XDP_MAX_BATCH) return -E2BIG; headroom += sizeof(struct xdp_page_head); } else if (batch_size) { return -EINVAL; } ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md)); if (IS_ERR(ctx)) return PTR_ERR(ctx); if (ctx) { /* There can't be user provided data before the meta data */ if (ctx->data_meta || ctx->data_end != size || ctx->data > ctx->data_end || unlikely(xdp_metalen_invalid(ctx->data)) || (do_live && (kattr->test.data_out || kattr->test.ctx_out))) goto free_ctx; /* Meta data is allocated from the headroom */ headroom -= ctx->data; } max_data_sz = 4096 - headroom - tailroom; if (size > max_data_sz) { /* disallow live data mode for jumbo frames */ if (do_live) goto free_ctx; size = max_data_sz; } data = bpf_test_init(kattr, size, max_data_sz, headroom, tailroom); if (IS_ERR(data)) { ret = PTR_ERR(data); goto free_ctx; } rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); rxqueue->xdp_rxq.frag_size = headroom + max_data_sz + tailroom; xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq); xdp_prepare_buff(&xdp, data, headroom, size, true); sinfo = xdp_get_shared_info_from_buff(&xdp); ret = xdp_convert_md_to_buff(ctx, &xdp); if (ret) goto free_data; if (unlikely(kattr->test.data_size_in > size)) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); while (size < kattr->test.data_size_in) { struct page *page; skb_frag_t *frag; u32 data_len; if (sinfo->nr_frags == MAX_SKB_FRAGS) { ret = -ENOMEM; goto out; } page = alloc_page(GFP_KERNEL); if (!page) { ret = -ENOMEM; goto out; } frag = &sinfo->frags[sinfo->nr_frags++]; data_len = min_t(u32, kattr->test.data_size_in - size, PAGE_SIZE); skb_frag_fill_page_desc(frag, page, 0, data_len); if (copy_from_user(page_address(page), data_in + size, data_len)) { ret = -EFAULT; goto out; } sinfo->xdp_frags_size += data_len; size += data_len; } xdp_buff_set_frags_flag(&xdp); } if (repeat > 1) bpf_prog_change_xdp(NULL, prog); if (do_live) ret = bpf_test_run_xdp_live(prog, &xdp, repeat, batch_size, &duration); else ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); /* We convert the xdp_buff back to an xdp_md before checking the return * code so the reference count of any held netdevice will be decremented * even if the test run failed. */ xdp_convert_buff_to_md(&xdp, ctx); if (ret) goto out; size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size; ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct xdp_md)); out: if (repeat > 1) bpf_prog_change_xdp(prog, NULL); free_data: for (i = 0; i < sinfo->nr_frags; i++) __free_page(skb_frag_page(&sinfo->frags[i])); kfree(data); free_ctx: kfree(ctx); return ret; } static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) { /* make sure the fields we don't use are zeroed */ if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags))) return -EINVAL; /* flags is allowed */ if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags), sizeof(struct bpf_flow_keys))) return -EINVAL; return 0; } int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct bpf_test_timer t = { NO_PREEMPT }; u32 size = kattr->test.data_size_in; struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_flow_keys *user_ctx; struct bpf_flow_keys flow_keys; const struct ethhdr *eth; unsigned int flags = 0; u32 retval, duration; void *data; int ret; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (size < ETH_HLEN) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, size, 0, 0); if (IS_ERR(data)) return PTR_ERR(data); eth = (struct ethhdr *)data; if (!repeat) repeat = 1; user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys)); if (IS_ERR(user_ctx)) { kfree(data); return PTR_ERR(user_ctx); } if (user_ctx) { ret = verify_user_bpf_flow_keys(user_ctx); if (ret) goto out; flags = user_ctx->flags; } ctx.flow_keys = &flow_keys; ctx.data = data; ctx.data_end = (__u8 *)data + size; bpf_test_timer_enter(&t); do { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, size, flags); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); bpf_test_timer_leave(&t); if (ret < 0) goto out; ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL, sizeof(flow_keys), retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(struct bpf_flow_keys)); out: kfree(user_ctx); kfree(data); return ret; } int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct bpf_test_timer t = { NO_PREEMPT }; struct bpf_prog_array *progs = NULL; struct bpf_sk_lookup_kern ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_sk_lookup *user_ctx; u32 retval, duration; int ret = -EINVAL; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || kattr->test.data_size_out) return -EINVAL; if (!repeat) repeat = 1; user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx)); if (IS_ERR(user_ctx)) return PTR_ERR(user_ctx); if (!user_ctx) return -EINVAL; if (user_ctx->sk) goto out; if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) goto out; if (user_ctx->local_port > U16_MAX) { ret = -ERANGE; goto out; } ctx.family = (u16)user_ctx->family; ctx.protocol = (u16)user_ctx->protocol; ctx.dport = (u16)user_ctx->local_port; ctx.sport = user_ctx->remote_port; switch (ctx.family) { case AF_INET: ctx.v4.daddr = (__force __be32)user_ctx->local_ip4; ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6; ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6; break; #endif default: ret = -EAFNOSUPPORT; goto out; } progs = bpf_prog_array_alloc(1, GFP_KERNEL); if (!progs) { ret = -ENOMEM; goto out; } progs->items[0].prog = prog; bpf_test_timer_enter(&t); do { ctx.selected_sk = NULL; retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); bpf_test_timer_leave(&t); if (ret < 0) goto out; user_ctx->cookie = 0; if (ctx.selected_sk) { if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) { ret = -EOPNOTSUPP; goto out; } user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); } ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); out: bpf_prog_array_free(progs); kfree(user_ctx); return ret; } int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); __u32 ctx_size_in = kattr->test.ctx_size_in; void *ctx = NULL; u32 retval; int err = 0; /* doesn't support data_in/out, ctx_out, duration, or repeat or flags */ if (kattr->test.data_in || kattr->test.data_out || kattr->test.ctx_out || kattr->test.duration || kattr->test.repeat || kattr->test.flags || kattr->test.batch_size) return -EINVAL; if (ctx_size_in < prog->aux->max_ctx_offset || ctx_size_in > U16_MAX) return -EINVAL; if (ctx_size_in) { ctx = memdup_user(ctx_in, ctx_size_in); if (IS_ERR(ctx)) return PTR_ERR(ctx); } rcu_read_lock_trace(); retval = bpf_prog_run_pin_on_cpu(prog, ctx); rcu_read_unlock_trace(); if (copy_to_user(&uattr->test.retval, &retval, sizeof(u32))) { err = -EFAULT; goto out; } if (ctx_size_in) if (copy_to_user(ctx_in, ctx, ctx_size_in)) err = -EFAULT; out: kfree(ctx); return err; } static int verify_and_copy_hook_state(struct nf_hook_state *state, const struct nf_hook_state *user, struct net_device *dev) { if (user->in || user->out) return -EINVAL; if (user->net || user->sk || user->okfn) return -EINVAL; switch (user->pf) { case NFPROTO_IPV4: case NFPROTO_IPV6: switch (state->hook) { case NF_INET_PRE_ROUTING: state->in = dev; break; case NF_INET_LOCAL_IN: state->in = dev; break; case NF_INET_FORWARD: state->in = dev; state->out = dev; break; case NF_INET_LOCAL_OUT: state->out = dev; break; case NF_INET_POST_ROUTING: state->out = dev; break; } break; default: return -EINVAL; } state->pf = user->pf; state->hook = user->hook; return 0; } static __be16 nfproto_eth(int nfproto) { switch (nfproto) { case NFPROTO_IPV4: return htons(ETH_P_IP); case NFPROTO_IPV6: break; } return htons(ETH_P_IPV6); } int bpf_prog_test_run_nf(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct net *net = current->nsproxy->net_ns; struct net_device *dev = net->loopback_dev; struct nf_hook_state *user_ctx, hook_state = { .pf = NFPROTO_IPV4, .hook = NF_INET_LOCAL_OUT, }; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct bpf_nf_ctx ctx = { .state = &hook_state, }; struct sk_buff *skb = NULL; u32 retval, duration; void *data; int ret; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (size < sizeof(struct iphdr)) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) return PTR_ERR(data); if (!repeat) repeat = 1; user_ctx = bpf_ctx_init(kattr, sizeof(struct nf_hook_state)); if (IS_ERR(user_ctx)) { kfree(data); return PTR_ERR(user_ctx); } if (user_ctx) { ret = verify_and_copy_hook_state(&hook_state, user_ctx, dev); if (ret) goto out; } skb = slab_build_skb(data); if (!skb) { ret = -ENOMEM; goto out; } data = NULL; /* data released via kfree_skb */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); ret = -EINVAL; if (hook_state.hook != NF_INET_LOCAL_OUT) { if (size < ETH_HLEN + sizeof(struct iphdr)) goto out; skb->protocol = eth_type_trans(skb, dev); switch (skb->protocol) { case htons(ETH_P_IP): if (hook_state.pf == NFPROTO_IPV4) break; goto out; case htons(ETH_P_IPV6): if (size < ETH_HLEN + sizeof(struct ipv6hdr)) goto out; if (hook_state.pf == NFPROTO_IPV6) break; goto out; default: ret = -EPROTO; goto out; } skb_reset_network_header(skb); } else { skb->protocol = nfproto_eth(hook_state.pf); } ctx.skb = skb; ret = bpf_test_run(prog, &ctx, repeat, &retval, &duration, false); if (ret) goto out; ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); out: kfree(user_ctx); kfree_skb(skb); kfree(data); return ret; } static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = { .owner = THIS_MODULE, .set = &test_sk_check_kfunc_ids, }; BTF_ID_LIST(bpf_prog_test_dtor_kfunc_ids) BTF_ID(struct, prog_test_ref_kfunc) BTF_ID(func, bpf_kfunc_call_test_release_dtor) BTF_ID(struct, prog_test_member) BTF_ID(func, bpf_kfunc_call_memb_release_dtor) static int __init bpf_prog_test_run_init(void) { const struct btf_id_dtor_kfunc bpf_prog_test_dtor_kfunc[] = { { .btf_id = bpf_prog_test_dtor_kfunc_ids[0], .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[1] }, { .btf_id = bpf_prog_test_dtor_kfunc_ids[2], .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[3], }, }; int ret; ret = register_btf_fmodret_id_set(&bpf_test_modify_return_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc, ARRAY_SIZE(bpf_prog_test_dtor_kfunc), THIS_MODULE); } late_initcall(bpf_prog_test_run_init); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 | /* SPDX-License-Identifier: GPL-2.0 */ /* * connection tracking expectations. */ #ifndef _NF_CONNTRACK_EXPECT_H #define _NF_CONNTRACK_EXPECT_H #include <linux/refcount.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_zones.h> extern unsigned int nf_ct_expect_hsize; extern unsigned int nf_ct_expect_max; extern struct hlist_head *nf_ct_expect_hash; struct nf_conntrack_expect { /* Conntrack expectation list member */ struct hlist_node lnode; /* Hash member */ struct hlist_node hnode; /* We expect this tuple, with the following mask */ struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_mask mask; /* Usage count. */ refcount_t use; /* Flags */ unsigned int flags; /* Expectation class */ unsigned int class; /* Function to call after setup and insertion */ void (*expectfn)(struct nf_conn *new, struct nf_conntrack_expect *this); /* Helper to assign to new connection */ struct nf_conntrack_helper *helper; /* The conntrack of the master connection */ struct nf_conn *master; /* Timer function; deletes the expectation. */ struct timer_list timeout; #if IS_ENABLED(CONFIG_NF_NAT) union nf_inet_addr saved_addr; /* This is the original per-proto part, used to map the * expected connection the way the recipient expects. */ union nf_conntrack_man_proto saved_proto; /* Direction relative to the master connection. */ enum ip_conntrack_dir dir; #endif struct rcu_head rcu; }; static inline struct net *nf_ct_exp_net(struct nf_conntrack_expect *exp) { return nf_ct_net(exp->master); } #define NF_CT_EXP_POLICY_NAME_LEN 16 struct nf_conntrack_expect_policy { unsigned int max_expected; unsigned int timeout; char name[NF_CT_EXP_POLICY_NAME_LEN]; }; #define NF_CT_EXPECT_CLASS_DEFAULT 0 #define NF_CT_EXPECT_MAX_CNT 255 /* Allow to reuse expectations with the same tuples from different master * conntracks. */ #define NF_CT_EXP_F_SKIP_MASTER 0x1 int nf_conntrack_expect_pernet_init(struct net *net); void nf_conntrack_expect_pernet_fini(struct net *net); int nf_conntrack_expect_init(void); void nf_conntrack_expect_fini(void); struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); struct nf_conntrack_expect * nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, bool unlink); void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report); static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) { nf_ct_unlink_expect_report(exp, 0, 0); } void nf_ct_remove_expectations(struct nf_conn *ct); void nf_ct_unexpect_related(struct nf_conntrack_expect *exp); bool nf_ct_remove_expect(struct nf_conntrack_expect *exp); void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data); void nf_ct_expect_iterate_net(struct net *net, bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data, u32 portid, int report); /* Allocate space for an expectation: this is mandatory before calling nf_ct_expect_related. You will have to call put afterwards. */ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me); void nf_ct_expect_init(struct nf_conntrack_expect *, unsigned int, u_int8_t, const union nf_inet_addr *, const union nf_inet_addr *, u_int8_t, const __be16 *, const __be16 *); void nf_ct_expect_put(struct nf_conntrack_expect *exp); int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, u32 portid, int report, unsigned int flags); static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect, unsigned int flags) { return nf_ct_expect_related_report(expect, 0, 0, flags); } #endif /*_NF_CONNTRACK_EXPECT_H*/ |
1567 1450 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_INTERNAL_H #define _LINUX_HIGHMEM_INTERNAL_H /* * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. */ #ifdef CONFIG_KMAP_LOCAL void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); void *__kmap_local_page_prot(struct page *page, pgprot_t prot); void kunmap_local_indexed(const void *vaddr); void kmap_local_fork(struct task_struct *tsk); void __kmap_local_sched_out(void); void __kmap_local_sched_in(void); static inline void kmap_assert_nomap(void) { DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); } #else static inline void kmap_local_fork(struct task_struct *tsk) { } static inline void kmap_assert_nomap(void) { } #endif #ifdef CONFIG_HIGHMEM #include <asm/highmem.h> #ifndef ARCH_HAS_KMAP_FLUSH_TLB static inline void kmap_flush_tlb(unsigned long addr) { } #endif #ifndef kmap_prot #define kmap_prot PAGE_KERNEL #endif void *kmap_high(struct page *page); void kunmap_high(struct page *page); void __kmap_flush_unused(void); struct page *__kmap_to_page(void *addr); static inline void *kmap(struct page *page) { void *addr; might_sleep(); if (!PageHighMem(page)) addr = page_address(page); else addr = kmap_high(page); kmap_flush_tlb((unsigned long)addr); return addr; } static inline void kunmap(struct page *page) { might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); } static inline struct page *kmap_to_page(void *addr) { return __kmap_to_page(addr); } static inline void kmap_flush_unused(void) { __kmap_flush_unused(); } static inline void *kmap_local_page(struct page *page) { return __kmap_local_page_prot(page, kmap_prot); } static inline void *kmap_local_folio(struct folio *folio, size_t offset) { struct page *page = folio_page(folio, offset / PAGE_SIZE); return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE; } static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); } static inline void *kmap_local_pfn(unsigned long pfn) { return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_local(const void *vaddr) { kunmap_local_indexed(vaddr); } static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_page_prot(page, prot); } static inline void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); } static inline void *kmap_atomic_pfn(unsigned long pfn) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_atomic(const void *addr) { kunmap_local_indexed(addr); pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } unsigned int __nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; static inline unsigned int nr_free_highpages(void) { return __nr_free_highpages(); } static inline unsigned long totalhigh_pages(void) { return (unsigned long)atomic_long_read(&_totalhigh_pages); } static inline void totalhigh_pages_add(long count) { atomic_long_add(count, &_totalhigh_pages); } static inline bool is_kmap_addr(const void *x) { unsigned long addr = (unsigned long)x; return (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) || (addr >= __fix_to_virt(FIX_KMAP_END) && addr < __fix_to_virt(FIX_KMAP_BEGIN)); } #else /* CONFIG_HIGHMEM */ static inline struct page *kmap_to_page(void *addr) { return virt_to_page(addr); } static inline void *kmap(struct page *page) { might_sleep(); return page_address(page); } static inline void kunmap_high(struct page *page) { } static inline void kmap_flush_unused(void) { } static inline void kunmap(struct page *page) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(page_address(page)); #endif } static inline void *kmap_local_page(struct page *page) { return page_address(page); } static inline void *kmap_local_folio(struct folio *folio, size_t offset) { return page_address(&folio->page) + offset; } static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return kmap_local_page(page); } static inline void *kmap_local_pfn(unsigned long pfn) { return kmap_local_page(pfn_to_page(pfn)); } static inline void __kunmap_local(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif } static inline void *kmap_atomic(struct page *page) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return page_address(page); } static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { return kmap_atomic(page); } static inline void *kmap_atomic_pfn(unsigned long pfn) { return kmap_atomic(pfn_to_page(pfn)); } static inline void __kunmap_atomic(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } static inline unsigned int nr_free_highpages(void) { return 0; } static inline unsigned long totalhigh_pages(void) { return 0UL; } static inline bool is_kmap_addr(const void *x) { return false; } #endif /* CONFIG_HIGHMEM */ /** * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated! * @__addr: Virtual address to be unmapped * * Unmaps an address previously mapped by kmap_atomic() and re-enables * pagefaults. Depending on PREEMP_RT configuration, re-enables also * migration and preemption. Users should not count on these side effects. * * Mappings should be unmapped in the reverse order that they were mapped. * See kmap_local_page() for details on nesting. * * @__addr can be any address within the mapped page, so there is no need * to subtract any offset that has been added. In contrast to kunmap(), * this function takes the address returned from kmap_atomic(), not the * page passed to it. The compiler will warn you if you pass the page. */ #define kunmap_atomic(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ __kunmap_atomic(__addr); \ } while (0) /** * kunmap_local - Unmap a page mapped via kmap_local_page(). * @__addr: An address within the page mapped * * @__addr can be any address within the mapped page. Commonly it is the * address return from kmap_local_page(), but it can also include offsets. * * Unmapping should be done in the reverse order of the mapping. See * kmap_local_page() for details. */ #define kunmap_local(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ __kunmap_local(__addr); \ } while (0) #endif |
1660 1660 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _KERNEL_PRINTK_RINGBUFFER_H #define _KERNEL_PRINTK_RINGBUFFER_H #include <linux/atomic.h> #include <linux/dev_printk.h> /* * Meta information about each stored message. * * All fields are set by the printk code except for @seq, which is * set by the ringbuffer code. */ struct printk_info { u64 seq; /* sequence number */ u64 ts_nsec; /* timestamp in nanoseconds */ u16 text_len; /* length of text message */ u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ u32 caller_id; /* thread id or processor id */ struct dev_printk_info dev_info; }; /* * A structure providing the buffers, used by writers and readers. * * Writers: * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to * buffers reserved for that writer. * * Readers: * Using prb_rec_init_rd(), a reader sets all fields before calling * prb_read_valid(). Note that the reader provides the @info and @text_buf, * buffers. On success, the struct pointed to by @info will be filled and * the char array pointed to by @text_buf will be filled with text data. */ struct printk_record { struct printk_info *info; char *text_buf; unsigned int text_buf_size; }; /* Specifies the logical position and span of a data block. */ struct prb_data_blk_lpos { unsigned long begin; unsigned long next; }; /* * A descriptor: the complete meta-data for a record. * * @state_var: A bitwise combination of descriptor ID and descriptor state. */ struct prb_desc { atomic_long_t state_var; struct prb_data_blk_lpos text_blk_lpos; }; /* A ringbuffer of "ID + data" elements. */ struct prb_data_ring { unsigned int size_bits; char *data; atomic_long_t head_lpos; atomic_long_t tail_lpos; }; /* A ringbuffer of "struct prb_desc" elements. */ struct prb_desc_ring { unsigned int count_bits; struct prb_desc *descs; struct printk_info *infos; atomic_long_t head_id; atomic_long_t tail_id; atomic_long_t last_finalized_id; }; /* * The high level structure representing the printk ringbuffer. * * @fail: Count of failed prb_reserve() calls where not even a data-less * record was created. */ struct printk_ringbuffer { struct prb_desc_ring desc_ring; struct prb_data_ring text_data_ring; atomic_long_t fail; }; /* * Used by writers as a reserve/commit handle. * * @rb: Ringbuffer where the entry is reserved. * @irqflags: Saved irq flags to restore on entry commit. * @id: ID of the reserved descriptor. * @text_space: Total occupied buffer space in the text data ring, including * ID, alignment padding, and wrapping data blocks. * * This structure is an opaque handle for writers. Its contents are only * to be used by the ringbuffer implementation. */ struct prb_reserved_entry { struct printk_ringbuffer *rb; unsigned long irqflags; unsigned long id; unsigned int text_space; }; /* The possible responses of a descriptor state-query. */ enum desc_state { desc_miss = -1, /* ID mismatch (pseudo state) */ desc_reserved = 0x0, /* reserved, in use by writer */ desc_committed = 0x1, /* committed by writer, could get reopened */ desc_finalized = 0x2, /* committed, no further modification allowed */ desc_reusable = 0x3, /* free, not yet used by any writer */ }; #define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) #define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) #define DESC_SV_BITS (sizeof(unsigned long) * 8) #define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2) #define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT) #define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT)) #define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id) #define DESC_ID_MASK (~DESC_FLAGS_MASK) #define DESC_ID(sv) ((sv) & DESC_ID_MASK) #define FAILED_LPOS 0x1 #define NO_LPOS 0x3 #define FAILED_BLK_LPOS \ { \ .begin = FAILED_LPOS, \ .next = FAILED_LPOS, \ } /* * Descriptor Bootstrap * * The descriptor array is minimally initialized to allow immediate usage * by readers and writers. The requirements that the descriptor array * initialization must satisfy: * * Req1 * The tail must point to an existing (committed or reusable) descriptor. * This is required by the implementation of prb_first_seq(). * * Req2 * Readers must see that the ringbuffer is initially empty. * * Req3 * The first record reserved by a writer is assigned sequence number 0. * * To satisfy Req1, the tail initially points to a descriptor that is * minimally initialized (having no data block, i.e. data-less with the * data block's lpos @begin and @next values set to FAILED_LPOS). * * To satisfy Req2, the initial tail descriptor is initialized to the * reusable state. Readers recognize reusable descriptors as existing * records, but skip over them. * * To satisfy Req3, the last descriptor in the array is used as the initial * head (and tail) descriptor. This allows the first record reserved by a * writer (head + 1) to be the first descriptor in the array. (Only the first * descriptor in the array could have a valid sequence number of 0.) * * The first time a descriptor is reserved, it is assigned a sequence number * with the value of the array index. A "first time reserved" descriptor can * be recognized because it has a sequence number of 0 but does not have an * index of 0. (Only the first descriptor in the array could have a valid * sequence number of 0.) After the first reservation, all future reservations * (recycling) simply involve incrementing the sequence number by the array * count. * * Hack #1 * Only the first descriptor in the array is allowed to have the sequence * number 0. In this case it is not possible to recognize if it is being * reserved the first time (set to index value) or has been reserved * previously (increment by the array count). This is handled by _always_ * incrementing the sequence number by the array count when reserving the * first descriptor in the array. In order to satisfy Req3, the sequence * number of the first descriptor in the array is initialized to minus * the array count. Then, upon the first reservation, it is incremented * to 0, thus satisfying Req3. * * Hack #2 * prb_first_seq() can be called at any time by readers to retrieve the * sequence number of the tail descriptor. However, due to Req2 and Req3, * initially there are no records to report the sequence number of * (sequence numbers are u64 and there is nothing less than 0). To handle * this, the sequence number of the initial tail descriptor is initialized * to 0. Technically this is incorrect, because there is no record with * sequence number 0 (yet) and the tail descriptor is not the first * descriptor in the array. But it allows prb_read_valid() to correctly * report the existence of a record for _any_ given sequence number at all * times. Bootstrapping is complete when the tail is pushed the first * time, thus finally pointing to the first descriptor reserved by a * writer, which has the assigned sequence number 0. */ /* * Initiating Logical Value Overflows * * Both logical position (lpos) and ID values can be mapped to array indexes * but may experience overflows during the lifetime of the system. To ensure * that printk_ringbuffer can handle the overflows for these types, initial * values are chosen that map to the correct initial array indexes, but will * result in overflows soon. * * BLK0_LPOS * The initial @head_lpos and @tail_lpos for data rings. It is at index * 0 and the lpos value is such that it will overflow on the first wrap. * * DESC0_ID * The initial @head_id and @tail_id for the desc ring. It is at the last * index of the descriptor array (see Req3 above) and the ID value is such * that it will overflow on the second wrap. */ #define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits))) #define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1)) #define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable) /* * Define a ringbuffer with an external text data buffer. The same as * DEFINE_PRINTKRB() but requires specifying an external buffer for the * text data. * * Note: The specified external buffer must be of the size: * 2 ^ (descbits + avgtextbits) */ #define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \ static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ /* the initial head and tail */ \ [_DESCS_COUNT(descbits) - 1] = { \ /* reusable */ \ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ /* no associated data block */ \ .text_blk_lpos = FAILED_BLK_LPOS, \ }, \ }; \ static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \ /* this will be the first record reserved by a writer */ \ [0] = { \ /* will be incremented to 0 on the first reservation */ \ .seq = -(u64)_DESCS_COUNT(descbits), \ }, \ /* the initial head and tail */ \ [_DESCS_COUNT(descbits) - 1] = { \ /* reports the first seq value during the bootstrap phase */ \ .seq = 0, \ }, \ }; \ static struct printk_ringbuffer name = { \ .desc_ring = { \ .count_bits = descbits, \ .descs = &_##name##_descs[0], \ .infos = &_##name##_infos[0], \ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \ }, \ .text_data_ring = { \ .size_bits = (avgtextbits) + (descbits), \ .data = text_buf, \ .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ }, \ .fail = ATOMIC_LONG_INIT(0), \ } /** * DEFINE_PRINTKRB() - Define a ringbuffer. * * @name: The name of the ringbuffer variable. * @descbits: The number of descriptors as a power-of-2 value. * @avgtextbits: The average text data size per record as a power-of-2 value. * * This is a macro for defining a ringbuffer and all internal structures * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a * variant where the text data buffer can be specified externally. */ #define DEFINE_PRINTKRB(name, descbits, avgtextbits) \ static char _##name##_text[1U << ((avgtextbits) + (descbits))] \ __aligned(__alignof__(unsigned long)); \ _DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0]) /* Writer Interface */ /** * prb_rec_init_wr() - Initialize a buffer for writing records. * * @r: The record to initialize. * @text_buf_size: The needed text buffer size. */ static inline void prb_rec_init_wr(struct printk_record *r, unsigned int text_buf_size) { r->info = NULL; r->text_buf = NULL; r->text_buf_size = text_buf_size; } bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r); bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r, u32 caller_id, unsigned int max_size); void prb_commit(struct prb_reserved_entry *e); void prb_final_commit(struct prb_reserved_entry *e); void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int text_buf_size, struct prb_desc *descs, unsigned int descs_count_bits, struct printk_info *infos); unsigned int prb_record_text_space(struct prb_reserved_entry *e); /* Reader Interface */ /** * prb_rec_init_rd() - Initialize a buffer for reading records. * * @r: The record to initialize. * @info: A buffer to store record meta-data. * @text_buf: A buffer to store text data. * @text_buf_size: The size of @text_buf. * * Initialize all the fields that a reader is interested in. All arguments * (except @r) are optional. Only record data for arguments that are * non-NULL or non-zero will be read. */ static inline void prb_rec_init_rd(struct printk_record *r, struct printk_info *info, char *text_buf, unsigned int text_buf_size) { r->info = info; r->text_buf = text_buf; r->text_buf_size = text_buf_size; } /** * prb_for_each_record() - Iterate over the records of a ringbuffer. * * @from: The sequence number to begin with. * @rb: The ringbuffer to iterate over. * @s: A u64 to store the sequence number on each iteration. * @r: A printk_record to store the record on each iteration. * * This is a macro for conveniently iterating over a ringbuffer. * Note that @s may not be the sequence number of the record on each * iteration. For the sequence number, @r->info->seq should be checked. * * Context: Any context. */ #define prb_for_each_record(from, rb, s, r) \ for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1) /** * prb_for_each_info() - Iterate over the meta data of a ringbuffer. * * @from: The sequence number to begin with. * @rb: The ringbuffer to iterate over. * @s: A u64 to store the sequence number on each iteration. * @i: A printk_info to store the record meta data on each iteration. * @lc: An unsigned int to store the text line count of each record. * * This is a macro for conveniently iterating over a ringbuffer. * Note that @s may not be the sequence number of the record on each * iteration. For the sequence number, @r->info->seq should be checked. * * Context: Any context. */ #define prb_for_each_info(from, rb, s, i, lc) \ for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1) bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r); bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, struct printk_info *info, unsigned int *line_count); u64 prb_first_valid_seq(struct printk_ringbuffer *rb); u64 prb_next_seq(struct printk_ringbuffer *rb); #endif /* _KERNEL_PRINTK_RINGBUFFER_H */ |
361 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */ #include <linux/jump_label.h> #include <linux/uaccess.h> #include <linux/export.h> #include <linux/string.h> #include <linux/types.h> #include <asm/mce.h> #ifdef CONFIG_X86_MCE static DEFINE_STATIC_KEY_FALSE(copy_mc_fragile_key); void enable_copy_mc_fragile(void) { static_branch_inc(©_mc_fragile_key); } #define copy_mc_fragile_enabled (static_branch_unlikely(©_mc_fragile_key)) /* * Similar to copy_user_handle_tail, probe for the write fault point, or * source exception point. */ __visible notrace unsigned long copy_mc_fragile_handle_tail(char *to, char *from, unsigned len) { for (; len; --len, to++, from++) if (copy_mc_fragile(to, from, 1)) break; return len; } #else /* * No point in doing careful copying, or consulting a static key when * there is no #MC handler in the CONFIG_X86_MCE=n case. */ void enable_copy_mc_fragile(void) { } #define copy_mc_fragile_enabled (0) #endif unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len); /** * copy_mc_to_kernel - memory copy that handles source exceptions * * @dst: destination address * @src: source address * @len: number of bytes to copy * * Call into the 'fragile' version on systems that benefit from avoiding * corner case poison consumption scenarios, For example, accessing * poison across 2 cachelines with a single instruction. Almost all * other uses case can use copy_mc_enhanced_fast_string() for a fast * recoverable copy, or fallback to plain memcpy. * * Return 0 for success, or number of bytes not copied if there was an * exception. */ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len) { if (copy_mc_fragile_enabled) return copy_mc_fragile(dst, src, len); if (static_cpu_has(X86_FEATURE_ERMS)) return copy_mc_enhanced_fast_string(dst, src, len); memcpy(dst, src, len); return 0; } EXPORT_SYMBOL_GPL(copy_mc_to_kernel); unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, unsigned len) { unsigned long ret; if (copy_mc_fragile_enabled) { __uaccess_begin(); ret = copy_mc_fragile((__force void *)dst, src, len); __uaccess_end(); return ret; } if (static_cpu_has(X86_FEATURE_ERMS)) { __uaccess_begin(); ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len); __uaccess_end(); return ret; } return copy_user_generic((__force void *)dst, src, len); } |
116 101 314 313 1 324 288 313 27 4 9 9 939 801 101 3 504 58 207 1778 1080 3 941 690 1124 307 240 834 16 188 677 533 534 229 177 176 3 336 7 30 261 7 3 2 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _NET_IPV6_H #define _NET_IPV6_H #include <linux/ipv6.h> #include <linux/hardirq.h> #include <linux/jhash.h> #include <linux/refcount.h> #include <linux/jump_label_ratelimit.h> #include <net/if_inet6.h> #include <net/flow.h> #include <net/flow_dissector.h> #include <net/inet_dscp.h> #include <net/snmp.h> #include <net/netns/hash.h> struct ip_tunnel_info; #define SIN6_LEN_RFC2133 24 #define IPV6_MAXPLEN 65535 /* * NextHeader field of IPv6 header */ #define NEXTHDR_HOP 0 /* Hop-by-hop option header. */ #define NEXTHDR_IPV4 4 /* IPv4 in IPv6 */ #define NEXTHDR_TCP 6 /* TCP segment. */ #define NEXTHDR_UDP 17 /* UDP message. */ #define NEXTHDR_IPV6 41 /* IPv6 in IPv6 */ #define NEXTHDR_ROUTING 43 /* Routing header. */ #define NEXTHDR_FRAGMENT 44 /* Fragmentation/reassembly header. */ #define NEXTHDR_GRE 47 /* GRE header. */ #define NEXTHDR_ESP 50 /* Encapsulating security payload. */ #define NEXTHDR_AUTH 51 /* Authentication header. */ #define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ #define NEXTHDR_NONE 59 /* No next header */ #define NEXTHDR_DEST 60 /* Destination options header. */ #define NEXTHDR_SCTP 132 /* SCTP message. */ #define NEXTHDR_MOBILITY 135 /* Mobility header. */ #define NEXTHDR_MAX 255 #define IPV6_DEFAULT_HOPLIMIT 64 #define IPV6_DEFAULT_MCASTHOPS 1 /* Limits on Hop-by-Hop and Destination options. * * Per RFC8200 there is no limit on the maximum number or lengths of options in * Hop-by-Hop or Destination options other then the packet must fit in an MTU. * We allow configurable limits in order to mitigate potential denial of * service attacks. * * There are three limits that may be set: * - Limit the number of options in a Hop-by-Hop or Destination options * extension header * - Limit the byte length of a Hop-by-Hop or Destination options extension * header * - Disallow unknown options * * The limits are expressed in corresponding sysctls: * * ipv6.sysctl.max_dst_opts_cnt * ipv6.sysctl.max_hbh_opts_cnt * ipv6.sysctl.max_dst_opts_len * ipv6.sysctl.max_hbh_opts_len * * max_*_opts_cnt is the number of TLVs that are allowed for Destination * options or Hop-by-Hop options. If the number is less than zero then unknown * TLVs are disallowed and the number of known options that are allowed is the * absolute value. Setting the value to INT_MAX indicates no limit. * * max_*_opts_len is the length limit in bytes of a Destination or * Hop-by-Hop options extension header. Setting the value to INT_MAX * indicates no length limit. * * If a limit is exceeded when processing an extension header the packet is * silently discarded. */ /* Default limits for Hop-by-Hop and Destination options */ #define IP6_DEFAULT_MAX_DST_OPTS_CNT 8 #define IP6_DEFAULT_MAX_HBH_OPTS_CNT 8 #define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ #define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ /* * Addr type * * type - unicast | multicast * scope - local | site | global * v4 - compat * v4mapped * any * loopback */ #define IPV6_ADDR_ANY 0x0000U #define IPV6_ADDR_UNICAST 0x0001U #define IPV6_ADDR_MULTICAST 0x0002U #define IPV6_ADDR_LOOPBACK 0x0010U #define IPV6_ADDR_LINKLOCAL 0x0020U #define IPV6_ADDR_SITELOCAL 0x0040U #define IPV6_ADDR_COMPATv4 0x0080U #define IPV6_ADDR_SCOPE_MASK 0x00f0U #define IPV6_ADDR_MAPPED 0x1000U /* * Addr scopes */ #define IPV6_ADDR_MC_SCOPE(a) \ ((a)->s6_addr[1] & 0x0f) /* nonstandard */ #define __IPV6_ADDR_SCOPE_INVALID -1 #define IPV6_ADDR_SCOPE_NODELOCAL 0x01 #define IPV6_ADDR_SCOPE_LINKLOCAL 0x02 #define IPV6_ADDR_SCOPE_SITELOCAL 0x05 #define IPV6_ADDR_SCOPE_ORGLOCAL 0x08 #define IPV6_ADDR_SCOPE_GLOBAL 0x0e /* * Addr flags */ #define IPV6_ADDR_MC_FLAG_TRANSIENT(a) \ ((a)->s6_addr[1] & 0x10) #define IPV6_ADDR_MC_FLAG_PREFIX(a) \ ((a)->s6_addr[1] & 0x20) #define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a) \ ((a)->s6_addr[1] & 0x40) /* * fragmentation header */ struct frag_hdr { __u8 nexthdr; __u8 reserved; __be16 frag_off; __be32 identification; }; /* * Jumbo payload option, as described in RFC 2675 2. */ struct hop_jumbo_hdr { u8 nexthdr; u8 hdrlen; u8 tlv_type; /* IPV6_TLV_JUMBO, 0xC2 */ u8 tlv_len; /* 4 */ __be32 jumbo_payload_len; }; #define IP6_MF 0x0001 #define IP6_OFFSET 0xFFF8 struct ip6_fraglist_iter { struct ipv6hdr *tmp_hdr; struct sk_buff *frag; int offset; unsigned int hlen; __be32 frag_id; u8 nexthdr; }; int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_fraglist_iter *iter); void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter); static inline struct sk_buff *ip6_fraglist_next(struct ip6_fraglist_iter *iter) { struct sk_buff *skb = iter->frag; iter->frag = skb->next; skb_mark_not_on_list(skb); return skb; } struct ip6_frag_state { u8 *prevhdr; unsigned int hlen; unsigned int mtu; unsigned int left; int offset; int ptr; int hroom; int troom; __be32 frag_id; u8 nexthdr; }; void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state); struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state); #define IP6_REPLY_MARK(net, mark) \ ((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0) #include <net/sock.h> /* sysctls */ extern int sysctl_mld_max_msf; extern int sysctl_mld_qrv; #define _DEVINC(net, statname, mod, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_INC_STATS64((_idev)->stats.statname, (field));\ mod##SNMP_INC_STATS64((net)->mib.statname##_statistics, (field));\ }) /* per device counters are atomic_long_t */ #define _DEVINCATOMIC(net, statname, mod, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \ mod##SNMP_INC_STATS((net)->mib.statname##_statistics, (field));\ }) /* per device and per net counters are atomic_long_t */ #define _DEVINC_ATOMIC_ATOMIC(net, statname, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \ SNMP_INC_STATS_ATOMIC_LONG((net)->mib.statname##_statistics, (field));\ }) #define _DEVADD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \ mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\ }) #define _DEVUPD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \ mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\ }) /* MIBs */ #define IP6_INC_STATS(net, idev,field) \ _DEVINC(net, ipv6, , idev, field) #define __IP6_INC_STATS(net, idev,field) \ _DEVINC(net, ipv6, __, idev, field) #define IP6_ADD_STATS(net, idev,field,val) \ _DEVADD(net, ipv6, , idev, field, val) #define __IP6_ADD_STATS(net, idev,field,val) \ _DEVADD(net, ipv6, __, idev, field, val) #define IP6_UPD_PO_STATS(net, idev,field,val) \ _DEVUPD(net, ipv6, , idev, field, val) #define __IP6_UPD_PO_STATS(net, idev,field,val) \ _DEVUPD(net, ipv6, __, idev, field, val) #define ICMP6_INC_STATS(net, idev, field) \ _DEVINCATOMIC(net, icmpv6, , idev, field) #define __ICMP6_INC_STATS(net, idev, field) \ _DEVINCATOMIC(net, icmpv6, __, idev, field) #define ICMP6MSGOUT_INC_STATS(net, idev, field) \ _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field +256) #define ICMP6MSGIN_INC_STATS(net, idev, field) \ _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field) struct ip6_ra_chain { struct ip6_ra_chain *next; struct sock *sk; int sel; void (*destructor)(struct sock *); }; extern struct ip6_ra_chain *ip6_ra_chain; extern rwlock_t ip6_ra_lock; /* This structure is prepared by protocol, when parsing ancillary data and passed to IPv6. */ struct ipv6_txoptions { refcount_t refcnt; /* Length of this structure */ int tot_len; /* length of extension headers */ __u16 opt_flen; /* after fragment hdr */ __u16 opt_nflen; /* before fragment hdr */ struct ipv6_opt_hdr *hopopt; struct ipv6_opt_hdr *dst0opt; struct ipv6_rt_hdr *srcrt; /* Routing Header */ struct ipv6_opt_hdr *dst1opt; struct rcu_head rcu; /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */ }; /* flowlabel_reflect sysctl values */ enum flowlabel_reflect { FLOWLABEL_REFLECT_ESTABLISHED = 1, FLOWLABEL_REFLECT_TCP_RESET = 2, FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES = 4, }; struct ip6_flowlabel { struct ip6_flowlabel __rcu *next; __be32 label; atomic_t users; struct in6_addr dst; struct ipv6_txoptions *opt; unsigned long linger; struct rcu_head rcu; u8 share; union { struct pid *pid; kuid_t uid; } owner; unsigned long lastuse; unsigned long expires; struct net *fl_net; }; #define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) #define IPV6_FLOWLABEL_MASK cpu_to_be32(0x000FFFFF) #define IPV6_FLOWLABEL_STATELESS_FLAG cpu_to_be32(0x00080000) #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) #define IPV6_TCLASS_SHIFT 20 struct ipv6_fl_socklist { struct ipv6_fl_socklist __rcu *next; struct ip6_flowlabel *fl; struct rcu_head rcu; }; struct ipcm6_cookie { struct sockcm_cookie sockc; __s16 hlimit; __s16 tclass; __u16 gso_size; __s8 dontfrag; struct ipv6_txoptions *opt; }; static inline void ipcm6_init(struct ipcm6_cookie *ipc6) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, .tclass = -1, .dontfrag = -1, }; } static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, const struct sock *sk) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, .tclass = inet6_sk(sk)->tclass, .dontfrag = inet6_test_bit(DONTFRAG, sk), }; } static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) { struct ipv6_txoptions *opt; rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { if (!refcount_inc_not_zero(&opt->refcnt)) opt = NULL; else opt = rcu_pointer_handoff(opt); } rcu_read_unlock(); return opt; } static inline void txopt_put(struct ipv6_txoptions *opt) { if (opt && refcount_dec_and_test(&opt->refcnt)) kfree_rcu(opt, rcu); } #if IS_ENABLED(CONFIG_IPV6) struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label); extern struct static_key_false_deferred ipv6_flowlabel_exclusive; static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) { if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key) && READ_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl)) return __fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT); return NULL; } #endif struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt); void fl6_free_socklist(struct sock *sk); int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen); int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); bool ip6_autoflowlabel(struct net *net, const struct sock *sk); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { if (fl) atomic_dec(&fl->users); } enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len); int ip6_ra_control(struct sock *sk, int sel); int ipv6_parse_hopopts(struct sk_buff *skb); struct ipv6_txoptions *ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt); struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt); struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); static inline struct ipv6_txoptions * ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) { if (!opt) return NULL; return __ipv6_fixup_options(opt_space, opt); } bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt); struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt); /* This helper is specialized for BIG TCP needs. * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header. * It assumes headers are already in skb->head. * Returns 0, or IPPROTO_TCP if a BIG TCP packet is there. */ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) { const struct hop_jumbo_hdr *jhdr; const struct ipv6hdr *nhdr; if (likely(skb->len <= GRO_LEGACY_MAX_SIZE)) return 0; if (skb->protocol != htons(ETH_P_IPV6)) return 0; if (skb_network_offset(skb) + sizeof(struct ipv6hdr) + sizeof(struct hop_jumbo_hdr) > skb_headlen(skb)) return 0; nhdr = ipv6_hdr(skb); if (nhdr->nexthdr != NEXTHDR_HOP) return 0; jhdr = (const struct hop_jumbo_hdr *) (nhdr + 1); if (jhdr->tlv_type != IPV6_TLV_JUMBO || jhdr->hdrlen != 0 || jhdr->nexthdr != IPPROTO_TCP) return 0; return jhdr->nexthdr; } /* Return 0 if HBH header is successfully removed * Or if HBH removal is unnecessary (packet is not big TCP) * Return error to indicate dropping the packet */ static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb) { const int hophdr_len = sizeof(struct hop_jumbo_hdr); int nexthdr = ipv6_has_hopopt_jumbo(skb); struct ipv6hdr *h6; if (!nexthdr) return 0; if (skb_cow_head(skb, 0)) return -1; /* Remove the HBH header. * Layout: [Ethernet header][IPv6 header][HBH][L4 Header] */ memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb), skb_network_header(skb) - skb_mac_header(skb) + sizeof(struct ipv6hdr)); __skb_pull(skb, hophdr_len); skb->network_header += hophdr_len; skb->mac_header += hophdr_len; h6 = ipv6_hdr(skb); h6->nexthdr = nexthdr; return 0; } static inline bool ipv6_accept_ra(struct inet6_dev *idev) { /* If forwarding is enabled, RA are not accepted unless the special * hybrid mode (accept_ra=2) is enabled. */ return idev->cnf.forwarding ? idev->cnf.accept_ra == 2 : idev->cnf.accept_ra; } #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ #define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */ #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */ int __ipv6_addr_type(const struct in6_addr *addr); static inline int ipv6_addr_type(const struct in6_addr *addr) { return __ipv6_addr_type(addr) & 0xffff; } static inline int ipv6_addr_scope(const struct in6_addr *addr) { return __ipv6_addr_type(addr) & IPV6_ADDR_SCOPE_MASK; } static inline int __ipv6_addr_src_scope(int type) { return (type == IPV6_ADDR_ANY) ? __IPV6_ADDR_SCOPE_INVALID : (type >> 16); } static inline int ipv6_addr_src_scope(const struct in6_addr *addr) { return __ipv6_addr_src_scope(__ipv6_addr_type(addr)); } static inline bool __ipv6_addr_needs_scope_id(int type) { return type & IPV6_ADDR_LINKLOCAL || (type & IPV6_ADDR_MULTICAST && (type & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL))); } static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface) { return __ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)) ? iface : 0; } static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2) { return memcmp(a1, a2, sizeof(struct in6_addr)); } static inline bool ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m, const struct in6_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ulm = (const unsigned long *)m; const unsigned long *ul2 = (const unsigned long *)a2; return !!(((ul1[0] ^ ul2[0]) & ulm[0]) | ((ul1[1] ^ ul2[1]) & ulm[1])); #else return !!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) | ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) | ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) | ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3])); #endif } static inline void ipv6_addr_prefix(struct in6_addr *pfx, const struct in6_addr *addr, int plen) { /* caller must guarantee 0 <= plen <= 128 */ int o = plen >> 3, b = plen & 0x7; memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr)); memcpy(pfx->s6_addr, addr, o); if (b != 0) pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b); } static inline void ipv6_addr_prefix_copy(struct in6_addr *addr, const struct in6_addr *pfx, int plen) { /* caller must guarantee 0 <= plen <= 128 */ int o = plen >> 3, b = plen & 0x7; memcpy(addr->s6_addr, pfx, o); if (b != 0) { addr->s6_addr[o] &= ~(0xff00 >> b); addr->s6_addr[o] |= (pfx->s6_addr[o] & (0xff00 >> b)); } } static inline void __ipv6_addr_set_half(__be32 *addr, __be32 wh, __be32 wl) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #if defined(__BIG_ENDIAN) if (__builtin_constant_p(wh) && __builtin_constant_p(wl)) { *(__force u64 *)addr = ((__force u64)(wh) << 32 | (__force u64)(wl)); return; } #elif defined(__LITTLE_ENDIAN) if (__builtin_constant_p(wl) && __builtin_constant_p(wh)) { *(__force u64 *)addr = ((__force u64)(wl) << 32 | (__force u64)(wh)); return; } #endif #endif addr[0] = wh; addr[1] = wl; } static inline void ipv6_addr_set(struct in6_addr *addr, __be32 w1, __be32 w2, __be32 w3, __be32 w4) { __ipv6_addr_set_half(&addr->s6_addr32[0], w1, w2); __ipv6_addr_set_half(&addr->s6_addr32[2], w3, w4); } static inline bool ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ul2 = (const unsigned long *)a2; return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; #else return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) | (a1->s6_addr32[1] ^ a2->s6_addr32[1]) | (a1->s6_addr32[2] ^ a2->s6_addr32[2]) | (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0; #endif } #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 static inline bool __ipv6_prefix_equal64_half(const __be64 *a1, const __be64 *a2, unsigned int len) { if (len && ((*a1 ^ *a2) & cpu_to_be64((~0UL) << (64 - len)))) return false; return true; } static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) { const __be64 *a1 = (const __be64 *)addr1; const __be64 *a2 = (const __be64 *)addr2; if (prefixlen >= 64) { if (a1[0] ^ a2[0]) return false; return __ipv6_prefix_equal64_half(a1 + 1, a2 + 1, prefixlen - 64); } return __ipv6_prefix_equal64_half(a1, a2, prefixlen); } #else static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) { const __be32 *a1 = addr1->s6_addr32; const __be32 *a2 = addr2->s6_addr32; unsigned int pdw, pbi; /* check complete u32 in prefix */ pdw = prefixlen >> 5; if (pdw && memcmp(a1, a2, pdw << 2)) return false; /* check incomplete u32 in prefix */ pbi = prefixlen & 0x1f; if (pbi && ((a1[pdw] ^ a2[pdw]) & htonl((0xffffffff) << (32 - pbi)))) return false; return true; } #endif static inline bool ipv6_addr_any(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul = (const unsigned long *)a; return (ul[0] | ul[1]) == 0UL; #else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]) == 0; #endif } static inline u32 ipv6_addr_hash(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul = (const unsigned long *)a; unsigned long x = ul[0] ^ ul[1]; return (u32)(x ^ (x >> 32)); #else return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^ a->s6_addr32[2] ^ a->s6_addr32[3]); #endif } /* more secured version of ipv6_addr_hash() */ static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval) { return jhash2((__force const u32 *)a->s6_addr32, ARRAY_SIZE(a->s6_addr32), initval); } static inline bool ipv6_addr_loopback(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const __be64 *be = (const __be64 *)a; return (be[0] | (be[1] ^ cpu_to_be64(1))) == 0UL; #else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | (a->s6_addr32[3] ^ cpu_to_be32(1))) == 0; #endif } /* * Note that we must __force cast these to unsigned long to make sparse happy, * since all of the endian-annotated types are fixed size regardless of arch. */ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) { return ( #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 *(unsigned long *)a | #else (__force unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]) | #endif (__force unsigned long)(a->s6_addr32[2] ^ cpu_to_be32(0x0000ffff))) == 0UL; } static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]); } static inline u32 ipv6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) { unsigned int hash, mix = net_hash_mix(net); if (ipv6_addr_any(addr6)) hash = jhash_1word(0, mix); else if (ipv6_addr_v4mapped(addr6)) hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); else hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); return hash ^ port; } /* * Check for a RFC 4843 ORCHID address * (Overlay Routable Cryptographic Hash Identifiers) */ static inline bool ipv6_addr_orchid(const struct in6_addr *a) { return (a->s6_addr32[0] & htonl(0xfffffff0)) == htonl(0x20010010); } static inline bool ipv6_addr_is_multicast(const struct in6_addr *addr) { return (addr->s6_addr32[0] & htonl(0xFF000000)) == htonl(0xFF000000); } static inline void ipv6_addr_set_v4mapped(const __be32 addr, struct in6_addr *v4mapped) { ipv6_addr_set(v4mapped, 0, 0, htonl(0x0000FFFF), addr); } /* * find the first different bit between two addresses * length of address must be a multiple of 32bits */ static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int addrlen) { const __be32 *a1 = token1, *a2 = token2; int i; addrlen >>= 2; for (i = 0; i < addrlen; i++) { __be32 xb = a1[i] ^ a2[i]; if (xb) return i * 32 + 31 - __fls(ntohl(xb)); } /* * we should *never* get to this point since that * would mean the addrs are equal * * However, we do get to it 8) And exacly, when * addresses are equal 8) * * ip route add 1111::/128 via ... * ip route add 1111::/64 via ... * and we are here. * * Ideally, this function should stop comparison * at prefix length. It does not, but it is still OK, * if returned value is greater than prefix length. * --ANK (980803) */ return addrlen << 5; } #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 static inline int __ipv6_addr_diff64(const void *token1, const void *token2, int addrlen) { const __be64 *a1 = token1, *a2 = token2; int i; addrlen >>= 3; for (i = 0; i < addrlen; i++) { __be64 xb = a1[i] ^ a2[i]; if (xb) return i * 64 + 63 - __fls(be64_to_cpu(xb)); } return addrlen << 6; } #endif static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 if (__builtin_constant_p(addrlen) && !(addrlen & 7)) return __ipv6_addr_diff64(token1, token2, addrlen); #endif return __ipv6_addr_diff32(token1, token2, addrlen); } static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2) { return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, struct dst_entry *dst) { int hlimit; if (ipv6_addr_is_multicast(&fl6->daddr)) hlimit = READ_ONCE(np->mcast_hops); else hlimit = READ_ONCE(np->hop_limit); if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); return hlimit; } /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; */ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, const struct ipv6hdr *iph) { BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) != offsetof(typeof(flow->addrs), v6addrs.src) + sizeof(flow->addrs.v6addrs.src)); memcpy(&flow->addrs.v6addrs, &iph->addrs, sizeof(flow->addrs.v6addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } #if IS_ENABLED(CONFIG_IPV6) static inline bool ipv6_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return net->ipv6.sysctl.ip_nonlocal_bind || test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); } /* Sysctl settings for net ipv6.auto_flowlabels */ #define IP6_AUTO_FLOW_LABEL_OFF 0 #define IP6_AUTO_FLOW_LABEL_OPTOUT 1 #define IP6_AUTO_FLOW_LABEL_OPTIN 2 #define IP6_AUTO_FLOW_LABEL_FORCED 3 #define IP6_AUTO_FLOW_LABEL_MAX IP6_AUTO_FLOW_LABEL_FORCED #define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OPTOUT static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { u32 hash; /* @flowlabel may include more than a flow label, eg, the traffic class. * Here we want only the flow label value. */ flowlabel &= IPV6_FLOWLABEL_MASK; if (flowlabel || net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF || (!autolabel && net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)) return flowlabel; hash = skb_get_hash_flowi6(skb, fl6); /* Since this is being sent on the wire obfuscate hash a bit * to minimize possbility that any useful information to an * attacker is leaked. Only lower 20 bits are relevant. */ hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; if (net->ipv6.sysctl.flowlabel_state_ranges) flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG; return flowlabel; } static inline int ip6_default_np_autolabel(struct net *net) { switch (net->ipv6.sysctl.auto_flowlabels) { case IP6_AUTO_FLOW_LABEL_OFF: case IP6_AUTO_FLOW_LABEL_OPTIN: default: return 0; case IP6_AUTO_FLOW_LABEL_OPTOUT: case IP6_AUTO_FLOW_LABEL_FORCED: return 1; } } #else static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { return flowlabel; } static inline int ip6_default_np_autolabel(struct net *net) { return 0; } #endif #if IS_ENABLED(CONFIG_IPV6) static inline int ip6_multipath_hash_policy(const struct net *net) { return net->ipv6.sysctl.multipath_hash_policy; } static inline u32 ip6_multipath_hash_fields(const struct net *net) { return net->ipv6.sysctl.multipath_hash_fields; } #else static inline int ip6_multipath_hash_policy(const struct net *net) { return 0; } static inline u32 ip6_multipath_hash_fields(const struct net *net) { return 0; } #endif /* * Header manipulation */ static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass, __be32 flowlabel) { *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | flowlabel; } static inline __be32 ip6_flowinfo(const struct ipv6hdr *hdr) { return *(__be32 *)hdr & IPV6_FLOWINFO_MASK; } static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr) { return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK; } static inline u8 ip6_tclass(__be32 flowinfo) { return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT; } static inline dscp_t ip6_dscp(__be32 flowinfo) { return inet_dsfield_to_dscp(ip6_tclass(flowinfo)); } static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) { return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; } static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6) { return fl6->flowlabel & IPV6_FLOWLABEL_MASK; } /* * Prototypes exported by ipv6 */ /* * rcv function (called from netdevice level) */ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev); int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb); /* * upper-layer output functions */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags); int ip6_push_pending_frames(struct sock *sk); void ip6_flush_pending_frames(struct sock *sk); int ip6_send_skb(struct sk_buff *skb); struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork, struct inet6_cork *v6_cork); struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork); static inline struct sk_buff *ip6_finish_skb(struct sock *sk) { return __ip6_make_skb(sk, &sk->sk_write_queue, &inet_sk(sk)->cork, &inet6_sk(sk)->cork); } int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst); struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, bool connected); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *orig_dst); /* * skb processing functions */ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip6_forward(struct sk_buff *skb); int ip6_input(struct sk_buff *skb); int ip6_mc_input(struct sk_buff *skb); void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, bool have_final); int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); /* * Extension header (options) processing */ void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto, struct in6_addr **daddr_p, struct in6_addr *saddr); void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto); int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp, __be16 *frag_offp); bool ipv6_ext_hdr(u8 nexthdr); enum { IP6_FH_F_FRAG = (1 << 0), IP6_FH_F_AUTH = (1 << 1), IP6_FH_F_SKIP_RH = (1 << 2), }; /* find specified header and get offset to it */ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *fragflg); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type); struct in6_addr *fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, struct in6_addr *orig); /* * socket options (ipv6_sockglue.c) */ DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount); int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr); void ip6_datagram_release_cb(struct sock *sk); int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, int *addr_len); void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); /* * reassembly.c */ extern const struct proto_ops inet6_stream_ops; extern const struct proto_ops inet6_dgram_ops; extern const struct proto_ops inet6_sockraw_ops; struct group_source_req; struct group_filter; int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset); #ifdef CONFIG_PROC_FS int ac6_proc_init(struct net *net); void ac6_proc_exit(struct net *net); int raw6_proc_init(void); void raw6_proc_exit(void); int tcp6_proc_init(struct net *net); void tcp6_proc_exit(struct net *net); int udp6_proc_init(struct net *net); void udp6_proc_exit(struct net *net); int udplite6_proc_init(void); void udplite6_proc_exit(void); int ipv6_misc_proc_init(void); void ipv6_misc_proc_exit(void); int snmp6_register_dev(struct inet6_dev *idev); int snmp6_unregister_dev(struct inet6_dev *idev); #else static inline int ac6_proc_init(struct net *net) { return 0; } static inline void ac6_proc_exit(struct net *net) { } static inline int snmp6_register_dev(struct inet6_dev *idev) { return 0; } static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; } #endif #ifdef CONFIG_SYSCTL struct ctl_table *ipv6_icmp_sysctl_init(struct net *net); size_t ipv6_icmp_sysctl_table_size(void); struct ctl_table *ipv6_route_sysctl_init(struct net *net); size_t ipv6_route_sysctl_table_size(struct net *net); int ipv6_sysctl_register(void); void ipv6_sysctl_unregister(void); #endif int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); static inline int ip6_sock_set_v6only(struct sock *sk) { if (inet_sk(sk)->inet_num) return -EINVAL; lock_sock(sk); sk->sk_ipv6only = true; release_sock(sk); return 0; } static inline void ip6_sock_set_recverr(struct sock *sk) { inet6_set_bit(RECVERR6, sk); } #define IPV6_PREFER_SRC_MASK (IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBLIC | \ IPV6_PREFER_SRC_COA) static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val) { unsigned int prefmask = ~IPV6_PREFER_SRC_MASK; unsigned int pref = 0; /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ switch (val & (IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { case IPV6_PREFER_SRC_PUBLIC: pref |= IPV6_PREFER_SRC_PUBLIC; prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case IPV6_PREFER_SRC_TMP: pref |= IPV6_PREFER_SRC_TMP; prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case IPV6_PREFER_SRC_PUBTMP_DEFAULT: prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case 0: break; default: return -EINVAL; } /* check HOME/COA conflicts */ switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) { case IPV6_PREFER_SRC_HOME: prefmask &= ~IPV6_PREFER_SRC_COA; break; case IPV6_PREFER_SRC_COA: pref |= IPV6_PREFER_SRC_COA; break; case 0: break; default: return -EINVAL; } /* check CGA/NONCGA conflicts */ switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { case IPV6_PREFER_SRC_CGA: case IPV6_PREFER_SRC_NONCGA: case 0: break; default: return -EINVAL; } WRITE_ONCE(inet6_sk(sk)->srcprefs, (READ_ONCE(inet6_sk(sk)->srcprefs) & prefmask) | pref); return 0; } static inline void ip6_sock_set_recvpktinfo(struct sock *sk) { lock_sock(sk); inet6_sk(sk)->rxopt.bits.rxinfo = true; release_sock(sk); } #endif /* _NET_IPV6_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ #include <linux/mmzone.h> #include <linux/range.h> #include <linux/ioport.h> #include <linux/percpu-refcount.h> struct resource; struct device; /** * struct vmem_altmap - pre-allocated storage for vmemmap_populate * @base_pfn: base of the entire dev_pagemap mapping * @reserve: pages mapped, but reserved for driver use (relative to @base) * @free: free pages set aside in the mapping for memmap storage * @align: pages reserved to meet allocation alignments * @alloc: track pages consumed, private to vmemmap_populate() */ struct vmem_altmap { unsigned long base_pfn; const unsigned long end_pfn; const unsigned long reserve; unsigned long free; unsigned long align; unsigned long alloc; }; /* * Specialize ZONE_DEVICE memory into multiple types each has a different * usage. * * MEMORY_DEVICE_PRIVATE: * Device memory that is not directly addressable by the CPU: CPU can neither * read nor write private memory. In this case, we do still have struct pages * backing the device memory. Doing so simplifies the implementation, but it is * important to remember that there are certain points at which the struct page * must be treated as an opaque object, rather than a "normal" struct page. * * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/mm/hmm.rst. * * MEMORY_DEVICE_COHERENT: * Device memory that is cache coherent from device and CPU point of view. This * is used on platforms that have an advanced system bus (like CAPI or CXL). A * driver can hotplug the device memory using ZONE_DEVICE and with that memory * type. Any page of a process can be migrated to such memory. However no one * should be allowed to pin such memory so that it can always be evicted. * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. In support of coordinating page * pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a * wakeup event whenever a page is unpinned and becomes idle. This * wakeup is used to coordinate physical address space management (ex: * fs truncate/hole punch) vs pinned pages (ex: device dma). * * MEMORY_DEVICE_GENERIC: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. This is for example used by DAX devices * that expose memory using a character device. * * MEMORY_DEVICE_PCI_P2PDMA: * Device memory residing in a PCI BAR intended for use with Peer-to-Peer * transactions. */ enum memory_type { /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, MEMORY_DEVICE_COHERENT, MEMORY_DEVICE_FS_DAX, MEMORY_DEVICE_GENERIC, MEMORY_DEVICE_PCI_P2PDMA, }; struct dev_pagemap_ops { /* * Called once the page refcount reaches 0. The reference count will be * reset to one by the core code after the method is called to prepare * for handing out the page again. */ void (*page_free)(struct page *page); /* * Used for private (un-addressable) device memory only. Must migrate * the page back to a CPU accessible page. */ vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); /* * Handle the memory failure happens on a range of pfns. Notify the * processes who are using these pfns, and try to recover the data on * them if necessary. The mf_flags is finally passed to the recover * function through the whole notify routine. * * When this is not implemented, or it returns -EOPNOTSUPP, the caller * will fall back to a common handler called mf_generic_kill_procs(). */ int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, unsigned long nr_pages, int mf_flags); }; #define PGMAP_ALTMAP_VALID (1 << 0) /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping * @done: completion for @ref * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior * @vmemmap_shift: structural definition of how the vmemmap page metadata * is populated, specifically the metadata page order. * A zero value (default) uses base pages as the vmemmap metadata * representation. A bigger value will set up compound struct pages * of the requested order value. * @ops: method table * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no * foreign ZONE_DEVICE memory is accessed. * @nr_range: number of ranges to be mapped * @range: range to be mapped when nr_range == 1 * @ranges: array of ranges to be mapped when nr_range > 1 */ struct dev_pagemap { struct vmem_altmap altmap; struct percpu_ref ref; struct completion done; enum memory_type type; unsigned int flags; unsigned long vmemmap_shift; const struct dev_pagemap_ops *ops; void *owner; int nr_range; union { struct range range; DECLARE_FLEX_ARRAY(struct range, ranges); }; }; static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap) { return pgmap->ops && pgmap->ops->memory_failure; } static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) { if (pgmap->flags & PGMAP_ALTMAP_VALID) return &pgmap->altmap; return NULL; } static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) { return 1 << pgmap->vmemmap_shift; } static inline bool is_device_private_page(const struct page *page) { return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PRIVATE; } static inline bool folio_is_device_private(const struct folio *folio) { return is_device_private_page(&folio->page); } static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } static inline bool is_device_coherent_page(const struct page *page) { return is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_COHERENT; } static inline bool folio_is_device_coherent(const struct folio *folio) { return is_device_coherent_page(&folio->page); } #ifdef CONFIG_ZONE_DEVICE void zone_device_page_init(struct page *page); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { /* * Fail attempts to call devm_memremap_pages() without * ZONE_DEVICE support enabled, this requires callers to fall * back to plain devm_memremap() based on config */ WARN_ON_ONCE(1); return ERR_PTR(-ENXIO); } static inline void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) { } static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap) { return NULL; } static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) { return false; } /* when memremap_pages() is disabled all archs can remap a single page */ static inline unsigned long memremap_compat_align(void) { return PAGE_SIZE; } #endif /* CONFIG_ZONE_DEVICE */ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) { if (pgmap) percpu_ref_put(&pgmap->ref); } #endif /* _LINUX_MEMREMAP_H_ */ |
1541 1143 1143 231 91 231 4530 4529 4531 4531 2088 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 | // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/gfp.h> #include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/fixmap.h> #include <asm/mtrr.h> #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); #endif #ifdef CONFIG_HIGHPTE #define PGTABLE_HIGHMEM __GFP_HIGHMEM #else #define PGTABLE_HIGHMEM 0 #endif #ifndef CONFIG_PARAVIRT static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) { tlb_remove_page(tlb, table); } #endif gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; pgtable_t pte_alloc_one(struct mm_struct *mm) { return __pte_alloc_one(mm, __userpte_alloc_gfp); } static int __init setup_userpte(char *arg) { if (!arg) return -EINVAL; /* * "userpte=nohigh" disables allocation of user pagetables in * high memory. */ if (strcmp(arg, "nohigh") == 0) __userpte_alloc_gfp &= ~__GFP_HIGHMEM; else return -EINVAL; return 0; } early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pagetable_pte_dtor(page_ptdesc(pte)); paravirt_release_pte(page_to_pfn(pte)); paravirt_tlb_remove_table(tlb, pte); } #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { struct ptdesc *ptdesc = virt_to_ptdesc(pmd); paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table * entries need a full cr3 reload to flush. */ #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif pagetable_pmd_dtor(ptdesc); paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { struct ptdesc *ptdesc = virt_to_ptdesc(pud); pagetable_pud_dtor(ptdesc); paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); paravirt_tlb_remove_table(tlb, virt_to_page(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ static inline void pgd_list_add(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_add(&ptdesc->pt_list, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_del(&ptdesc->pt_list); } #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) #define MAX_UNSHARED_PTRS_PER_PGD \ max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { virt_to_ptdesc(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { return page_ptdesc(page)->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the references from swapper_pg_dir. */ if (CONFIG_PGTABLE_LEVELS == 2 || (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || CONFIG_PGTABLE_LEVELS >= 4) { clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); } /* list required to sync kernel mapping updates */ if (!SHARED_KERNEL_PMD) { pgd_set_mm(pgd, mm); pgd_list_add(pgd); } } static void pgd_dtor(pgd_t *pgd) { if (SHARED_KERNEL_PMD) return; spin_lock(&pgd_lock); pgd_list_del(pgd); spin_unlock(&pgd_lock); } /* * List of all pgd's needed for non-PAE so it can invalidate entries * in both cached and uncached pgd's; not needed for PAE since the * kernel pmd is shared. If PAE were not to share the pmd a similar * tactic would be needed. This is essentially codepath-based locking * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. * -- nyc */ #ifdef CONFIG_X86_PAE /* * In PAE mode, we need to do a cr3 reload (=tlb flush) when * updating the top-level pagetable entries to guarantee the * processor notices the update. Since this is expensive, and * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. * * Also, if we're in a paravirt environment where the kernel pmd is * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate * and initialize the kernel pmds here. */ #define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD #define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD /* * We allocate separate PMDs for the kernel part of the user page-table * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. */ #define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \ KERNEL_PGD_PTRS : 0) #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); /* Note: almost everything apart from _PAGE_PRESENT is reserved at the pmd (PDPT) level. */ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); /* * According to Intel App note "TLBs, Paging-Structure Caches, * and Their Invalidation", April 2007, document 317080-001, * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... */ flush_tlb_mm(mm); } #else /* !CONFIG_X86_PAE */ /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 #define MAX_PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 #define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; struct ptdesc *ptdesc; for (i = 0; i < count; i++) if (pmds[i]) { ptdesc = virt_to_ptdesc(pmds[i]); pagetable_pmd_dtor(ptdesc); pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } } static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; gfp &= ~__GFP_HIGHMEM; for (i = 0; i < count; i++) { pmd_t *pmd = NULL; struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) failed = true; if (ptdesc && !pagetable_pmd_ctor(ptdesc)) { pagetable_free(ptdesc); ptdesc = NULL; failed = true; } if (ptdesc) { mm_inc_nr_pmds(mm); pmd = ptdesc_address(ptdesc); } pmds[i] = pmd; } if (failed) { free_pmds(mm, pmds, count); return -ENOMEM; } return 0; } /* * Mop up any pmd pages which may still be attached to the pgd. * Normally they will be freed by munmap/exit_mmap, but any pmd we * preallocate which never got a corresponding vma will need to be * freed manually. */ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) { pgd_t pgd = *pgdp; if (pgd_val(pgd) != 0) { pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); pgd_clear(pgdp); paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); } } static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; for (i = 0; i < PREALLOCATED_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i]); #ifdef CONFIG_PAGE_TABLE_ISOLATION if (!boot_cpu_has(X86_FEATURE_PTI)) return; pgdp = kernel_to_user_pgdp(pgdp); for (i = 0; i < PREALLOCATED_USER_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); #endif } static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { p4d_t *p4d; pud_t *pud; int i; p4d = p4d_offset(pgd, 0); pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, pud, pmd); } } #ifdef CONFIG_PAGE_TABLE_ISOLATION static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); p4d_t *u_p4d; pud_t *u_pud; int i; u_p4d = p4d_offset(u_pgd, 0); u_pud = pud_offset(u_p4d, 0); s_pgd += KERNEL_PGD_BOUNDARY; u_pud += KERNEL_PGD_BOUNDARY; for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { pmd_t *pmd = pmds[i]; memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, u_pud, pmd); } } #else static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { } #endif /* * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also * assumes that pgd should be in one page. * * But kernel with PAE paging that is not running as a Xen domain * only needs to allocate 32 bytes for pgd instead of one page. */ #ifdef CONFIG_X86_PAE #include <linux/slab.h> #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #define PGD_ALIGN 32 static struct kmem_cache *pgd_cache; void __init pgtable_cache_init(void) { /* * When PAE kernel is running as a Xen domain, it does not use * shared kernel pmd. And this requires a whole page for pgd. */ if (!SHARED_KERNEL_PMD) return; /* * when PAE kernel is not running as a Xen domain, it uses * shared kernel pmd. Shared kernel pmd does not require a whole * page for pgd. We are able to just allocate a 32-byte for pgd. * During boot time, we create a 32-byte slab for pgd table allocation. */ pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, SLAB_PANIC, NULL); } static inline pgd_t *_pgd_alloc(void) { /* * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, PGD_ALLOCATION_ORDER); /* * Now PAE kernel is not running as a Xen domain. We can allocate * a 32-byte slab for pgd to save memory space. */ return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER); } static inline void _pgd_free(pgd_t *pgd) { if (!SHARED_KERNEL_PMD) free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); else kmem_cache_free(pgd_cache, pgd); } #else static inline pgd_t *_pgd_alloc(void) { return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) { free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); } #endif /* CONFIG_X86_PAE */ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; pmd_t *pmds[MAX_PREALLOCATED_PMDS]; pgd = _pgd_alloc(); if (pgd == NULL) goto out; mm->pgd = pgd; if (sizeof(pmds) != 0 && preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; if (sizeof(u_pmds) != 0 && preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; if (paravirt_pgd_alloc(mm) != 0) goto out_free_user_pmds; /* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */ spin_lock(&pgd_lock); pgd_ctor(mm, pgd); if (sizeof(pmds) != 0) pgd_prepopulate_pmd(mm, pgd, pmds); if (sizeof(u_pmds) != 0) pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; out_free_user_pmds: if (sizeof(u_pmds) != 0) free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: if (sizeof(pmds) != 0) free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(pgd); out: return NULL; } void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); _pgd_free(pgd); } /* * Used to set accessed or dirty bits in the page table entries * on other architectures. On x86, the accessed and dirty bits * are tracked by hardware. However, do_wp_page calls this function * to also make the pte writeable at the same time the dirty bit is * set. In that case we do actually need to write the PTE. */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(*ptep, entry); if (changed && dirty) set_pte(ptep, entry); return changed; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed && dirty) { set_pmd(pmdp, entry); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { int changed = !pud_same(*pudp, entry); VM_BUG_ON(address & ~HPAGE_PUD_MASK); if (changed && dirty) { set_pud(pudp, entry); /* * We had a write-protection fault here and changed the pud * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } #endif int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { int ret = 0; if (pte_young(*ptep)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *) &ptep->pte); return ret; } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { int ret = 0; if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); return ret; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { int ret = 0; if (pud_young(*pudp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pudp); return ret; } #endif int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* * On x86 CPUs, clearing the accessed bit without a TLB flush * doesn't cause data corruption. [ It could cause incorrect * page aging and the (mistaken) reclaim of hot pages, but the * chance of that should be relatively low. ] * * So as a performance optimization don't flush the TLB when * clearing the accessed bit, it will eventually be flushed by * a context switch or a VM operation anyway. [ In the rare * event of it not getting flushed for a long time the delay * shouldn't really matter because there's no real memory * pressure for swapout to react to. ] */ return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { int young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { /* * No flush is necessary. Once an invalid PTE is established, the PTE's * access and dirty bits cannot be updated. */ return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); } #endif /** * reserve_top_address - reserves a hole in the top of kernel address space * @reserve - size of hole to reserve * * Can be used to relocate the fixmap area and poke a hole in the top * of kernel address space to make room for a hypervisor. */ void __init reserve_top_address(unsigned long reserve) { #ifdef CONFIG_X86_32 BUG_ON(fixmaps_set > 0); __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", -reserve, __FIXADDR_TOP + PAGE_SIZE); #endif } int fixmaps_set; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); #ifdef CONFIG_X86_64 /* * Ensure that the static initial page tables are covering the * fixmap completely. */ BUILD_BUG_ON(__end_of_permanent_fixed_addresses > (FIXMAP_PMD_NUM * PTRS_PER_PTE)); #endif if (idx >= __end_of_fixed_addresses) { BUG(); return; } set_pte_vaddr(address, pte); fixmaps_set++; } void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { /* Sanitize 'prot' against any unsupported bits: */ pgprot_val(flags) &= __default_kernel_pte_mask; __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifdef CONFIG_X86_5LEVEL /** * p4d_set_huge - setup kernel P4D mapping * * No 512GB pages yet -- always return 0 */ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } /** * p4d_clear_huge - clear kernel P4D mapping when it is set * * No 512GB pages yet -- always return 0 */ void p4d_clear_huge(p4d_t *p4d) { } #endif /** * pud_set_huge - setup kernel PUD mapping * * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this * function sets up a huge page only if the complete range has the same MTRR * caching mode. * * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger * page mapping attempt fails. * * Returns 1 on success and 0 on failure. */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); if (!uniform) return 0; /* Bail out if we are we on a populated non-leaf entry: */ if (pud_present(*pud) && !pud_huge(*pud)) return 0; set_pte((pte_t *)pud, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pmd_set_huge - setup kernel PMD mapping * * See text over pud_set_huge() above. * * Returns 1 on success and 0 on failure. */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); if (!uniform) { pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", __func__, addr, addr + PMD_SIZE); return 0; } /* Bail out if we are we on a populated non-leaf entry: */ if (pmd_present(*pmd) && !pmd_huge(*pmd)) return 0; set_pte((pte_t *)pmd, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pud_clear_huge - clear kernel PUD mapping when it is set * * Returns 1 on success and 0 on failure (no PUD map is found). */ int pud_clear_huge(pud_t *pud) { if (pud_large(*pud)) { pud_clear(pud); return 1; } return 0; } /** * pmd_clear_huge - clear kernel PMD mapping when it is set * * Returns 1 on success and 0 on failure (no PMD map is found). */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_large(*pmd)) { pmd_clear(pmd); return 1; } return 0; } #ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear pud entry and free pmd page. * @pud: Pointer to a PUD. * @addr: Virtual address associated with pud. * * Context: The pud range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. * * NOTE: Callers must allow a single page allocation. */ int pud_free_pmd_page(pud_t *pud, unsigned long addr) { pmd_t *pmd, *pmd_sv; pte_t *pte; int i; pmd = pud_pgtable(*pud); pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); if (!pmd_sv) return 0; for (i = 0; i < PTRS_PER_PMD; i++) { pmd_sv[i] = pmd[i]; if (!pmd_none(pmd[i])) pmd_clear(&pmd[i]); } pud_clear(pud); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(pmd_sv[i])) { pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); free_page((unsigned long)pte); } } free_page((unsigned long)pmd_sv); pagetable_pmd_dtor(virt_to_ptdesc(pmd)); free_page((unsigned long)pmd); return 1; } /** * pmd_free_pte_page - Clear pmd entry and free pte page. * @pmd: Pointer to a PMD. * @addr: Virtual address associated with pmd. * * Context: The pmd range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { pte_t *pte; pte = (pte_t *)pmd_page_vaddr(*pmd); pmd_clear(pmd); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); free_page((unsigned long)pte); return 1; } #else /* !CONFIG_X86_64 */ /* * Disable free page handling on x86-PAE. This assures that ioremap() * does not update sync'd pmd entries. See vmalloc_sync_one(). */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return pmd_none(*pmd); } #endif /* CONFIG_X86_64 */ #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pte_mkwrite_shstk(pte); pte = pte_mkwrite_novma(pte); return pte_clear_saveddirty(pte); } pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pmd_mkwrite_shstk(pmd); pmd = pmd_mkwrite_novma(pmd); return pmd_clear_saveddirty(pmd); } void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { /* * Hardware before shadow stack can (rarely) set Dirty=1 * on a Write=0 PTE. So the below condition * only indicates a software bug when shadow stack is * supported by the HW. This checking is covered in * pte_shstk(). */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pte_shstk(pte)); } void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pmd_shstk(pmd)); } |
2641 2643 2643 2920 2922 2922 1666 2914 2642 2643 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/init.h> #include <linux/scatterlist.h> #include <linux/mempool.h> #include <linux/slab.h> #define SG_MEMPOOL_NR ARRAY_SIZE(sg_pools) #define SG_MEMPOOL_SIZE 2 struct sg_pool { size_t size; char *name; struct kmem_cache *slab; mempool_t *pool; }; #define SP(x) { .size = x, "sgpool-" __stringify(x) } #if (SG_CHUNK_SIZE < 32) #error SG_CHUNK_SIZE is too small (must be 32 or greater) #endif static struct sg_pool sg_pools[] = { SP(8), SP(16), #if (SG_CHUNK_SIZE > 32) SP(32), #if (SG_CHUNK_SIZE > 64) SP(64), #if (SG_CHUNK_SIZE > 128) SP(128), #if (SG_CHUNK_SIZE > 256) #error SG_CHUNK_SIZE is too large (256 MAX) #endif #endif #endif #endif SP(SG_CHUNK_SIZE) }; #undef SP static inline unsigned int sg_pool_index(unsigned short nents) { unsigned int index; BUG_ON(nents > SG_CHUNK_SIZE); if (nents <= 8) index = 0; else index = get_count_order(nents) - 3; return index; } static void sg_pool_free(struct scatterlist *sgl, unsigned int nents) { struct sg_pool *sgp; sgp = sg_pools + sg_pool_index(nents); mempool_free(sgl, sgp->pool); } static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask) { struct sg_pool *sgp; sgp = sg_pools + sg_pool_index(nents); return mempool_alloc(sgp->pool, gfp_mask); } /** * sg_free_table_chained - Free a previously mapped sg table * @table: The sg table header to use * @nents_first_chunk: size of the first_chunk SGL passed to * sg_alloc_table_chained * * Description: * Free an sg table previously allocated and setup with * sg_alloc_table_chained(). * * @nents_first_chunk has to be same with that same parameter passed * to sg_alloc_table_chained(). * **/ void sg_free_table_chained(struct sg_table *table, unsigned nents_first_chunk) { if (table->orig_nents <= nents_first_chunk) return; if (nents_first_chunk == 1) nents_first_chunk = 0; __sg_free_table(table, SG_CHUNK_SIZE, nents_first_chunk, sg_pool_free, table->orig_nents); } EXPORT_SYMBOL_GPL(sg_free_table_chained); /** * sg_alloc_table_chained - Allocate and chain SGLs in an sg table * @table: The sg table header to use * @nents: Number of entries in sg list * @first_chunk: first SGL * @nents_first_chunk: number of the SGL of @first_chunk * * Description: * Allocate and chain SGLs in an sg table. If @nents@ is larger than * @nents_first_chunk a chained sg table will be setup. @first_chunk is * ignored if nents_first_chunk <= 1 because user expects the SGL points * non-chain SGL. * **/ int sg_alloc_table_chained(struct sg_table *table, int nents, struct scatterlist *first_chunk, unsigned nents_first_chunk) { int ret; BUG_ON(!nents); if (first_chunk && nents_first_chunk) { if (nents <= nents_first_chunk) { table->nents = table->orig_nents = nents; sg_init_table(table->sgl, nents); return 0; } } /* User supposes that the 1st SGL includes real entry */ if (nents_first_chunk <= 1) { first_chunk = NULL; nents_first_chunk = 0; } ret = __sg_alloc_table(table, nents, SG_CHUNK_SIZE, first_chunk, nents_first_chunk, GFP_ATOMIC, sg_pool_alloc); if (unlikely(ret)) sg_free_table_chained(table, nents_first_chunk); return ret; } EXPORT_SYMBOL_GPL(sg_alloc_table_chained); static __init int sg_pool_init(void) { int i; for (i = 0; i < SG_MEMPOOL_NR; i++) { struct sg_pool *sgp = sg_pools + i; int size = sgp->size * sizeof(struct scatterlist); sgp->slab = kmem_cache_create(sgp->name, size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!sgp->slab) { printk(KERN_ERR "SG_POOL: can't init sg slab %s\n", sgp->name); goto cleanup_sdb; } sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, sgp->slab); if (!sgp->pool) { printk(KERN_ERR "SG_POOL: can't init sg mempool %s\n", sgp->name); goto cleanup_sdb; } } return 0; cleanup_sdb: for (i = 0; i < SG_MEMPOOL_NR; i++) { struct sg_pool *sgp = sg_pools + i; mempool_destroy(sgp->pool); kmem_cache_destroy(sgp->slab); } return -ENOMEM; } subsys_initcall(sg_pool_init); |
30 2483 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IVERSION_H #define _LINUX_IVERSION_H #include <linux/fs.h> /* * The inode->i_version field: * --------------------------- * The change attribute (i_version) is mandated by NFSv4 and is mostly for * knfsd, but is also used for other purposes (e.g. IMA). The i_version must * appear larger to observers if there was an explicit change to the inode's * data or metadata since it was last queried. * * An explicit change is one that would ordinarily result in a change to the * inode status change time (aka ctime). i_version must appear to change, even * if the ctime does not (since the whole point is to avoid missing updates due * to timestamp granularity). If POSIX or other relevant spec mandates that the * ctime must change due to an operation, then the i_version counter must be * incremented as well. * * Making the i_version update completely atomic with the operation itself would * be prohibitively expensive. Traditionally the kernel has updated the times on * directories after an operation that changes its contents. For regular files, * the ctime is usually updated before the data is copied into the cache for a * write. This means that there is a window of time when an observer can * associate a new timestamp with old file contents. Since the purpose of the * i_version is to allow for better cache coherency, the i_version must always * be updated after the results of the operation are visible. Updating it before * and after a change is also permitted. (Note that no filesystems currently do * this. Fixing that is a work-in-progress). * * Observers see the i_version as a 64-bit number that never decreases. If it * remains the same since it was last checked, then nothing has changed in the * inode. If it's different then something has changed. Observers cannot infer * anything about the nature or magnitude of the changes from the value, only * that the inode has changed in some fashion. * * Not all filesystems properly implement the i_version counter. Subsystems that * want to use i_version field on an inode should first check whether the * filesystem sets the SB_I_VERSION flag (usually via the IS_I_VERSION macro). * * Those that set SB_I_VERSION will automatically have their i_version counter * incremented on writes to normal files. If the SB_I_VERSION is not set, then * the VFS will not touch it on writes, and the filesystem can use it how it * wishes. Note that the filesystem is always responsible for updating the * i_version on namespace changes in directories (mkdir, rmdir, unlink, etc.). * We consider these sorts of filesystems to have a kernel-managed i_version. * * It may be impractical for filesystems to keep i_version updates atomic with * respect to the changes that cause them. They should, however, guarantee * that i_version updates are never visible before the changes that caused * them. Also, i_version updates should never be delayed longer than it takes * the original change to reach disk. * * This implementation uses the low bit in the i_version field as a flag to * track when the value has been queried. If it has not been queried since it * was last incremented, we can skip the increment in most cases. * * In the event that we're updating the ctime, we will usually go ahead and * bump the i_version anyway. Since that has to go to stable storage in some * fashion, we might as well increment it as well. * * With this implementation, the value should always appear to observers to * increase over time if the file has changed. It's recommended to use * inode_eq_iversion() helper to compare values. * * Note that some filesystems (e.g. NFS and AFS) just use the field to store * a server-provided value (for the most part). For that reason, those * filesystems do not set SB_I_VERSION. These filesystems are considered to * have a self-managed i_version. * * Persistently storing the i_version * ---------------------------------- * Queries of the i_version field are not gated on them hitting the backing * store. It's always possible that the host could crash after allowing * a query of the value but before it has made it to disk. * * To mitigate this problem, filesystems should always use * inode_set_iversion_queried when loading an existing inode from disk. This * ensures that the next attempted inode increment will result in the value * changing. * * Storing the value to disk therefore does not count as a query, so those * filesystems should use inode_peek_iversion to grab the value to be stored. * There is no need to flag the value as having been queried in that case. */ /* * We borrow the lowest bit in the i_version to use as a flag to tell whether * it has been queried since we last incremented it. If it has, then we must * increment it on the next change. After that, we can clear the flag and * avoid incrementing it again until it has again been queried. */ #define I_VERSION_QUERIED_SHIFT (1) #define I_VERSION_QUERIED (1ULL << (I_VERSION_QUERIED_SHIFT - 1)) #define I_VERSION_INCREMENT (1ULL << I_VERSION_QUERIED_SHIFT) /** * inode_set_iversion_raw - set i_version to the specified raw value * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val. This function is for use by * filesystems that self-manage the i_version. * * For example, the NFS client stores its NFSv4 change attribute in this way, * and the AFS client stores the data_version from the server here. */ static inline void inode_set_iversion_raw(struct inode *inode, u64 val) { atomic64_set(&inode->i_version, val); } /** * inode_peek_iversion_raw - grab a "raw" iversion value * @inode: inode from which i_version should be read * * Grab a "raw" inode->i_version value and return it. The i_version is not * flagged or converted in any way. This is mostly used to access a self-managed * i_version. * * With those filesystems, we want to treat the i_version as an entirely * opaque value. */ static inline u64 inode_peek_iversion_raw(const struct inode *inode) { return atomic64_read(&inode->i_version); } /** * inode_set_max_iversion_raw - update i_version new value is larger * @inode: inode to set * @val: new i_version to set * * Some self-managed filesystems (e.g Ceph) will only update the i_version * value if the new value is larger than the one we already have. */ static inline void inode_set_max_iversion_raw(struct inode *inode, u64 val) { u64 cur = inode_peek_iversion_raw(inode); do { if (cur > val) break; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, val)); } /** * inode_set_iversion - set i_version to a particular value * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val. This function is for filesystems with * a kernel-managed i_version, for initializing a newly-created inode from * scratch. * * In this case, we do not set the QUERIED flag since we know that this value * has never been queried. */ static inline void inode_set_iversion(struct inode *inode, u64 val) { inode_set_iversion_raw(inode, val << I_VERSION_QUERIED_SHIFT); } /** * inode_set_iversion_queried - set i_version to a particular value as quereied * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val, and flag it for increment on the next * change. * * Filesystems that persistently store the i_version on disk should use this * when loading an existing inode from disk. * * When loading in an i_version value from a backing store, we can't be certain * that it wasn't previously viewed before being stored. Thus, we must assume * that it was, to ensure that we don't end up handing out the same value for * different versions of the same inode. */ static inline void inode_set_iversion_queried(struct inode *inode, u64 val) { inode_set_iversion_raw(inode, (val << I_VERSION_QUERIED_SHIFT) | I_VERSION_QUERIED); } bool inode_maybe_inc_iversion(struct inode *inode, bool force); /** * inode_inc_iversion - forcibly increment i_version * @inode: inode that needs to be updated * * Forcbily increment the i_version field. This always results in a change to * the observable value. */ static inline void inode_inc_iversion(struct inode *inode) { inode_maybe_inc_iversion(inode, true); } /** * inode_iversion_need_inc - is the i_version in need of being incremented? * @inode: inode to check * * Returns whether the inode->i_version counter needs incrementing on the next * change. Just fetch the value and check the QUERIED flag. */ static inline bool inode_iversion_need_inc(struct inode *inode) { return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED; } /** * inode_inc_iversion_raw - forcibly increment raw i_version * @inode: inode that needs to be updated * * Forcbily increment the raw i_version field. This always results in a change * to the raw value. * * NFS will use the i_version field to store the value from the server. It * mostly treats it as opaque, but in the case where it holds a write * delegation, it must increment the value itself. This function does that. */ static inline void inode_inc_iversion_raw(struct inode *inode) { atomic64_inc(&inode->i_version); } /** * inode_peek_iversion - read i_version without flagging it to be incremented * @inode: inode from which i_version should be read * * Read the inode i_version counter for an inode without registering it as a * query. * * This is typically used by local filesystems that need to store an i_version * on disk. In that situation, it's not necessary to flag it as having been * viewed, as the result won't be used to gauge changes from that point. */ static inline u64 inode_peek_iversion(const struct inode *inode) { return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT; } /* * For filesystems without any sort of change attribute, the best we can * do is fake one up from the ctime: */ static inline u64 time_to_chattr(const struct timespec64 *t) { u64 chattr = t->tv_sec; chattr <<= 32; chattr += t->tv_nsec; return chattr; } u64 inode_query_iversion(struct inode *inode); /** * inode_eq_iversion_raw - check whether the raw i_version counter has changed * @inode: inode to check * @old: old value to check against its i_version * * Compare the current raw i_version counter with a previous one. Returns true * if they are the same or false if they are different. */ static inline bool inode_eq_iversion_raw(const struct inode *inode, u64 old) { return inode_peek_iversion_raw(inode) == old; } /** * inode_eq_iversion - check whether the i_version counter has changed * @inode: inode to check * @old: old value to check against its i_version * * Compare an i_version counter with a previous one. Returns true if they are * the same, and false if they are different. * * Note that we don't need to set the QUERIED flag in this case, as the value * in the inode is not being recorded for later use. */ static inline bool inode_eq_iversion(const struct inode *inode, u64 old) { return inode_peek_iversion(inode) == old; } #endif |
34 36 1176 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 | /* * include/linux/ktime.h * * ktime_t - nanosecond-resolution time format. * * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar * * data type definitions, declarations, prototypes and macros. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: * * Roman Zippel provided the ideas and primary code snippets of * the ktime_t union and further simplifications of the original * code. * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_KTIME_H #define _LINUX_KTIME_H #include <asm/bug.h> #include <linux/jiffies.h> #include <linux/time.h> #include <linux/types.h> /** * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value * @secs: seconds to set * @nsecs: nanoseconds to set * * Return: The ktime_t representation of the value. */ static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs) { if (unlikely(secs >= KTIME_SEC_MAX)) return KTIME_MAX; return secs * NSEC_PER_SEC + (s64)nsecs; } /* Subtract two ktime_t variables. rem = lhs -rhs: */ #define ktime_sub(lhs, rhs) ((lhs) - (rhs)) /* Add two ktime_t variables. res = lhs + rhs: */ #define ktime_add(lhs, rhs) ((lhs) + (rhs)) /* * Same as ktime_add(), but avoids undefined behaviour on overflow; however, * this means that you must check the result for overflow yourself. */ #define ktime_add_unsafe(lhs, rhs) ((u64) (lhs) + (rhs)) /* * Add a ktime_t variable and a scalar nanosecond value. * res = kt + nsval: */ #define ktime_add_ns(kt, nsval) ((kt) + (nsval)) /* * Subtract a scalar nanosecod from a ktime_t variable * res = kt - nsval: */ #define ktime_sub_ns(kt, nsval) ((kt) - (nsval)) /* convert a timespec64 to ktime_t format: */ static inline ktime_t timespec64_to_ktime(struct timespec64 ts) { return ktime_set(ts.tv_sec, ts.tv_nsec); } /* Map the ktime_t to timespec conversion to ns_to_timespec function */ #define ktime_to_timespec64(kt) ns_to_timespec64((kt)) /* Convert ktime_t to nanoseconds */ static inline s64 ktime_to_ns(const ktime_t kt) { return kt; } /** * ktime_compare - Compares two ktime_t variables for less, greater or equal * @cmp1: comparable1 * @cmp2: comparable2 * * Return: ... * cmp1 < cmp2: return <0 * cmp1 == cmp2: return 0 * cmp1 > cmp2: return >0 */ static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) { if (cmp1 < cmp2) return -1; if (cmp1 > cmp2) return 1; return 0; } /** * ktime_after - Compare if a ktime_t value is bigger than another one. * @cmp1: comparable1 * @cmp2: comparable2 * * Return: true if cmp1 happened after cmp2. */ static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) { return ktime_compare(cmp1, cmp2) > 0; } /** * ktime_before - Compare if a ktime_t value is smaller than another one. * @cmp1: comparable1 * @cmp2: comparable2 * * Return: true if cmp1 happened before cmp2. */ static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) { return ktime_compare(cmp1, cmp2) < 0; } #if BITS_PER_LONG < 64 extern s64 __ktime_divns(const ktime_t kt, s64 div); static inline s64 ktime_divns(const ktime_t kt, s64 div) { /* * Negative divisors could cause an inf loop, * so bug out here. */ BUG_ON(div < 0); if (__builtin_constant_p(div) && !(div >> 32)) { s64 ns = kt; u64 tmp = ns < 0 ? -ns : ns; do_div(tmp, div); return ns < 0 ? -tmp : tmp; } else { return __ktime_divns(kt, div); } } #else /* BITS_PER_LONG < 64 */ static inline s64 ktime_divns(const ktime_t kt, s64 div) { /* * 32-bit implementation cannot handle negative divisors, * so catch them on 64bit as well. */ WARN_ON(div < 0); return kt / div; } #endif static inline s64 ktime_to_us(const ktime_t kt) { return ktime_divns(kt, NSEC_PER_USEC); } static inline s64 ktime_to_ms(const ktime_t kt) { return ktime_divns(kt, NSEC_PER_MSEC); } static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier) { return ktime_to_us(ktime_sub(later, earlier)); } static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier) { return ktime_to_ms(ktime_sub(later, earlier)); } static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec) { return ktime_add_ns(kt, usec * NSEC_PER_USEC); } static inline ktime_t ktime_add_ms(const ktime_t kt, const u64 msec) { return ktime_add_ns(kt, msec * NSEC_PER_MSEC); } static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec) { return ktime_sub_ns(kt, usec * NSEC_PER_USEC); } static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec) { return ktime_sub_ns(kt, msec * NSEC_PER_MSEC); } extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs); /** * ktime_to_timespec64_cond - convert a ktime_t variable to timespec64 * format only if the variable contains data * @kt: the ktime_t variable to convert * @ts: the timespec variable to store the result in * * Return: %true if there was a successful conversion, %false if kt was 0. */ static inline __must_check bool ktime_to_timespec64_cond(const ktime_t kt, struct timespec64 *ts) { if (kt) { *ts = ktime_to_timespec64(kt); return true; } else { return false; } } #include <vdso/ktime.h> static inline ktime_t ns_to_ktime(u64 ns) { return ns; } static inline ktime_t ms_to_ktime(u64 ms) { return ms * NSEC_PER_MSEC; } # include <linux/timekeeping.h> #endif |
1 1 1 1 7 3 1 1 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 | // SPDX-License-Identifier: GPL-2.0-or-later /* * af_alg: User-space algorithm interface * * This file provides the user-space API for algorithms. * * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au> */ #include <linux/atomic.h> #include <crypto/if_alg.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/key.h> #include <linux/key-type.h> #include <linux/list.h> #include <linux/module.h> #include <linux/net.h> #include <linux/rwsem.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/security.h> #include <linux/string.h> #include <keys/user-type.h> #include <keys/trusted-type.h> #include <keys/encrypted-type.h> struct alg_type_list { const struct af_alg_type *type; struct list_head list; }; static struct proto alg_proto = { .name = "ALG", .owner = THIS_MODULE, .obj_size = sizeof(struct alg_sock), }; static LIST_HEAD(alg_types); static DECLARE_RWSEM(alg_types_sem); static const struct af_alg_type *alg_get_type(const char *name) { const struct af_alg_type *type = ERR_PTR(-ENOENT); struct alg_type_list *node; down_read(&alg_types_sem); list_for_each_entry(node, &alg_types, list) { if (strcmp(node->type->name, name)) continue; if (try_module_get(node->type->owner)) type = node->type; break; } up_read(&alg_types_sem); return type; } int af_alg_register_type(const struct af_alg_type *type) { struct alg_type_list *node; int err = -EEXIST; down_write(&alg_types_sem); list_for_each_entry(node, &alg_types, list) { if (!strcmp(node->type->name, type->name)) goto unlock; } node = kmalloc(sizeof(*node), GFP_KERNEL); err = -ENOMEM; if (!node) goto unlock; type->ops->owner = THIS_MODULE; if (type->ops_nokey) type->ops_nokey->owner = THIS_MODULE; node->type = type; list_add(&node->list, &alg_types); err = 0; unlock: up_write(&alg_types_sem); return err; } EXPORT_SYMBOL_GPL(af_alg_register_type); int af_alg_unregister_type(const struct af_alg_type *type) { struct alg_type_list *node; int err = -ENOENT; down_write(&alg_types_sem); list_for_each_entry(node, &alg_types, list) { if (strcmp(node->type->name, type->name)) continue; list_del(&node->list); kfree(node); err = 0; break; } up_write(&alg_types_sem); return err; } EXPORT_SYMBOL_GPL(af_alg_unregister_type); static void alg_do_release(const struct af_alg_type *type, void *private) { if (!type) return; type->release(private); module_put(type->owner); } int af_alg_release(struct socket *sock) { if (sock->sk) { sock_put(sock->sk); sock->sk = NULL; } return 0; } EXPORT_SYMBOL_GPL(af_alg_release); void af_alg_release_parent(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); unsigned int nokey = atomic_read(&ask->nokey_refcnt); sk = ask->parent; ask = alg_sk(sk); if (nokey) atomic_dec(&ask->nokey_refcnt); if (atomic_dec_and_test(&ask->refcnt)) sock_put(sk); } EXPORT_SYMBOL_GPL(af_alg_release_parent); static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { const u32 allowed = CRYPTO_ALG_KERN_DRIVER_ONLY; struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct sockaddr_alg_new *sa = (void *)uaddr; const struct af_alg_type *type; void *private; int err; if (sock->state == SS_CONNECTED) return -EINVAL; BUILD_BUG_ON(offsetof(struct sockaddr_alg_new, salg_name) != offsetof(struct sockaddr_alg, salg_name)); BUILD_BUG_ON(offsetof(struct sockaddr_alg, salg_name) != sizeof(*sa)); if (addr_len < sizeof(*sa) + 1) return -EINVAL; /* If caller uses non-allowed flag, return error. */ if ((sa->salg_feat & ~allowed) || (sa->salg_mask & ~allowed)) return -EINVAL; sa->salg_type[sizeof(sa->salg_type) - 1] = 0; sa->salg_name[addr_len - sizeof(*sa) - 1] = 0; type = alg_get_type(sa->salg_type); if (PTR_ERR(type) == -ENOENT) { request_module("algif-%s", sa->salg_type); type = alg_get_type(sa->salg_type); } if (IS_ERR(type)) return PTR_ERR(type); private = type->bind(sa->salg_name, sa->salg_feat, sa->salg_mask); if (IS_ERR(private)) { module_put(type->owner); return PTR_ERR(private); } err = -EBUSY; lock_sock(sk); if (atomic_read(&ask->refcnt)) goto unlock; swap(ask->type, type); swap(ask->private, private); err = 0; unlock: release_sock(sk); alg_do_release(type, private); return err; } static int alg_setkey(struct sock *sk, sockptr_t ukey, unsigned int keylen) { struct alg_sock *ask = alg_sk(sk); const struct af_alg_type *type = ask->type; u8 *key; int err; key = sock_kmalloc(sk, keylen, GFP_KERNEL); if (!key) return -ENOMEM; err = -EFAULT; if (copy_from_sockptr(key, ukey, keylen)) goto out; err = type->setkey(ask->private, key, keylen); out: sock_kzfree_s(sk, key, keylen); return err; } #ifdef CONFIG_KEYS static const u8 *key_data_ptr_user(const struct key *key, unsigned int *datalen) { const struct user_key_payload *ukp; ukp = user_key_payload_locked(key); if (IS_ERR_OR_NULL(ukp)) return ERR_PTR(-EKEYREVOKED); *datalen = key->datalen; return ukp->data; } static const u8 *key_data_ptr_encrypted(const struct key *key, unsigned int *datalen) { const struct encrypted_key_payload *ekp; ekp = dereference_key_locked(key); if (IS_ERR_OR_NULL(ekp)) return ERR_PTR(-EKEYREVOKED); *datalen = ekp->decrypted_datalen; return ekp->decrypted_data; } static const u8 *key_data_ptr_trusted(const struct key *key, unsigned int *datalen) { const struct trusted_key_payload *tkp; tkp = dereference_key_locked(key); if (IS_ERR_OR_NULL(tkp)) return ERR_PTR(-EKEYREVOKED); *datalen = tkp->key_len; return tkp->key; } static struct key *lookup_key(key_serial_t serial) { key_ref_t key_ref; key_ref = lookup_user_key(serial, 0, KEY_NEED_SEARCH); if (IS_ERR(key_ref)) return ERR_CAST(key_ref); return key_ref_to_ptr(key_ref); } static int alg_setkey_by_key_serial(struct alg_sock *ask, sockptr_t optval, unsigned int optlen) { const struct af_alg_type *type = ask->type; u8 *key_data = NULL; unsigned int key_datalen; key_serial_t serial; struct key *key; const u8 *ret; int err; if (optlen != sizeof(serial)) return -EINVAL; if (copy_from_sockptr(&serial, optval, optlen)) return -EFAULT; key = lookup_key(serial); if (IS_ERR(key)) return PTR_ERR(key); down_read(&key->sem); ret = ERR_PTR(-ENOPROTOOPT); if (!strcmp(key->type->name, "user") || !strcmp(key->type->name, "logon")) { ret = key_data_ptr_user(key, &key_datalen); } else if (IS_REACHABLE(CONFIG_ENCRYPTED_KEYS) && !strcmp(key->type->name, "encrypted")) { ret = key_data_ptr_encrypted(key, &key_datalen); } else if (IS_REACHABLE(CONFIG_TRUSTED_KEYS) && !strcmp(key->type->name, "trusted")) { ret = key_data_ptr_trusted(key, &key_datalen); } if (IS_ERR(ret)) { up_read(&key->sem); key_put(key); return PTR_ERR(ret); } key_data = sock_kmalloc(&ask->sk, key_datalen, GFP_KERNEL); if (!key_data) { up_read(&key->sem); key_put(key); return -ENOMEM; } memcpy(key_data, ret, key_datalen); up_read(&key->sem); key_put(key); err = type->setkey(ask->private, key_data, key_datalen); sock_kzfree_s(&ask->sk, key_data, key_datalen); return err; } #else static inline int alg_setkey_by_key_serial(struct alg_sock *ask, sockptr_t optval, unsigned int optlen) { return -ENOPROTOOPT; } #endif static int alg_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); const struct af_alg_type *type; int err = -EBUSY; lock_sock(sk); if (atomic_read(&ask->refcnt) != atomic_read(&ask->nokey_refcnt)) goto unlock; type = ask->type; err = -ENOPROTOOPT; if (level != SOL_ALG || !type) goto unlock; switch (optname) { case ALG_SET_KEY: case ALG_SET_KEY_BY_KEY_SERIAL: if (sock->state == SS_CONNECTED) goto unlock; if (!type->setkey) goto unlock; if (optname == ALG_SET_KEY_BY_KEY_SERIAL) err = alg_setkey_by_key_serial(ask, optval, optlen); else err = alg_setkey(sk, optval, optlen); break; case ALG_SET_AEAD_AUTHSIZE: if (sock->state == SS_CONNECTED) goto unlock; if (!type->setauthsize) goto unlock; err = type->setauthsize(ask->private, optlen); break; case ALG_SET_DRBG_ENTROPY: if (sock->state == SS_CONNECTED) goto unlock; if (!type->setentropy) goto unlock; err = type->setentropy(ask->private, optval, optlen); } unlock: release_sock(sk); return err; } int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern) { struct alg_sock *ask = alg_sk(sk); const struct af_alg_type *type; struct sock *sk2; unsigned int nokey; int err; lock_sock(sk); type = ask->type; err = -EINVAL; if (!type) goto unlock; sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, kern); err = -ENOMEM; if (!sk2) goto unlock; sock_init_data(newsock, sk2); security_sock_graft(sk2, newsock); security_sk_clone(sk, sk2); /* * newsock->ops assigned here to allow type->accept call to override * them when required. */ newsock->ops = type->ops; err = type->accept(ask->private, sk2); nokey = err == -ENOKEY; if (nokey && type->accept_nokey) err = type->accept_nokey(ask->private, sk2); if (err) goto unlock; if (atomic_inc_return_relaxed(&ask->refcnt) == 1) sock_hold(sk); if (nokey) { atomic_inc(&ask->nokey_refcnt); atomic_set(&alg_sk(sk2)->nokey_refcnt, 1); } alg_sk(sk2)->parent = sk; alg_sk(sk2)->type = type; newsock->state = SS_CONNECTED; if (nokey) newsock->ops = type->ops_nokey; err = 0; unlock: release_sock(sk); return err; } EXPORT_SYMBOL_GPL(af_alg_accept); static int alg_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { return af_alg_accept(sock->sk, newsock, kern); } static const struct proto_ops alg_proto_ops = { .family = PF_ALG, .owner = THIS_MODULE, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .getname = sock_no_getname, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .bind = alg_bind, .release = af_alg_release, .setsockopt = alg_setsockopt, .accept = alg_accept, }; static void alg_sock_destruct(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); alg_do_release(ask->type, ask->private); } static int alg_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; int err; if (sock->type != SOCK_SEQPACKET) return -ESOCKTNOSUPPORT; if (protocol != 0) return -EPROTONOSUPPORT; err = -ENOMEM; sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto, kern); if (!sk) goto out; sock->ops = &alg_proto_ops; sock_init_data(sock, sk); sk->sk_destruct = alg_sock_destruct; return 0; out: return err; } static const struct net_proto_family alg_family = { .family = PF_ALG, .create = alg_create, .owner = THIS_MODULE, }; static void af_alg_link_sg(struct af_alg_sgl *sgl_prev, struct af_alg_sgl *sgl_new) { sg_unmark_end(sgl_prev->sgt.sgl + sgl_prev->sgt.nents - 1); sg_chain(sgl_prev->sgt.sgl, sgl_prev->sgt.nents + 1, sgl_new->sgt.sgl); } void af_alg_free_sg(struct af_alg_sgl *sgl) { int i; if (sgl->sgt.sgl) { if (sgl->need_unpin) for (i = 0; i < sgl->sgt.nents; i++) unpin_user_page(sg_page(&sgl->sgt.sgl[i])); if (sgl->sgt.sgl != sgl->sgl) kvfree(sgl->sgt.sgl); sgl->sgt.sgl = NULL; } } EXPORT_SYMBOL_GPL(af_alg_free_sg); static int af_alg_cmsg_send(struct msghdr *msg, struct af_alg_control *con) { struct cmsghdr *cmsg; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_ALG) continue; switch (cmsg->cmsg_type) { case ALG_SET_IV: if (cmsg->cmsg_len < CMSG_LEN(sizeof(*con->iv))) return -EINVAL; con->iv = (void *)CMSG_DATA(cmsg); if (cmsg->cmsg_len < CMSG_LEN(con->iv->ivlen + sizeof(*con->iv))) return -EINVAL; break; case ALG_SET_OP: if (cmsg->cmsg_len < CMSG_LEN(sizeof(u32))) return -EINVAL; con->op = *(u32 *)CMSG_DATA(cmsg); break; case ALG_SET_AEAD_ASSOCLEN: if (cmsg->cmsg_len < CMSG_LEN(sizeof(u32))) return -EINVAL; con->aead_assoclen = *(u32 *)CMSG_DATA(cmsg); break; default: return -EINVAL; } } return 0; } /** * af_alg_alloc_tsgl - allocate the TX SGL * * @sk: socket of connection to user space * Return: 0 upon success, < 0 upon error */ static int af_alg_alloc_tsgl(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; struct af_alg_tsgl *sgl; struct scatterlist *sg = NULL; sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl, list); if (!list_empty(&ctx->tsgl_list)) sg = sgl->sg; if (!sg || sgl->cur >= MAX_SGL_ENTS) { sgl = sock_kmalloc(sk, struct_size(sgl, sg, (MAX_SGL_ENTS + 1)), GFP_KERNEL); if (!sgl) return -ENOMEM; sg_init_table(sgl->sg, MAX_SGL_ENTS + 1); sgl->cur = 0; if (sg) sg_chain(sg, MAX_SGL_ENTS + 1, sgl->sg); list_add_tail(&sgl->list, &ctx->tsgl_list); } return 0; } /** * af_alg_count_tsgl - Count number of TX SG entries * * The counting starts from the beginning of the SGL to @bytes. If * an @offset is provided, the counting of the SG entries starts at the @offset. * * @sk: socket of connection to user space * @bytes: Count the number of SG entries holding given number of bytes. * @offset: Start the counting of SG entries from the given offset. * Return: Number of TX SG entries found given the constraints */ unsigned int af_alg_count_tsgl(struct sock *sk, size_t bytes, size_t offset) { const struct alg_sock *ask = alg_sk(sk); const struct af_alg_ctx *ctx = ask->private; const struct af_alg_tsgl *sgl; unsigned int i; unsigned int sgl_count = 0; if (!bytes) return 0; list_for_each_entry(sgl, &ctx->tsgl_list, list) { const struct scatterlist *sg = sgl->sg; for (i = 0; i < sgl->cur; i++) { size_t bytes_count; /* Skip offset */ if (offset >= sg[i].length) { offset -= sg[i].length; bytes -= sg[i].length; continue; } bytes_count = sg[i].length - offset; offset = 0; sgl_count++; /* If we have seen requested number of bytes, stop */ if (bytes_count >= bytes) return sgl_count; bytes -= bytes_count; } } return sgl_count; } EXPORT_SYMBOL_GPL(af_alg_count_tsgl); /** * af_alg_pull_tsgl - Release the specified buffers from TX SGL * * If @dst is non-null, reassign the pages to @dst. The caller must release * the pages. If @dst_offset is given only reassign the pages to @dst starting * at the @dst_offset (byte). The caller must ensure that @dst is large * enough (e.g. by using af_alg_count_tsgl with the same offset). * * @sk: socket of connection to user space * @used: Number of bytes to pull from TX SGL * @dst: If non-NULL, buffer is reassigned to dst SGL instead of releasing. The * caller must release the buffers in dst. * @dst_offset: Reassign the TX SGL from given offset. All buffers before * reaching the offset is released. */ void af_alg_pull_tsgl(struct sock *sk, size_t used, struct scatterlist *dst, size_t dst_offset) { struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; struct af_alg_tsgl *sgl; struct scatterlist *sg; unsigned int i, j = 0; while (!list_empty(&ctx->tsgl_list)) { sgl = list_first_entry(&ctx->tsgl_list, struct af_alg_tsgl, list); sg = sgl->sg; for (i = 0; i < sgl->cur; i++) { size_t plen = min_t(size_t, used, sg[i].length); struct page *page = sg_page(sg + i); if (!page) continue; /* * Assumption: caller created af_alg_count_tsgl(len) * SG entries in dst. */ if (dst) { if (dst_offset >= plen) { /* discard page before offset */ dst_offset -= plen; } else { /* reassign page to dst after offset */ get_page(page); sg_set_page(dst + j, page, plen - dst_offset, sg[i].offset + dst_offset); dst_offset = 0; j++; } } sg[i].length -= plen; sg[i].offset += plen; used -= plen; ctx->used -= plen; if (sg[i].length) return; put_page(page); sg_assign_page(sg + i, NULL); } list_del(&sgl->list); sock_kfree_s(sk, sgl, struct_size(sgl, sg, MAX_SGL_ENTS + 1)); } if (!ctx->used) ctx->merge = 0; ctx->init = ctx->more; } EXPORT_SYMBOL_GPL(af_alg_pull_tsgl); /** * af_alg_free_areq_sgls - Release TX and RX SGLs of the request * * @areq: Request holding the TX and RX SGL */ static void af_alg_free_areq_sgls(struct af_alg_async_req *areq) { struct sock *sk = areq->sk; struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; struct af_alg_rsgl *rsgl, *tmp; struct scatterlist *tsgl; struct scatterlist *sg; unsigned int i; list_for_each_entry_safe(rsgl, tmp, &areq->rsgl_list, list) { atomic_sub(rsgl->sg_num_bytes, &ctx->rcvused); af_alg_free_sg(&rsgl->sgl); list_del(&rsgl->list); if (rsgl != &areq->first_rsgl) sock_kfree_s(sk, rsgl, sizeof(*rsgl)); } tsgl = areq->tsgl; if (tsgl) { for_each_sg(tsgl, sg, areq->tsgl_entries, i) { if (!sg_page(sg)) continue; put_page(sg_page(sg)); } sock_kfree_s(sk, tsgl, areq->tsgl_entries * sizeof(*tsgl)); } } /** * af_alg_wait_for_wmem - wait for availability of writable memory * * @sk: socket of connection to user space * @flags: If MSG_DONTWAIT is set, then only report if function would sleep * Return: 0 when writable memory is available, < 0 upon error */ static int af_alg_wait_for_wmem(struct sock *sk, unsigned int flags) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int err = -ERESTARTSYS; long timeout; if (flags & MSG_DONTWAIT) return -EAGAIN; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); add_wait_queue(sk_sleep(sk), &wait); for (;;) { if (signal_pending(current)) break; timeout = MAX_SCHEDULE_TIMEOUT; if (sk_wait_event(sk, &timeout, af_alg_writable(sk), &wait)) { err = 0; break; } } remove_wait_queue(sk_sleep(sk), &wait); return err; } /** * af_alg_wmem_wakeup - wakeup caller when writable memory is available * * @sk: socket of connection to user space */ void af_alg_wmem_wakeup(struct sock *sk) { struct socket_wq *wq; if (!af_alg_writable(sk)) return; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup); /** * af_alg_wait_for_data - wait for availability of TX data * * @sk: socket of connection to user space * @flags: If MSG_DONTWAIT is set, then only report if function would sleep * @min: Set to minimum request size if partial requests are allowed. * Return: 0 when writable memory is available, < 0 upon error */ int af_alg_wait_for_data(struct sock *sk, unsigned flags, unsigned min) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; long timeout; int err = -ERESTARTSYS; if (flags & MSG_DONTWAIT) return -EAGAIN; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); add_wait_queue(sk_sleep(sk), &wait); for (;;) { if (signal_pending(current)) break; timeout = MAX_SCHEDULE_TIMEOUT; if (sk_wait_event(sk, &timeout, ctx->init && (!ctx->more || (min && ctx->used >= min)), &wait)) { err = 0; break; } } remove_wait_queue(sk_sleep(sk), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); return err; } EXPORT_SYMBOL_GPL(af_alg_wait_for_data); /** * af_alg_data_wakeup - wakeup caller when new data can be sent to kernel * * @sk: socket of connection to user space */ static void af_alg_data_wakeup(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; struct socket_wq *wq; if (!ctx->used) return; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } /** * af_alg_sendmsg - implementation of sendmsg system call handler * * The sendmsg system call handler obtains the user data and stores it * in ctx->tsgl_list. This implies allocation of the required numbers of * struct af_alg_tsgl. * * In addition, the ctx is filled with the information sent via CMSG. * * @sock: socket of connection to user space * @msg: message from user space * @size: size of message from user space * @ivsize: the size of the IV for the cipher operation to verify that the * user-space-provided IV has the right size * Return: the number of copied data upon success, < 0 upon error */ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, unsigned int ivsize) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; struct af_alg_tsgl *sgl; struct af_alg_control con = {}; long copied = 0; bool enc = false; bool init = false; int err = 0; if (msg->msg_controllen) { err = af_alg_cmsg_send(msg, &con); if (err) return err; init = true; switch (con.op) { case ALG_OP_ENCRYPT: enc = true; break; case ALG_OP_DECRYPT: enc = false; break; default: return -EINVAL; } if (con.iv && con.iv->ivlen != ivsize) return -EINVAL; } lock_sock(sk); if (ctx->init && !ctx->more) { if (ctx->used) { err = -EINVAL; goto unlock; } pr_info_once( "%s sent an empty control message without MSG_MORE.\n", current->comm); } ctx->init = true; if (init) { ctx->enc = enc; if (con.iv) memcpy(ctx->iv, con.iv->iv, ivsize); ctx->aead_assoclen = con.aead_assoclen; } while (size) { struct scatterlist *sg; size_t len = size; ssize_t plen; /* use the existing memory in an allocated page */ if (ctx->merge && !(msg->msg_flags & MSG_SPLICE_PAGES)) { sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl, list); sg = sgl->sg + sgl->cur - 1; len = min_t(size_t, len, PAGE_SIZE - sg->offset - sg->length); err = memcpy_from_msg(page_address(sg_page(sg)) + sg->offset + sg->length, msg, len); if (err) goto unlock; sg->length += len; ctx->merge = (sg->offset + sg->length) & (PAGE_SIZE - 1); ctx->used += len; copied += len; size -= len; continue; } if (!af_alg_writable(sk)) { err = af_alg_wait_for_wmem(sk, msg->msg_flags); if (err) goto unlock; } /* allocate a new page */ len = min_t(unsigned long, len, af_alg_sndbuf(sk)); err = af_alg_alloc_tsgl(sk); if (err) goto unlock; sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl, list); sg = sgl->sg; if (sgl->cur) sg_unmark_end(sg + sgl->cur - 1); if (msg->msg_flags & MSG_SPLICE_PAGES) { struct sg_table sgtable = { .sgl = sg, .nents = sgl->cur, .orig_nents = sgl->cur, }; plen = extract_iter_to_sg(&msg->msg_iter, len, &sgtable, MAX_SGL_ENTS - sgl->cur, 0); if (plen < 0) { err = plen; goto unlock; } for (; sgl->cur < sgtable.nents; sgl->cur++) get_page(sg_page(&sg[sgl->cur])); len -= plen; ctx->used += plen; copied += plen; size -= plen; ctx->merge = 0; } else { do { struct page *pg; unsigned int i = sgl->cur; plen = min_t(size_t, len, PAGE_SIZE); pg = alloc_page(GFP_KERNEL); if (!pg) { err = -ENOMEM; goto unlock; } sg_assign_page(sg + i, pg); err = memcpy_from_msg( page_address(sg_page(sg + i)), msg, plen); if (err) { __free_page(sg_page(sg + i)); sg_assign_page(sg + i, NULL); goto unlock; } sg[i].length = plen; len -= plen; ctx->used += plen; copied += plen; size -= plen; sgl->cur++; } while (len && sgl->cur < MAX_SGL_ENTS); ctx->merge = plen & (PAGE_SIZE - 1); } if (!size) sg_mark_end(sg + sgl->cur - 1); } err = 0; ctx->more = msg->msg_flags & MSG_MORE; unlock: af_alg_data_wakeup(sk); release_sock(sk); return copied ?: err; } EXPORT_SYMBOL_GPL(af_alg_sendmsg); /** * af_alg_free_resources - release resources required for crypto request * @areq: Request holding the TX and RX SGL */ void af_alg_free_resources(struct af_alg_async_req *areq) { struct sock *sk = areq->sk; struct af_alg_ctx *ctx; af_alg_free_areq_sgls(areq); sock_kfree_s(sk, areq, areq->areqlen); ctx = alg_sk(sk)->private; ctx->inflight = false; } EXPORT_SYMBOL_GPL(af_alg_free_resources); /** * af_alg_async_cb - AIO callback handler * @data: async request completion data * @err: if non-zero, error result to be returned via ki_complete(); * otherwise return the AIO output length via ki_complete(). * * This handler cleans up the struct af_alg_async_req upon completion of the * AIO operation. * * The number of bytes to be generated with the AIO operation must be set * in areq->outlen before the AIO callback handler is invoked. */ void af_alg_async_cb(void *data, int err) { struct af_alg_async_req *areq = data; struct sock *sk = areq->sk; struct kiocb *iocb = areq->iocb; unsigned int resultlen; /* Buffer size written by crypto operation. */ resultlen = areq->outlen; af_alg_free_resources(areq); sock_put(sk); iocb->ki_complete(iocb, err ? err : (int)resultlen); } EXPORT_SYMBOL_GPL(af_alg_async_cb); /** * af_alg_poll - poll system call handler * @file: file pointer * @sock: socket to poll * @wait: poll_table */ __poll_t af_alg_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; __poll_t mask; sock_poll_wait(file, sock, wait); mask = 0; if (!ctx->more || ctx->used) mask |= EPOLLIN | EPOLLRDNORM; if (af_alg_writable(sk)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; } EXPORT_SYMBOL_GPL(af_alg_poll); /** * af_alg_alloc_areq - allocate struct af_alg_async_req * * @sk: socket of connection to user space * @areqlen: size of struct af_alg_async_req + crypto_*_reqsize * Return: allocated data structure or ERR_PTR upon error */ struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk, unsigned int areqlen) { struct af_alg_ctx *ctx = alg_sk(sk)->private; struct af_alg_async_req *areq; /* Only one AIO request can be in flight. */ if (ctx->inflight) return ERR_PTR(-EBUSY); areq = sock_kmalloc(sk, areqlen, GFP_KERNEL); if (unlikely(!areq)) return ERR_PTR(-ENOMEM); ctx->inflight = true; areq->areqlen = areqlen; areq->sk = sk; areq->first_rsgl.sgl.sgt.sgl = areq->first_rsgl.sgl.sgl; areq->last_rsgl = NULL; INIT_LIST_HEAD(&areq->rsgl_list); areq->tsgl = NULL; areq->tsgl_entries = 0; return areq; } EXPORT_SYMBOL_GPL(af_alg_alloc_areq); /** * af_alg_get_rsgl - create the RX SGL for the output data from the crypto * operation * * @sk: socket of connection to user space * @msg: user space message * @flags: flags used to invoke recvmsg with * @areq: instance of the cryptographic request that will hold the RX SGL * @maxsize: maximum number of bytes to be pulled from user space * @outlen: number of bytes in the RX SGL * Return: 0 on success, < 0 upon error */ int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags, struct af_alg_async_req *areq, size_t maxsize, size_t *outlen) { struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; size_t len = 0; while (maxsize > len && msg_data_left(msg)) { struct af_alg_rsgl *rsgl; ssize_t err; size_t seglen; /* limit the amount of readable buffers */ if (!af_alg_readable(sk)) break; seglen = min_t(size_t, (maxsize - len), msg_data_left(msg)); if (list_empty(&areq->rsgl_list)) { rsgl = &areq->first_rsgl; } else { rsgl = sock_kmalloc(sk, sizeof(*rsgl), GFP_KERNEL); if (unlikely(!rsgl)) return -ENOMEM; } rsgl->sgl.need_unpin = iov_iter_extract_will_pin(&msg->msg_iter); rsgl->sgl.sgt.sgl = rsgl->sgl.sgl; rsgl->sgl.sgt.nents = 0; rsgl->sgl.sgt.orig_nents = 0; list_add_tail(&rsgl->list, &areq->rsgl_list); sg_init_table(rsgl->sgl.sgt.sgl, ALG_MAX_PAGES); err = extract_iter_to_sg(&msg->msg_iter, seglen, &rsgl->sgl.sgt, ALG_MAX_PAGES, 0); if (err < 0) { rsgl->sg_num_bytes = 0; return err; } sg_mark_end(rsgl->sgl.sgt.sgl + rsgl->sgl.sgt.nents - 1); /* chain the new scatterlist with previous one */ if (areq->last_rsgl) af_alg_link_sg(&areq->last_rsgl->sgl, &rsgl->sgl); areq->last_rsgl = rsgl; len += err; atomic_add(err, &ctx->rcvused); rsgl->sg_num_bytes = err; } *outlen = len; return 0; } EXPORT_SYMBOL_GPL(af_alg_get_rsgl); static int __init af_alg_init(void) { int err = proto_register(&alg_proto, 0); if (err) goto out; err = sock_register(&alg_family); if (err != 0) goto out_unregister_proto; out: return err; out_unregister_proto: proto_unregister(&alg_proto); goto out; } static void __exit af_alg_exit(void) { sock_unregister(PF_ALG); proto_unregister(&alg_proto); } module_init(af_alg_init); module_exit(af_alg_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(AF_ALG); |
5 6 6 6 7 6 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 | /* * net/tipc/diag.c: TIPC socket diag * * Copyright (c) 2018, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "socket.h" #include <linux/sock_diag.h> #include <linux/tipc_sockets_diag.h> static u64 __tipc_diag_gen_cookie(struct sock *sk) { u32 res[2]; sock_diag_save_cookie(sk, res); return *((u64 *)res); } static int __tipc_add_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk) { struct tipc_sock_diag_req *req = nlmsg_data(cb->nlh); struct nlmsghdr *nlh; int err; nlh = nlmsg_put_answer(skb, cb, SOCK_DIAG_BY_FAMILY, 0, NLM_F_MULTI); if (!nlh) return -EMSGSIZE; err = tipc_sk_fill_sock_diag(skb, cb, tsk, req->tidiag_states, __tipc_diag_gen_cookie); if (err) return err; nlmsg_end(skb, nlh); return 0; } static int tipc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { return tipc_nl_sk_walk(skb, cb, __tipc_add_sock_diag); } static int tipc_sock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct tipc_sock_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = tipc_dump_start, .dump = tipc_diag_dump, .done = tipc_dump_done, }; netlink_dump_start(net->diag_nlsk, skb, h, &c); return 0; } return -EOPNOTSUPP; } static const struct sock_diag_handler tipc_sock_diag_handler = { .family = AF_TIPC, .dump = tipc_sock_diag_handler_dump, }; static int __init tipc_diag_init(void) { return sock_diag_register(&tipc_sock_diag_handler); } static void __exit tipc_diag_exit(void) { sock_diag_unregister(&tipc_sock_diag_handler); } module_init(tipc_diag_init); module_exit(tipc_diag_exit); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("TIPC socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_TIPC); |
2922 115 2816 2922 2914 2911 2911 2911 2910 2911 2921 2922 2920 2922 2923 2922 2921 2921 2921 2921 2911 2911 2911 2910 2922 2921 2922 2922 2921 2922 2921 5 5 5 5 5 21 2923 2923 2921 2921 2911 2910 2911 2910 2911 2906 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1999 Eric Youngdale * Copyright (C) 2014 Christoph Hellwig * * SCSI queueing library. * Initial versions: Eric Youngdale (eric@andante.org). * Based upon conversations with large numbers * of people at Linux Expo. */ #include <linux/bio.h> #include <linux/bitops.h> #include <linux/blkdev.h> #include <linux/completion.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/init.h> #include <linux/pci.h> #include <linux/delay.h> #include <linux/hardirq.h> #include <linux/scatterlist.h> #include <linux/blk-mq.h> #include <linux/blk-integrity.h> #include <linux/ratelimit.h> #include <asm/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_transport.h> /* __scsi_init_queue() */ #include <scsi/scsi_dh.h> #include <trace/events/scsi.h> #include "scsi_debugfs.h" #include "scsi_priv.h" #include "scsi_logging.h" /* * Size of integrity metadata is usually small, 1 inline sg should * cover normal cases. */ #ifdef CONFIG_ARCH_NO_SG_CHAIN #define SCSI_INLINE_PROT_SG_CNT 0 #define SCSI_INLINE_SG_CNT 0 #else #define SCSI_INLINE_PROT_SG_CNT 1 #define SCSI_INLINE_SG_CNT 2 #endif static struct kmem_cache *scsi_sense_cache; static DEFINE_MUTEX(scsi_sense_cache_mutex); static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd); int scsi_init_sense_cache(struct Scsi_Host *shost) { int ret = 0; mutex_lock(&scsi_sense_cache_mutex); if (!scsi_sense_cache) { scsi_sense_cache = kmem_cache_create_usercopy("scsi_sense_cache", SCSI_SENSE_BUFFERSIZE, 0, SLAB_HWCACHE_ALIGN, 0, SCSI_SENSE_BUFFERSIZE, NULL); if (!scsi_sense_cache) ret = -ENOMEM; } mutex_unlock(&scsi_sense_cache_mutex); return ret; } static void scsi_set_blocked(struct scsi_cmnd *cmd, int reason) { struct Scsi_Host *host = cmd->device->host; struct scsi_device *device = cmd->device; struct scsi_target *starget = scsi_target(device); /* * Set the appropriate busy bit for the device/host. * * If the host/device isn't busy, assume that something actually * completed, and that we should be able to queue a command now. * * Note that the prior mid-layer assumption that any host could * always queue at least one command is now broken. The mid-layer * will implement a user specifiable stall (see * scsi_host.max_host_blocked and scsi_device.max_device_blocked) * if a command is requeued with no other commands outstanding * either for the device or for the host. */ switch (reason) { case SCSI_MLQUEUE_HOST_BUSY: atomic_set(&host->host_blocked, host->max_host_blocked); break; case SCSI_MLQUEUE_DEVICE_BUSY: case SCSI_MLQUEUE_EH_RETRY: atomic_set(&device->device_blocked, device->max_device_blocked); break; case SCSI_MLQUEUE_TARGET_BUSY: atomic_set(&starget->target_blocked, starget->max_target_blocked); break; } } static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd, unsigned long msecs) { struct request *rq = scsi_cmd_to_rq(cmd); if (rq->rq_flags & RQF_DONTPREP) { rq->rq_flags &= ~RQF_DONTPREP; scsi_mq_uninit_cmd(cmd); } else { WARN_ON_ONCE(true); } blk_mq_requeue_request(rq, false); if (!scsi_host_in_recovery(cmd->device->host)) blk_mq_delay_kick_requeue_list(rq->q, msecs); } /** * __scsi_queue_insert - private queue insertion * @cmd: The SCSI command being requeued * @reason: The reason for the requeue * @unbusy: Whether the queue should be unbusied * * This is a private queue insertion. The public interface * scsi_queue_insert() always assumes the queue should be unbusied * because it's always called before the completion. This function is * for a requeue after completion, which should only occur in this * file. */ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy) { struct scsi_device *device = cmd->device; SCSI_LOG_MLQUEUE(1, scmd_printk(KERN_INFO, cmd, "Inserting command %p into mlqueue\n", cmd)); scsi_set_blocked(cmd, reason); /* * Decrement the counters, since these commands are no longer * active on the host/device. */ if (unbusy) scsi_device_unbusy(device, cmd); /* * Requeue this command. It will go before all other commands * that are already in the queue. Schedule requeue work under * lock such that the kblockd_schedule_work() call happens * before blk_mq_destroy_queue() finishes. */ cmd->result = 0; blk_mq_requeue_request(scsi_cmd_to_rq(cmd), !scsi_host_in_recovery(cmd->device->host)); } /** * scsi_queue_insert - Reinsert a command in the queue. * @cmd: command that we are adding to queue. * @reason: why we are inserting command to queue. * * We do this for one of two cases. Either the host is busy and it cannot accept * any more commands for the time being, or the device returned QUEUE_FULL and * can accept no more commands. * * Context: This could be called either from an interrupt context or a normal * process context. */ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason) { __scsi_queue_insert(cmd, reason, true); } /** * scsi_execute_cmd - insert request and wait for the result * @sdev: scsi_device * @cmd: scsi command * @opf: block layer request cmd_flags * @buffer: data buffer * @bufflen: len of buffer * @timeout: request timeout in HZ * @retries: number of times to retry request * @args: Optional args. See struct definition for field descriptions * * Returns the scsi_cmnd result field if a command was executed, or a negative * Linux error code if we didn't get that far. */ int scsi_execute_cmd(struct scsi_device *sdev, const unsigned char *cmd, blk_opf_t opf, void *buffer, unsigned int bufflen, int timeout, int retries, const struct scsi_exec_args *args) { static const struct scsi_exec_args default_args; struct request *req; struct scsi_cmnd *scmd; int ret; if (!args) args = &default_args; else if (WARN_ON_ONCE(args->sense && args->sense_len != SCSI_SENSE_BUFFERSIZE)) return -EINVAL; req = scsi_alloc_request(sdev->request_queue, opf, args->req_flags); if (IS_ERR(req)) return PTR_ERR(req); if (bufflen) { ret = blk_rq_map_kern(sdev->request_queue, req, buffer, bufflen, GFP_NOIO); if (ret) goto out; } scmd = blk_mq_rq_to_pdu(req); scmd->cmd_len = COMMAND_SIZE(cmd[0]); memcpy(scmd->cmnd, cmd, scmd->cmd_len); scmd->allowed = retries; scmd->flags |= args->scmd_flags; req->timeout = timeout; req->rq_flags |= RQF_QUIET; /* * head injection *required* here otherwise quiesce won't work */ blk_execute_rq(req, true); /* * Some devices (USB mass-storage in particular) may transfer * garbage data together with a residue indicating that the data * is invalid. Prevent the garbage from being misinterpreted * and prevent security leaks by zeroing out the excess data. */ if (unlikely(scmd->resid_len > 0 && scmd->resid_len <= bufflen)) memset(buffer + bufflen - scmd->resid_len, 0, scmd->resid_len); if (args->resid) *args->resid = scmd->resid_len; if (args->sense) memcpy(args->sense, scmd->sense_buffer, SCSI_SENSE_BUFFERSIZE); if (args->sshdr) scsi_normalize_sense(scmd->sense_buffer, scmd->sense_len, args->sshdr); ret = scmd->result; out: blk_mq_free_request(req); return ret; } EXPORT_SYMBOL(scsi_execute_cmd); /* * Wake up the error handler if necessary. Avoid as follows that the error * handler is not woken up if host in-flight requests number == * shost->host_failed: use call_rcu() in scsi_eh_scmd_add() in combination * with an RCU read lock in this function to ensure that this function in * its entirety either finishes before scsi_eh_scmd_add() increases the * host_failed counter or that it notices the shost state change made by * scsi_eh_scmd_add(). */ static void scsi_dec_host_busy(struct Scsi_Host *shost, struct scsi_cmnd *cmd) { unsigned long flags; rcu_read_lock(); __clear_bit(SCMD_STATE_INFLIGHT, &cmd->state); if (unlikely(scsi_host_in_recovery(shost))) { spin_lock_irqsave(shost->host_lock, flags); if (shost->host_failed || shost->host_eh_scheduled) scsi_eh_wakeup(shost); spin_unlock_irqrestore(shost->host_lock, flags); } rcu_read_unlock(); } void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd) { struct Scsi_Host *shost = sdev->host; struct scsi_target *starget = scsi_target(sdev); scsi_dec_host_busy(shost, cmd); if (starget->can_queue > 0) atomic_dec(&starget->target_busy); sbitmap_put(&sdev->budget_map, cmd->budget_token); cmd->budget_token = -1; } /* * Kick the queue of SCSI device @sdev if @sdev != current_sdev. Called with * interrupts disabled. */ static void scsi_kick_sdev_queue(struct scsi_device *sdev, void *data) { struct scsi_device *current_sdev = data; if (sdev != current_sdev) blk_mq_run_hw_queues(sdev->request_queue, true); } /* * Called for single_lun devices on IO completion. Clear starget_sdev_user, * and call blk_run_queue for all the scsi_devices on the target - * including current_sdev first. * * Called with *no* scsi locks held. */ static void scsi_single_lun_run(struct scsi_device *current_sdev) { struct Scsi_Host *shost = current_sdev->host; struct scsi_target *starget = scsi_target(current_sdev); unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); starget->starget_sdev_user = NULL; spin_unlock_irqrestore(shost->host_lock, flags); /* * Call blk_run_queue for all LUNs on the target, starting with * current_sdev. We race with others (to set starget_sdev_user), * but in most cases, we will be first. Ideally, each LU on the * target would get some limited time or requests on the target. */ blk_mq_run_hw_queues(current_sdev->request_queue, shost->queuecommand_may_block); spin_lock_irqsave(shost->host_lock, flags); if (!starget->starget_sdev_user) __starget_for_each_device(starget, current_sdev, scsi_kick_sdev_queue); spin_unlock_irqrestore(shost->host_lock, flags); } static inline bool scsi_device_is_busy(struct scsi_device *sdev) { if (scsi_device_busy(sdev) >= sdev->queue_depth) return true; if (atomic_read(&sdev->device_blocked) > 0) return true; return false; } static inline bool scsi_target_is_busy(struct scsi_target *starget) { if (starget->can_queue > 0) { if (atomic_read(&starget->target_busy) >= starget->can_queue) return true; if (atomic_read(&starget->target_blocked) > 0) return true; } return false; } static inline bool scsi_host_is_busy(struct Scsi_Host *shost) { if (atomic_read(&shost->host_blocked) > 0) return true; if (shost->host_self_blocked) return true; return false; } static void scsi_starved_list_run(struct Scsi_Host *shost) { LIST_HEAD(starved_list); struct scsi_device *sdev; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); list_splice_init(&shost->starved_list, &starved_list); while (!list_empty(&starved_list)) { struct request_queue *slq; /* * As long as shost is accepting commands and we have * starved queues, call blk_run_queue. scsi_request_fn * drops the queue_lock and can add us back to the * starved_list. * * host_lock protects the starved_list and starved_entry. * scsi_request_fn must get the host_lock before checking * or modifying starved_list or starved_entry. */ if (scsi_host_is_busy(shost)) break; sdev = list_entry(starved_list.next, struct scsi_device, starved_entry); list_del_init(&sdev->starved_entry); if (scsi_target_is_busy(scsi_target(sdev))) { list_move_tail(&sdev->starved_entry, &shost->starved_list); continue; } /* * Once we drop the host lock, a racing scsi_remove_device() * call may remove the sdev from the starved list and destroy * it and the queue. Mitigate by taking a reference to the * queue and never touching the sdev again after we drop the * host lock. Note: if __scsi_remove_device() invokes * blk_mq_destroy_queue() before the queue is run from this * function then blk_run_queue() will return immediately since * blk_mq_destroy_queue() marks the queue with QUEUE_FLAG_DYING. */ slq = sdev->request_queue; if (!blk_get_queue(slq)) continue; spin_unlock_irqrestore(shost->host_lock, flags); blk_mq_run_hw_queues(slq, false); blk_put_queue(slq); spin_lock_irqsave(shost->host_lock, flags); } /* put any unprocessed entries back */ list_splice(&starved_list, &shost->starved_list); spin_unlock_irqrestore(shost->host_lock, flags); } /** * scsi_run_queue - Select a proper request queue to serve next. * @q: last request's queue * * The previous command was completely finished, start a new one if possible. */ static void scsi_run_queue(struct request_queue *q) { struct scsi_device *sdev = q->queuedata; if (scsi_target(sdev)->single_lun) scsi_single_lun_run(sdev); if (!list_empty(&sdev->host->starved_list)) scsi_starved_list_run(sdev->host); /* Note: blk_mq_kick_requeue_list() runs the queue asynchronously. */ blk_mq_kick_requeue_list(q); } void scsi_requeue_run_queue(struct work_struct *work) { struct scsi_device *sdev; struct request_queue *q; sdev = container_of(work, struct scsi_device, requeue_work); q = sdev->request_queue; scsi_run_queue(q); } void scsi_run_host_queues(struct Scsi_Host *shost) { struct scsi_device *sdev; shost_for_each_device(sdev, shost) scsi_run_queue(sdev->request_queue); } static void scsi_uninit_cmd(struct scsi_cmnd *cmd) { if (!blk_rq_is_passthrough(scsi_cmd_to_rq(cmd))) { struct scsi_driver *drv = scsi_cmd_to_driver(cmd); if (drv->uninit_command) drv->uninit_command(cmd); } } void scsi_free_sgtables(struct scsi_cmnd *cmd) { if (cmd->sdb.table.nents) sg_free_table_chained(&cmd->sdb.table, SCSI_INLINE_SG_CNT); if (scsi_prot_sg_count(cmd)) sg_free_table_chained(&cmd->prot_sdb->table, SCSI_INLINE_PROT_SG_CNT); } EXPORT_SYMBOL_GPL(scsi_free_sgtables); static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd) { scsi_free_sgtables(cmd); scsi_uninit_cmd(cmd); } static void scsi_run_queue_async(struct scsi_device *sdev) { if (scsi_host_in_recovery(sdev->host)) return; if (scsi_target(sdev)->single_lun || !list_empty(&sdev->host->starved_list)) { kblockd_schedule_work(&sdev->requeue_work); } else { /* * smp_mb() present in sbitmap_queue_clear() or implied in * .end_io is for ordering writing .device_busy in * scsi_device_unbusy() and reading sdev->restarts. */ int old = atomic_read(&sdev->restarts); /* * ->restarts has to be kept as non-zero if new budget * contention occurs. * * No need to run queue when either another re-run * queue wins in updating ->restarts or a new budget * contention occurs. */ if (old && atomic_cmpxchg(&sdev->restarts, old, 0) == old) blk_mq_run_hw_queues(sdev->request_queue, true); } } /* Returns false when no more bytes to process, true if there are more */ static bool scsi_end_request(struct request *req, blk_status_t error, unsigned int bytes) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); struct scsi_device *sdev = cmd->device; struct request_queue *q = sdev->request_queue; if (blk_update_request(req, error, bytes)) return true; // XXX: if (blk_queue_add_random(q)) add_disk_randomness(req->q->disk); if (!blk_rq_is_passthrough(req)) { WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED)); cmd->flags &= ~SCMD_INITIALIZED; } /* * Calling rcu_barrier() is not necessary here because the * SCSI error handler guarantees that the function called by * call_rcu() has been called before scsi_end_request() is * called. */ destroy_rcu_head(&cmd->rcu); /* * In the MQ case the command gets freed by __blk_mq_end_request, * so we have to do all cleanup that depends on it earlier. * * We also can't kick the queues from irq context, so we * will have to defer it to a workqueue. */ scsi_mq_uninit_cmd(cmd); /* * queue is still alive, so grab the ref for preventing it * from being cleaned up during running queue. */ percpu_ref_get(&q->q_usage_counter); __blk_mq_end_request(req, error); scsi_run_queue_async(sdev); percpu_ref_put(&q->q_usage_counter); return false; } /** * scsi_result_to_blk_status - translate a SCSI result code into blk_status_t * @result: scsi error code * * Translate a SCSI result code into a blk_status_t value. */ static blk_status_t scsi_result_to_blk_status(int result) { /* * Check the scsi-ml byte first in case we converted a host or status * byte. */ switch (scsi_ml_byte(result)) { case SCSIML_STAT_OK: break; case SCSIML_STAT_RESV_CONFLICT: return BLK_STS_RESV_CONFLICT; case SCSIML_STAT_NOSPC: return BLK_STS_NOSPC; case SCSIML_STAT_MED_ERROR: return BLK_STS_MEDIUM; case SCSIML_STAT_TGT_FAILURE: return BLK_STS_TARGET; case SCSIML_STAT_DL_TIMEOUT: return BLK_STS_DURATION_LIMIT; } switch (host_byte(result)) { case DID_OK: if (scsi_status_is_good(result)) return BLK_STS_OK; return BLK_STS_IOERR; case DID_TRANSPORT_FAILFAST: case DID_TRANSPORT_MARGINAL: return BLK_STS_TRANSPORT; default: return BLK_STS_IOERR; } } /** * scsi_rq_err_bytes - determine number of bytes till the next failure boundary * @rq: request to examine * * Description: * A request could be merge of IOs which require different failure * handling. This function determines the number of bytes which * can be failed from the beginning of the request without * crossing into area which need to be retried further. * * Return: * The number of bytes to fail. */ static unsigned int scsi_rq_err_bytes(const struct request *rq) { blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK; unsigned int bytes = 0; struct bio *bio; if (!(rq->rq_flags & RQF_MIXED_MERGE)) return blk_rq_bytes(rq); /* * Currently the only 'mixing' which can happen is between * different fastfail types. We can safely fail portions * which have all the failfast bits that the first one has - * the ones which are at least as eager to fail as the first * one. */ for (bio = rq->bio; bio; bio = bio->bi_next) { if ((bio->bi_opf & ff) != ff) break; bytes += bio->bi_iter.bi_size; } /* this could lead to infinite loop */ BUG_ON(blk_rq_bytes(rq) && !bytes); return bytes; } static bool scsi_cmd_runtime_exceeced(struct scsi_cmnd *cmd) { struct request *req = scsi_cmd_to_rq(cmd); unsigned long wait_for; if (cmd->allowed == SCSI_CMD_RETRIES_NO_LIMIT) return false; wait_for = (cmd->allowed + 1) * req->timeout; if (time_before(cmd->jiffies_at_alloc + wait_for, jiffies)) { scmd_printk(KERN_ERR, cmd, "timing out command, waited %lus\n", wait_for/HZ); return true; } return false; } /* * When ALUA transition state is returned, reprep the cmd to * use the ALUA handler's transition timeout. Delay the reprep * 1 sec to avoid aggressive retries of the target in that * state. */ #define ALUA_TRANSITION_REPREP_DELAY 1000 /* Helper for scsi_io_completion() when special action required. */ static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result) { struct request *req = scsi_cmd_to_rq(cmd); int level = 0; enum {ACTION_FAIL, ACTION_REPREP, ACTION_DELAYED_REPREP, ACTION_RETRY, ACTION_DELAYED_RETRY} action; struct scsi_sense_hdr sshdr; bool sense_valid; bool sense_current = true; /* false implies "deferred sense" */ blk_status_t blk_stat; sense_valid = scsi_command_normalize_sense(cmd, &sshdr); if (sense_valid) sense_current = !scsi_sense_is_deferred(&sshdr); blk_stat = scsi_result_to_blk_status(result); if (host_byte(result) == DID_RESET) { /* Third party bus reset or reset for error recovery * reasons. Just retry the command and see what * happens. */ action = ACTION_RETRY; } else if (sense_valid && sense_current) { switch (sshdr.sense_key) { case UNIT_ATTENTION: if (cmd->device->removable) { /* Detected disc change. Set a bit * and quietly refuse further access. */ cmd->device->changed = 1; action = ACTION_FAIL; } else { /* Must have been a power glitch, or a * bus reset. Could not have been a * media change, so we just retry the * command and see what happens. */ action = ACTION_RETRY; } break; case ILLEGAL_REQUEST: /* If we had an ILLEGAL REQUEST returned, then * we may have performed an unsupported * command. The only thing this should be * would be a ten byte read where only a six * byte read was supported. Also, on a system * where READ CAPACITY failed, we may have * read past the end of the disk. */ if ((cmd->device->use_10_for_rw && sshdr.asc == 0x20 && sshdr.ascq == 0x00) && (cmd->cmnd[0] == READ_10 || cmd->cmnd[0] == WRITE_10)) { /* This will issue a new 6-byte command. */ cmd->device->use_10_for_rw = 0; action = ACTION_REPREP; } else if (sshdr.asc == 0x10) /* DIX */ { action = ACTION_FAIL; blk_stat = BLK_STS_PROTECTION; /* INVALID COMMAND OPCODE or INVALID FIELD IN CDB */ } else if (sshdr.asc == 0x20 || sshdr.asc == 0x24) { action = ACTION_FAIL; blk_stat = BLK_STS_TARGET; } else action = ACTION_FAIL; break; case ABORTED_COMMAND: action = ACTION_FAIL; if (sshdr.asc == 0x10) /* DIF */ blk_stat = BLK_STS_PROTECTION; break; case NOT_READY: /* If the device is in the process of becoming * ready, or has a temporary blockage, retry. */ if (sshdr.asc == 0x04) { switch (sshdr.ascq) { case 0x01: /* becoming ready */ case 0x04: /* format in progress */ case 0x05: /* rebuild in progress */ case 0x06: /* recalculation in progress */ case 0x07: /* operation in progress */ case 0x08: /* Long write in progress */ case 0x09: /* self test in progress */ case 0x11: /* notify (enable spinup) required */ case 0x14: /* space allocation in progress */ case 0x1a: /* start stop unit in progress */ case 0x1b: /* sanitize in progress */ case 0x1d: /* configuration in progress */ case 0x24: /* depopulation in progress */ case 0x25: /* depopulation restore in progress */ action = ACTION_DELAYED_RETRY; break; case 0x0a: /* ALUA state transition */ action = ACTION_DELAYED_REPREP; break; default: action = ACTION_FAIL; break; } } else action = ACTION_FAIL; break; case VOLUME_OVERFLOW: /* See SSC3rXX or current. */ action = ACTION_FAIL; break; case DATA_PROTECT: action = ACTION_FAIL; if ((sshdr.asc == 0x0C && sshdr.ascq == 0x12) || (sshdr.asc == 0x55 && (sshdr.ascq == 0x0E || sshdr.ascq == 0x0F))) { /* Insufficient zone resources */ blk_stat = BLK_STS_ZONE_OPEN_RESOURCE; } break; case COMPLETED: fallthrough; default: action = ACTION_FAIL; break; } } else action = ACTION_FAIL; if (action != ACTION_FAIL && scsi_cmd_runtime_exceeced(cmd)) action = ACTION_FAIL; switch (action) { case ACTION_FAIL: /* Give up and fail the remainder of the request */ if (!(req->rq_flags & RQF_QUIET)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); if (unlikely(scsi_logging_level)) level = SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT, SCSI_LOG_MLCOMPLETE_BITS); /* * if logging is enabled the failure will be printed * in scsi_log_completion(), so avoid duplicate messages */ if (!level && __ratelimit(&_rs)) { scsi_print_result(cmd, NULL, FAILED); if (sense_valid) scsi_print_sense(cmd); scsi_print_command(cmd); } } if (!scsi_end_request(req, blk_stat, scsi_rq_err_bytes(req))) return; fallthrough; case ACTION_REPREP: scsi_mq_requeue_cmd(cmd, 0); break; case ACTION_DELAYED_REPREP: scsi_mq_requeue_cmd(cmd, ALUA_TRANSITION_REPREP_DELAY); break; case ACTION_RETRY: /* Retry the same command immediately */ __scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY, false); break; case ACTION_DELAYED_RETRY: /* Retry the same command after a delay */ __scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY, false); break; } } /* * Helper for scsi_io_completion() when cmd->result is non-zero. Returns a * new result that may suppress further error checking. Also modifies * *blk_statp in some cases. */ static int scsi_io_completion_nz_result(struct scsi_cmnd *cmd, int result, blk_status_t *blk_statp) { bool sense_valid; bool sense_current = true; /* false implies "deferred sense" */ struct request *req = scsi_cmd_to_rq(cmd); struct scsi_sense_hdr sshdr; sense_valid = scsi_command_normalize_sense(cmd, &sshdr); if (sense_valid) sense_current = !scsi_sense_is_deferred(&sshdr); if (blk_rq_is_passthrough(req)) { if (sense_valid) { /* * SG_IO wants current and deferred errors */ cmd->sense_len = min(8 + cmd->sense_buffer[7], SCSI_SENSE_BUFFERSIZE); } if (sense_current) *blk_statp = scsi_result_to_blk_status(result); } else if (blk_rq_bytes(req) == 0 && sense_current) { /* * Flush commands do not transfers any data, and thus cannot use * good_bytes != blk_rq_bytes(req) as the signal for an error. * This sets *blk_statp explicitly for the problem case. */ *blk_statp = scsi_result_to_blk_status(result); } /* * Recovered errors need reporting, but they're always treated as * success, so fiddle the result code here. For passthrough requests * we already took a copy of the original into sreq->result which * is what gets returned to the user */ if (sense_valid && (sshdr.sense_key == RECOVERED_ERROR)) { bool do_print = true; /* * if ATA PASS-THROUGH INFORMATION AVAILABLE [0x0, 0x1d] * skip print since caller wants ATA registers. Only occurs * on SCSI ATA PASS_THROUGH commands when CK_COND=1 */ if ((sshdr.asc == 0x0) && (sshdr.ascq == 0x1d)) do_print = false; else if (req->rq_flags & RQF_QUIET) do_print = false; if (do_print) scsi_print_sense(cmd); result = 0; /* for passthrough, *blk_statp may be set */ *blk_statp = BLK_STS_OK; } /* * Another corner case: the SCSI status byte is non-zero but 'good'. * Example: PRE-FETCH command returns SAM_STAT_CONDITION_MET when * it is able to fit nominated LBs in its cache (and SAM_STAT_GOOD * if it can't fit). Treat SAM_STAT_CONDITION_MET and the related * intermediate statuses (both obsolete in SAM-4) as good. */ if ((result & 0xff) && scsi_status_is_good(result)) { result = 0; *blk_statp = BLK_STS_OK; } return result; } /** * scsi_io_completion - Completion processing for SCSI commands. * @cmd: command that is finished. * @good_bytes: number of processed bytes. * * We will finish off the specified number of sectors. If we are done, the * command block will be released and the queue function will be goosed. If we * are not done then we have to figure out what to do next: * * a) We can call scsi_mq_requeue_cmd(). The request will be * unprepared and put back on the queue. Then a new command will * be created for it. This should be used if we made forward * progress, or if we want to switch from READ(10) to READ(6) for * example. * * b) We can call scsi_io_completion_action(). The request will be * put back on the queue and retried using the same command as * before, possibly after a delay. * * c) We can call scsi_end_request() with blk_stat other than * BLK_STS_OK, to fail the remainder of the request. */ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) { int result = cmd->result; struct request *req = scsi_cmd_to_rq(cmd); blk_status_t blk_stat = BLK_STS_OK; if (unlikely(result)) /* a nz result may or may not be an error */ result = scsi_io_completion_nz_result(cmd, result, &blk_stat); /* * Next deal with any sectors which we were able to correctly * handle. */ SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, cmd, "%u sectors total, %d bytes done.\n", blk_rq_sectors(req), good_bytes)); /* * Failed, zero length commands always need to drop down * to retry code. Fast path should return in this block. */ if (likely(blk_rq_bytes(req) > 0 || blk_stat == BLK_STS_OK)) { if (likely(!scsi_end_request(req, blk_stat, good_bytes))) return; /* no bytes remaining */ } /* Kill remainder if no retries. */ if (unlikely(blk_stat && scsi_noretry_cmd(cmd))) { if (scsi_end_request(req, blk_stat, blk_rq_bytes(req))) WARN_ONCE(true, "Bytes remaining after failed, no-retry command"); return; } /* * If there had been no error, but we have leftover bytes in the * request just queue the command up again. */ if (likely(result == 0)) scsi_mq_requeue_cmd(cmd, 0); else scsi_io_completion_action(cmd, result); } static inline bool scsi_cmd_needs_dma_drain(struct scsi_device *sdev, struct request *rq) { return sdev->dma_drain_len && blk_rq_is_passthrough(rq) && !op_is_write(req_op(rq)) && sdev->host->hostt->dma_need_drain(rq); } /** * scsi_alloc_sgtables - Allocate and initialize data and integrity scatterlists * @cmd: SCSI command data structure to initialize. * * Initializes @cmd->sdb and also @cmd->prot_sdb if data integrity is enabled * for @cmd. * * Returns: * * BLK_STS_OK - on success * * BLK_STS_RESOURCE - if the failure is retryable * * BLK_STS_IOERR - if the failure is fatal */ blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd) { struct scsi_device *sdev = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); unsigned short nr_segs = blk_rq_nr_phys_segments(rq); struct scatterlist *last_sg = NULL; blk_status_t ret; bool need_drain = scsi_cmd_needs_dma_drain(sdev, rq); int count; if (WARN_ON_ONCE(!nr_segs)) return BLK_STS_IOERR; /* * Make sure there is space for the drain. The driver must adjust * max_hw_segments to be prepared for this. */ if (need_drain) nr_segs++; /* * If sg table allocation fails, requeue request later. */ if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, nr_segs, cmd->sdb.table.sgl, SCSI_INLINE_SG_CNT))) return BLK_STS_RESOURCE; /* * Next, walk the list, and fill in the addresses and sizes of * each segment. */ count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); if (blk_rq_bytes(rq) & rq->q->dma_pad_mask) { unsigned int pad_len = (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; last_sg->length += pad_len; cmd->extra_len += pad_len; } if (need_drain) { sg_unmark_end(last_sg); last_sg = sg_next(last_sg); sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len); sg_mark_end(last_sg); cmd->extra_len += sdev->dma_drain_len; count++; } BUG_ON(count > cmd->sdb.table.nents); cmd->sdb.table.nents = count; cmd->sdb.length = blk_rq_payload_bytes(rq); if (blk_integrity_rq(rq)) { struct scsi_data_buffer *prot_sdb = cmd->prot_sdb; int ivecs; if (WARN_ON_ONCE(!prot_sdb)) { /* * This can happen if someone (e.g. multipath) * queues a command to a device on an adapter * that does not support DIX. */ ret = BLK_STS_IOERR; goto out_free_sgtables; } ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio); if (sg_alloc_table_chained(&prot_sdb->table, ivecs, prot_sdb->table.sgl, SCSI_INLINE_PROT_SG_CNT)) { ret = BLK_STS_RESOURCE; goto out_free_sgtables; } count = blk_rq_map_integrity_sg(rq->q, rq->bio, prot_sdb->table.sgl); BUG_ON(count > ivecs); BUG_ON(count > queue_max_integrity_segments(rq->q)); cmd->prot_sdb = prot_sdb; cmd->prot_sdb->table.nents = count; } return BLK_STS_OK; out_free_sgtables: scsi_free_sgtables(cmd); return ret; } EXPORT_SYMBOL(scsi_alloc_sgtables); /** * scsi_initialize_rq - initialize struct scsi_cmnd partially * @rq: Request associated with the SCSI command to be initialized. * * This function initializes the members of struct scsi_cmnd that must be * initialized before request processing starts and that won't be * reinitialized if a SCSI command is requeued. */ static void scsi_initialize_rq(struct request *rq) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); memset(cmd->cmnd, 0, sizeof(cmd->cmnd)); cmd->cmd_len = MAX_COMMAND_SIZE; cmd->sense_len = 0; init_rcu_head(&cmd->rcu); cmd->jiffies_at_alloc = jiffies; cmd->retries = 0; } struct request *scsi_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct request *rq; rq = blk_mq_alloc_request(q, opf, flags); if (!IS_ERR(rq)) scsi_initialize_rq(rq); return rq; } EXPORT_SYMBOL_GPL(scsi_alloc_request); /* * Only called when the request isn't completed by SCSI, and not freed by * SCSI */ static void scsi_cleanup_rq(struct request *rq) { if (rq->rq_flags & RQF_DONTPREP) { scsi_mq_uninit_cmd(blk_mq_rq_to_pdu(rq)); rq->rq_flags &= ~RQF_DONTPREP; } } /* Called before a request is prepared. See also scsi_mq_prep_fn(). */ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); if (!blk_rq_is_passthrough(rq) && !(cmd->flags & SCMD_INITIALIZED)) { cmd->flags |= SCMD_INITIALIZED; scsi_initialize_rq(rq); } cmd->device = dev; INIT_LIST_HEAD(&cmd->eh_entry); INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler); } static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); /* * Passthrough requests may transfer data, in which case they must * a bio attached to them. Or they might contain a SCSI command * that does not transfer data, in which case they may optionally * submit a request without an attached bio. */ if (req->bio) { blk_status_t ret = scsi_alloc_sgtables(cmd); if (unlikely(ret != BLK_STS_OK)) return ret; } else { BUG_ON(blk_rq_bytes(req)); memset(&cmd->sdb, 0, sizeof(cmd->sdb)); } cmd->transfersize = blk_rq_bytes(req); return BLK_STS_OK; } static blk_status_t scsi_device_state_check(struct scsi_device *sdev, struct request *req) { switch (sdev->sdev_state) { case SDEV_CREATED: return BLK_STS_OK; case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: /* * If the device is offline we refuse to process any * commands. The device must be brought online * before trying any recovery commands. */ if (!sdev->offline_already) { sdev->offline_already = true; sdev_printk(KERN_ERR, sdev, "rejecting I/O to offline device\n"); } return BLK_STS_IOERR; case SDEV_DEL: /* * If the device is fully deleted, we refuse to * process any commands as well. */ sdev_printk(KERN_ERR, sdev, "rejecting I/O to dead device\n"); return BLK_STS_IOERR; case SDEV_BLOCK: case SDEV_CREATED_BLOCK: return BLK_STS_RESOURCE; case SDEV_QUIESCE: /* * If the device is blocked we only accept power management * commands. */ if (req && WARN_ON_ONCE(!(req->rq_flags & RQF_PM))) return BLK_STS_RESOURCE; return BLK_STS_OK; default: /* * For any other not fully online state we only allow * power management commands. */ if (req && !(req->rq_flags & RQF_PM)) return BLK_STS_OFFLINE; return BLK_STS_OK; } } /* * scsi_dev_queue_ready: if we can send requests to sdev, assign one token * and return the token else return -1. */ static inline int scsi_dev_queue_ready(struct request_queue *q, struct scsi_device *sdev) { int token; token = sbitmap_get(&sdev->budget_map); if (token < 0) return -1; if (!atomic_read(&sdev->device_blocked)) return token; /* * Only unblock if no other commands are pending and * if device_blocked has decreased to zero */ if (scsi_device_busy(sdev) > 1 || atomic_dec_return(&sdev->device_blocked) > 0) { sbitmap_put(&sdev->budget_map, token); return -1; } SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev, "unblocking device at zero depth\n")); return token; } /* * scsi_target_queue_ready: checks if there we can send commands to target * @sdev: scsi device on starget to check. */ static inline int scsi_target_queue_ready(struct Scsi_Host *shost, struct scsi_device *sdev) { struct scsi_target *starget = scsi_target(sdev); unsigned int busy; if (starget->single_lun) { spin_lock_irq(shost->host_lock); if (starget->starget_sdev_user && starget->starget_sdev_user != sdev) { spin_unlock_irq(shost->host_lock); return 0; } starget->starget_sdev_user = sdev; spin_unlock_irq(shost->host_lock); } if (starget->can_queue <= 0) return 1; busy = atomic_inc_return(&starget->target_busy) - 1; if (atomic_read(&starget->target_blocked) > 0) { if (busy) goto starved; /* * unblock after target_blocked iterates to zero */ if (atomic_dec_return(&starget->target_blocked) > 0) goto out_dec; SCSI_LOG_MLQUEUE(3, starget_printk(KERN_INFO, starget, "unblocking target at zero depth\n")); } if (busy >= starget->can_queue) goto starved; return 1; starved: spin_lock_irq(shost->host_lock); list_move_tail(&sdev->starved_entry, &shost->starved_list); spin_unlock_irq(shost->host_lock); out_dec: if (starget->can_queue > 0) atomic_dec(&starget->target_busy); return 0; } /* * scsi_host_queue_ready: if we can send requests to shost, return 1 else * return 0. We must end up running the queue again whenever 0 is * returned, else IO can hang. */ static inline int scsi_host_queue_ready(struct request_queue *q, struct Scsi_Host *shost, struct scsi_device *sdev, struct scsi_cmnd *cmd) { if (atomic_read(&shost->host_blocked) > 0) { if (scsi_host_busy(shost) > 0) goto starved; /* * unblock after host_blocked iterates to zero */ if (atomic_dec_return(&shost->host_blocked) > 0) goto out_dec; SCSI_LOG_MLQUEUE(3, shost_printk(KERN_INFO, shost, "unblocking host at zero depth\n")); } if (shost->host_self_blocked) goto starved; /* We're OK to process the command, so we can't be starved */ if (!list_empty(&sdev->starved_entry)) { spin_lock_irq(shost->host_lock); if (!list_empty(&sdev->starved_entry)) list_del_init(&sdev->starved_entry); spin_unlock_irq(shost->host_lock); } __set_bit(SCMD_STATE_INFLIGHT, &cmd->state); return 1; starved: spin_lock_irq(shost->host_lock); if (list_empty(&sdev->starved_entry)) list_add_tail(&sdev->starved_entry, &shost->starved_list); spin_unlock_irq(shost->host_lock); out_dec: scsi_dec_host_busy(shost, cmd); return 0; } /* * Busy state exporting function for request stacking drivers. * * For efficiency, no lock is taken to check the busy state of * shost/starget/sdev, since the returned value is not guaranteed and * may be changed after request stacking drivers call the function, * regardless of taking lock or not. * * When scsi can't dispatch I/Os anymore and needs to kill I/Os scsi * needs to return 'not busy'. Otherwise, request stacking drivers * may hold requests forever. */ static bool scsi_mq_lld_busy(struct request_queue *q) { struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost; if (blk_queue_dying(q)) return false; shost = sdev->host; /* * Ignore host/starget busy state. * Since block layer does not have a concept of fairness across * multiple queues, congestion of host/starget needs to be handled * in SCSI layer. */ if (scsi_host_in_recovery(shost) || scsi_device_is_busy(sdev)) return true; return false; } /* * Block layer request completion callback. May be called from interrupt * context. */ static void scsi_complete(struct request *rq) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); enum scsi_disposition disposition; INIT_LIST_HEAD(&cmd->eh_entry); atomic_inc(&cmd->device->iodone_cnt); if (cmd->result) atomic_inc(&cmd->device->ioerr_cnt); disposition = scsi_decide_disposition(cmd); if (disposition != SUCCESS && scsi_cmd_runtime_exceeced(cmd)) disposition = SUCCESS; scsi_log_completion(cmd, disposition); switch (disposition) { case SUCCESS: scsi_finish_command(cmd); break; case NEEDS_RETRY: scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY); break; case ADD_TO_MLQUEUE: scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY); break; default: scsi_eh_scmd_add(cmd); break; } } /** * scsi_dispatch_cmd - Dispatch a command to the low-level driver. * @cmd: command block we are dispatching. * * Return: nonzero return request was rejected and device's queue needs to be * plugged. */ static int scsi_dispatch_cmd(struct scsi_cmnd *cmd) { struct Scsi_Host *host = cmd->device->host; int rtn = 0; atomic_inc(&cmd->device->iorequest_cnt); /* check if the device is still usable */ if (unlikely(cmd->device->sdev_state == SDEV_DEL)) { /* in SDEV_DEL we error all commands. DID_NO_CONNECT * returns an immediate error upwards, and signals * that the device is no longer present */ cmd->result = DID_NO_CONNECT << 16; goto done; } /* Check to see if the scsi lld made this device blocked. */ if (unlikely(scsi_device_blocked(cmd->device))) { /* * in blocked state, the command is just put back on * the device queue. The suspend state has already * blocked the queue so future requests should not * occur until the device transitions out of the * suspend state. */ SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd, "queuecommand : device blocked\n")); atomic_dec(&cmd->device->iorequest_cnt); return SCSI_MLQUEUE_DEVICE_BUSY; } /* Store the LUN value in cmnd, if needed. */ if (cmd->device->lun_in_cdb) cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) | (cmd->device->lun << 5 & 0xe0); scsi_log_send(cmd); /* * Before we queue this command, check if the command * length exceeds what the host adapter can handle. */ if (cmd->cmd_len > cmd->device->host->max_cmd_len) { SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd, "queuecommand : command too long. " "cdb_size=%d host->max_cmd_len=%d\n", cmd->cmd_len, cmd->device->host->max_cmd_len)); cmd->result = (DID_ABORT << 16); goto done; } if (unlikely(host->shost_state == SHOST_DEL)) { cmd->result = (DID_NO_CONNECT << 16); goto done; } trace_scsi_dispatch_cmd_start(cmd); rtn = host->hostt->queuecommand(host, cmd); if (rtn) { atomic_dec(&cmd->device->iorequest_cnt); trace_scsi_dispatch_cmd_error(cmd, rtn); if (rtn != SCSI_MLQUEUE_DEVICE_BUSY && rtn != SCSI_MLQUEUE_TARGET_BUSY) rtn = SCSI_MLQUEUE_HOST_BUSY; SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd, "queuecommand : request rejected\n")); } return rtn; done: scsi_done(cmd); return 0; } /* Size in bytes of the sg-list stored in the scsi-mq command-private data. */ static unsigned int scsi_mq_inline_sgl_size(struct Scsi_Host *shost) { return min_t(unsigned int, shost->sg_tablesize, SCSI_INLINE_SG_CNT) * sizeof(struct scatterlist); } static blk_status_t scsi_prepare_cmd(struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); struct scsi_device *sdev = req->q->queuedata; struct Scsi_Host *shost = sdev->host; bool in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state); struct scatterlist *sg; scsi_init_command(sdev, cmd); cmd->eh_eflags = 0; cmd->prot_type = 0; cmd->prot_flags = 0; cmd->submitter = 0; memset(&cmd->sdb, 0, sizeof(cmd->sdb)); cmd->underflow = 0; cmd->transfersize = 0; cmd->host_scribble = NULL; cmd->result = 0; cmd->extra_len = 0; cmd->state = 0; if (in_flight) __set_bit(SCMD_STATE_INFLIGHT, &cmd->state); /* * Only clear the driver-private command data if the LLD does not supply * a function to initialize that data. */ if (!shost->hostt->init_cmd_priv) memset(cmd + 1, 0, shost->hostt->cmd_size); cmd->prot_op = SCSI_PROT_NORMAL; if (blk_rq_bytes(req)) cmd->sc_data_direction = rq_dma_dir(req); else cmd->sc_data_direction = DMA_NONE; sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size; cmd->sdb.table.sgl = sg; if (scsi_host_get_prot(shost)) { memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer)); cmd->prot_sdb->table.sgl = (struct scatterlist *)(cmd->prot_sdb + 1); } /* * Special handling for passthrough commands, which don't go to the ULP * at all: */ if (blk_rq_is_passthrough(req)) return scsi_setup_scsi_cmnd(sdev, req); if (sdev->handler && sdev->handler->prep_fn) { blk_status_t ret = sdev->handler->prep_fn(sdev, req); if (ret != BLK_STS_OK) return ret; } /* Usually overridden by the ULP */ cmd->allowed = 0; memset(cmd->cmnd, 0, sizeof(cmd->cmnd)); return scsi_cmd_to_driver(cmd)->init_command(cmd); } static void scsi_done_internal(struct scsi_cmnd *cmd, bool complete_directly) { struct request *req = scsi_cmd_to_rq(cmd); switch (cmd->submitter) { case SUBMITTED_BY_BLOCK_LAYER: break; case SUBMITTED_BY_SCSI_ERROR_HANDLER: return scsi_eh_done(cmd); case SUBMITTED_BY_SCSI_RESET_IOCTL: return; } if (unlikely(blk_should_fake_timeout(scsi_cmd_to_rq(cmd)->q))) return; if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state))) return; trace_scsi_dispatch_cmd_done(cmd); if (complete_directly) blk_mq_complete_request_direct(req, scsi_complete); else blk_mq_complete_request(req); } void scsi_done(struct scsi_cmnd *cmd) { scsi_done_internal(cmd, false); } EXPORT_SYMBOL(scsi_done); void scsi_done_direct(struct scsi_cmnd *cmd) { scsi_done_internal(cmd, true); } EXPORT_SYMBOL(scsi_done_direct); static void scsi_mq_put_budget(struct request_queue *q, int budget_token) { struct scsi_device *sdev = q->queuedata; sbitmap_put(&sdev->budget_map, budget_token); } /* * When to reinvoke queueing after a resource shortage. It's 3 msecs to * not change behaviour from the previous unplug mechanism, experimentation * may prove this needs changing. */ #define SCSI_QUEUE_DELAY 3 static int scsi_mq_get_budget(struct request_queue *q) { struct scsi_device *sdev = q->queuedata; int token = scsi_dev_queue_ready(q, sdev); if (token >= 0) return token; atomic_inc(&sdev->restarts); /* * Orders atomic_inc(&sdev->restarts) and atomic_read(&sdev->device_busy). * .restarts must be incremented before .device_busy is read because the * code in scsi_run_queue_async() depends on the order of these operations. */ smp_mb__after_atomic(); /* * If all in-flight requests originated from this LUN are completed * before reading .device_busy, sdev->device_busy will be observed as * zero, then blk_mq_delay_run_hw_queues() will dispatch this request * soon. Otherwise, completion of one of these requests will observe * the .restarts flag, and the request queue will be run for handling * this request, see scsi_end_request(). */ if (unlikely(scsi_device_busy(sdev) == 0 && !scsi_device_blocked(sdev))) blk_mq_delay_run_hw_queues(sdev->request_queue, SCSI_QUEUE_DELAY); return -1; } static void scsi_mq_set_rq_budget_token(struct request *req, int token) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); cmd->budget_token = token; } static int scsi_mq_get_rq_budget_token(struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); return cmd->budget_token; } static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *req = bd->rq; struct request_queue *q = req->q; struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost = sdev->host; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); blk_status_t ret; int reason; WARN_ON_ONCE(cmd->budget_token < 0); /* * If the device is not in running state we will reject some or all * commands. */ if (unlikely(sdev->sdev_state != SDEV_RUNNING)) { ret = scsi_device_state_check(sdev, req); if (ret != BLK_STS_OK) goto out_put_budget; } ret = BLK_STS_RESOURCE; if (!scsi_target_queue_ready(shost, sdev)) goto out_put_budget; if (unlikely(scsi_host_in_recovery(shost))) { if (cmd->flags & SCMD_FAIL_IF_RECOVERING) ret = BLK_STS_OFFLINE; goto out_dec_target_busy; } if (!scsi_host_queue_ready(q, shost, sdev, cmd)) goto out_dec_target_busy; if (!(req->rq_flags & RQF_DONTPREP)) { ret = scsi_prepare_cmd(req); if (ret != BLK_STS_OK) goto out_dec_host_busy; req->rq_flags |= RQF_DONTPREP; } else { clear_bit(SCMD_STATE_COMPLETE, &cmd->state); } cmd->flags &= SCMD_PRESERVED_FLAGS; if (sdev->simple_tags) cmd->flags |= SCMD_TAGGED; if (bd->last) cmd->flags |= SCMD_LAST; scsi_set_resid(cmd, 0); memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); cmd->submitter = SUBMITTED_BY_BLOCK_LAYER; blk_mq_start_request(req); reason = scsi_dispatch_cmd(cmd); if (reason) { scsi_set_blocked(cmd, reason); ret = BLK_STS_RESOURCE; goto out_dec_host_busy; } return BLK_STS_OK; out_dec_host_busy: scsi_dec_host_busy(shost, cmd); out_dec_target_busy: if (scsi_target(sdev)->can_queue > 0) atomic_dec(&scsi_target(sdev)->target_busy); out_put_budget: scsi_mq_put_budget(q, cmd->budget_token); cmd->budget_token = -1; switch (ret) { case BLK_STS_OK: break; case BLK_STS_RESOURCE: case BLK_STS_ZONE_RESOURCE: if (scsi_device_blocked(sdev)) ret = BLK_STS_DEV_RESOURCE; break; case BLK_STS_AGAIN: cmd->result = DID_BUS_BUSY << 16; if (req->rq_flags & RQF_DONTPREP) scsi_mq_uninit_cmd(cmd); break; default: if (unlikely(!scsi_device_online(sdev))) cmd->result = DID_NO_CONNECT << 16; else cmd->result = DID_ERROR << 16; /* * Make sure to release all allocated resources when * we hit an error, as we will never see this command * again. */ if (req->rq_flags & RQF_DONTPREP) scsi_mq_uninit_cmd(cmd); scsi_run_queue_async(sdev); break; } return ret; } static int scsi_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, unsigned int numa_node) { struct Scsi_Host *shost = set->driver_data; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); struct scatterlist *sg; int ret = 0; cmd->sense_buffer = kmem_cache_alloc_node(scsi_sense_cache, GFP_KERNEL, numa_node); if (!cmd->sense_buffer) return -ENOMEM; if (scsi_host_get_prot(shost)) { sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size; cmd->prot_sdb = (void *)sg + scsi_mq_inline_sgl_size(shost); } if (shost->hostt->init_cmd_priv) { ret = shost->hostt->init_cmd_priv(shost, cmd); if (ret < 0) kmem_cache_free(scsi_sense_cache, cmd->sense_buffer); } return ret; } static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx) { struct Scsi_Host *shost = set->driver_data; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); if (shost->hostt->exit_cmd_priv) shost->hostt->exit_cmd_priv(shost, cmd); kmem_cache_free(scsi_sense_cache, cmd->sense_buffer); } static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct Scsi_Host *shost = hctx->driver_data; if (shost->hostt->mq_poll) return shost->hostt->mq_poll(shost, hctx->queue_num); return 0; } static int scsi_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct Scsi_Host *shost = data; hctx->driver_data = shost; return 0; } static void scsi_map_queues(struct blk_mq_tag_set *set) { struct Scsi_Host *shost = container_of(set, struct Scsi_Host, tag_set); if (shost->hostt->map_queues) return shost->hostt->map_queues(shost); blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q) { struct device *dev = shost->dma_dev; /* * this limit is imposed by hardware restrictions */ blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize, SG_MAX_SEGMENTS)); if (scsi_host_prot_dma(shost)) { shost->sg_prot_tablesize = min_not_zero(shost->sg_prot_tablesize, (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS); BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize); blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize); } blk_queue_max_hw_sectors(q, shost->max_sectors); blk_queue_segment_boundary(q, shost->dma_boundary); dma_set_seg_boundary(dev, shost->dma_boundary); blk_queue_max_segment_size(q, shost->max_segment_size); blk_queue_virt_boundary(q, shost->virt_boundary_mask); dma_set_max_seg_size(dev, queue_max_segment_size(q)); /* * Set a reasonable default alignment: The larger of 32-byte (dword), * which is a common minimum for HBAs, and the minimum DMA alignment, * which is set by the platform. * * Devices that require a bigger alignment can increase it later. */ blk_queue_dma_alignment(q, max(4, dma_get_cache_alignment()) - 1); } EXPORT_SYMBOL_GPL(__scsi_init_queue); static const struct blk_mq_ops scsi_mq_ops_no_commit = { .get_budget = scsi_mq_get_budget, .put_budget = scsi_mq_put_budget, .queue_rq = scsi_queue_rq, .complete = scsi_complete, .timeout = scsi_timeout, #ifdef CONFIG_BLK_DEBUG_FS .show_rq = scsi_show_rq, #endif .init_request = scsi_mq_init_request, .exit_request = scsi_mq_exit_request, .cleanup_rq = scsi_cleanup_rq, .busy = scsi_mq_lld_busy, .map_queues = scsi_map_queues, .init_hctx = scsi_init_hctx, .poll = scsi_mq_poll, .set_rq_budget_token = scsi_mq_set_rq_budget_token, .get_rq_budget_token = scsi_mq_get_rq_budget_token, }; static void scsi_commit_rqs(struct blk_mq_hw_ctx *hctx) { struct Scsi_Host *shost = hctx->driver_data; shost->hostt->commit_rqs(shost, hctx->queue_num); } static const struct blk_mq_ops scsi_mq_ops = { .get_budget = scsi_mq_get_budget, .put_budget = scsi_mq_put_budget, .queue_rq = scsi_queue_rq, .commit_rqs = scsi_commit_rqs, .complete = scsi_complete, .timeout = scsi_timeout, #ifdef CONFIG_BLK_DEBUG_FS .show_rq = scsi_show_rq, #endif .init_request = scsi_mq_init_request, .exit_request = scsi_mq_exit_request, .cleanup_rq = scsi_cleanup_rq, .busy = scsi_mq_lld_busy, .map_queues = scsi_map_queues, .init_hctx = scsi_init_hctx, .poll = scsi_mq_poll, .set_rq_budget_token = scsi_mq_set_rq_budget_token, .get_rq_budget_token = scsi_mq_get_rq_budget_token, }; int scsi_mq_setup_tags(struct Scsi_Host *shost) { unsigned int cmd_size, sgl_size; struct blk_mq_tag_set *tag_set = &shost->tag_set; sgl_size = max_t(unsigned int, sizeof(struct scatterlist), scsi_mq_inline_sgl_size(shost)); cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size + sgl_size; if (scsi_host_get_prot(shost)) cmd_size += sizeof(struct scsi_data_buffer) + sizeof(struct scatterlist) * SCSI_INLINE_PROT_SG_CNT; memset(tag_set, 0, sizeof(*tag_set)); if (shost->hostt->commit_rqs) tag_set->ops = &scsi_mq_ops; else tag_set->ops = &scsi_mq_ops_no_commit; tag_set->nr_hw_queues = shost->nr_hw_queues ? : 1; tag_set->nr_maps = shost->nr_maps ? : 1; tag_set->queue_depth = shost->can_queue; tag_set->cmd_size = cmd_size; tag_set->numa_node = dev_to_node(shost->dma_dev); tag_set->flags = BLK_MQ_F_SHOULD_MERGE; tag_set->flags |= BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy); if (shost->queuecommand_may_block) tag_set->flags |= BLK_MQ_F_BLOCKING; tag_set->driver_data = shost; if (shost->host_tagset) tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; return blk_mq_alloc_tag_set(tag_set); } void scsi_mq_free_tags(struct kref *kref) { struct Scsi_Host *shost = container_of(kref, typeof(*shost), tagset_refcnt); blk_mq_free_tag_set(&shost->tag_set); complete(&shost->tagset_freed); } /** * scsi_device_from_queue - return sdev associated with a request_queue * @q: The request queue to return the sdev from * * Return the sdev associated with a request queue or NULL if the * request_queue does not reference a SCSI device. */ struct scsi_device *scsi_device_from_queue(struct request_queue *q) { struct scsi_device *sdev = NULL; if (q->mq_ops == &scsi_mq_ops_no_commit || q->mq_ops == &scsi_mq_ops) sdev = q->queuedata; if (!sdev || !get_device(&sdev->sdev_gendev)) sdev = NULL; return sdev; } /* * pktcdvd should have been integrated into the SCSI layers, but for historical * reasons like the old IDE driver it isn't. This export allows it to safely * probe if a given device is a SCSI one and only attach to that. */ #ifdef CONFIG_CDROM_PKTCDVD_MODULE EXPORT_SYMBOL_GPL(scsi_device_from_queue); #endif /** * scsi_block_requests - Utility function used by low-level drivers to prevent * further commands from being queued to the device. * @shost: host in question * * There is no timer nor any other means by which the requests get unblocked * other than the low-level driver calling scsi_unblock_requests(). */ void scsi_block_requests(struct Scsi_Host *shost) { shost->host_self_blocked = 1; } EXPORT_SYMBOL(scsi_block_requests); /** * scsi_unblock_requests - Utility function used by low-level drivers to allow * further commands to be queued to the device. * @shost: host in question * * There is no timer nor any other means by which the requests get unblocked * other than the low-level driver calling scsi_unblock_requests(). This is done * as an API function so that changes to the internals of the scsi mid-layer * won't require wholesale changes to drivers that use this feature. */ void scsi_unblock_requests(struct Scsi_Host *shost) { shost->host_self_blocked = 0; scsi_run_host_queues(shost); } EXPORT_SYMBOL(scsi_unblock_requests); void scsi_exit_queue(void) { kmem_cache_destroy(scsi_sense_cache); } /** * scsi_mode_select - issue a mode select * @sdev: SCSI device to be queried * @pf: Page format bit (1 == standard, 0 == vendor specific) * @sp: Save page bit (0 == don't save, 1 == save) * @buffer: request buffer (may not be smaller than eight bytes) * @len: length of request buffer. * @timeout: command timeout * @retries: number of retries before failing * @data: returns a structure abstracting the mode header data * @sshdr: place to put sense data (or NULL if no sense to be collected). * must be SCSI_SENSE_BUFFERSIZE big. * * Returns zero if successful; negative error number or scsi * status on error * */ int scsi_mode_select(struct scsi_device *sdev, int pf, int sp, unsigned char *buffer, int len, int timeout, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) { unsigned char cmd[10]; unsigned char *real_buffer; const struct scsi_exec_args exec_args = { .sshdr = sshdr, }; int ret; memset(cmd, 0, sizeof(cmd)); cmd[1] = (pf ? 0x10 : 0) | (sp ? 0x01 : 0); /* * Use MODE SELECT(10) if the device asked for it or if the mode page * and the mode select header cannot fit within the maximumm 255 bytes * of the MODE SELECT(6) command. */ if (sdev->use_10_for_ms || len + 4 > 255 || data->block_descriptor_length > 255) { if (len > 65535 - 8) return -EINVAL; real_buffer = kmalloc(8 + len, GFP_KERNEL); if (!real_buffer) return -ENOMEM; memcpy(real_buffer + 8, buffer, len); len += 8; real_buffer[0] = 0; real_buffer[1] = 0; real_buffer[2] = data->medium_type; real_buffer[3] = data->device_specific; real_buffer[4] = data->longlba ? 0x01 : 0; real_buffer[5] = 0; put_unaligned_be16(data->block_descriptor_length, &real_buffer[6]); cmd[0] = MODE_SELECT_10; put_unaligned_be16(len, &cmd[7]); } else { if (data->longlba) return -EINVAL; real_buffer = kmalloc(4 + len, GFP_KERNEL); if (!real_buffer) return -ENOMEM; memcpy(real_buffer + 4, buffer, len); len += 4; real_buffer[0] = 0; real_buffer[1] = data->medium_type; real_buffer[2] = data->device_specific; real_buffer[3] = data->block_descriptor_length; cmd[0] = MODE_SELECT; cmd[4] = len; } ret = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_OUT, real_buffer, len, timeout, retries, &exec_args); kfree(real_buffer); return ret; } EXPORT_SYMBOL_GPL(scsi_mode_select); /** * scsi_mode_sense - issue a mode sense, falling back from 10 to six bytes if necessary. * @sdev: SCSI device to be queried * @dbd: set to prevent mode sense from returning block descriptors * @modepage: mode page being requested * @subpage: sub-page of the mode page being requested * @buffer: request buffer (may not be smaller than eight bytes) * @len: length of request buffer. * @timeout: command timeout * @retries: number of retries before failing * @data: returns a structure abstracting the mode header data * @sshdr: place to put sense data (or NULL if no sense to be collected). * must be SCSI_SENSE_BUFFERSIZE big. * * Returns zero if successful, or a negative error number on failure */ int scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage, int subpage, unsigned char *buffer, int len, int timeout, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) { unsigned char cmd[12]; int use_10_for_ms; int header_length; int result, retry_count = retries; struct scsi_sense_hdr my_sshdr; const struct scsi_exec_args exec_args = { /* caller might not be interested in sense, but we need it */ .sshdr = sshdr ? : &my_sshdr, }; memset(data, 0, sizeof(*data)); memset(&cmd[0], 0, 12); dbd = sdev->set_dbd_for_ms ? 8 : dbd; cmd[1] = dbd & 0x18; /* allows DBD and LLBA bits */ cmd[2] = modepage; cmd[3] = subpage; sshdr = exec_args.sshdr; retry: use_10_for_ms = sdev->use_10_for_ms || len > 255; if (use_10_for_ms) { if (len < 8 || len > 65535) return -EINVAL; cmd[0] = MODE_SENSE_10; put_unaligned_be16(len, &cmd[7]); header_length = 8; } else { if (len < 4) return -EINVAL; cmd[0] = MODE_SENSE; cmd[4] = len; header_length = 4; } memset(buffer, 0, len); result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, len, timeout, retries, &exec_args); if (result < 0) return result; /* This code looks awful: what it's doing is making sure an * ILLEGAL REQUEST sense return identifies the actual command * byte as the problem. MODE_SENSE commands can return * ILLEGAL REQUEST if the code page isn't supported */ if (!scsi_status_is_good(result)) { if (scsi_sense_valid(sshdr)) { if ((sshdr->sense_key == ILLEGAL_REQUEST) && (sshdr->asc == 0x20) && (sshdr->ascq == 0)) { /* * Invalid command operation code: retry using * MODE SENSE(6) if this was a MODE SENSE(10) * request, except if the request mode page is * too large for MODE SENSE single byte * allocation length field. */ if (use_10_for_ms) { if (len > 255) return -EIO; sdev->use_10_for_ms = 0; goto retry; } } if (scsi_status_is_check_condition(result) && sshdr->sense_key == UNIT_ATTENTION && retry_count) { retry_count--; goto retry; } } return -EIO; } if (unlikely(buffer[0] == 0x86 && buffer[1] == 0x0b && (modepage == 6 || modepage == 8))) { /* Initio breakage? */ header_length = 0; data->length = 13; data->medium_type = 0; data->device_specific = 0; data->longlba = 0; data->block_descriptor_length = 0; } else if (use_10_for_ms) { data->length = get_unaligned_be16(&buffer[0]) + 2; data->medium_type = buffer[2]; data->device_specific = buffer[3]; data->longlba = buffer[4] & 0x01; data->block_descriptor_length = get_unaligned_be16(&buffer[6]); } else { data->length = buffer[0] + 1; data->medium_type = buffer[1]; data->device_specific = buffer[2]; data->block_descriptor_length = buffer[3]; } data->header_length = header_length; return 0; } EXPORT_SYMBOL(scsi_mode_sense); /** * scsi_test_unit_ready - test if unit is ready * @sdev: scsi device to change the state of. * @timeout: command timeout * @retries: number of retries before failing * @sshdr: outpout pointer for decoded sense information. * * Returns zero if unsuccessful or an error if TUR failed. For * removable media, UNIT_ATTENTION sets ->changed flag. **/ int scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries, struct scsi_sense_hdr *sshdr) { char cmd[] = { TEST_UNIT_READY, 0, 0, 0, 0, 0, }; const struct scsi_exec_args exec_args = { .sshdr = sshdr, }; int result; /* try to eat the UNIT_ATTENTION if there are enough retries */ do { result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, NULL, 0, timeout, 1, &exec_args); if (sdev->removable && result > 0 && scsi_sense_valid(sshdr) && sshdr->sense_key == UNIT_ATTENTION) sdev->changed = 1; } while (result > 0 && scsi_sense_valid(sshdr) && sshdr->sense_key == UNIT_ATTENTION && --retries); return result; } EXPORT_SYMBOL(scsi_test_unit_ready); /** * scsi_device_set_state - Take the given device through the device state model. * @sdev: scsi device to change the state of. * @state: state to change to. * * Returns zero if successful or an error if the requested * transition is illegal. */ int scsi_device_set_state(struct scsi_device *sdev, enum scsi_device_state state) { enum scsi_device_state oldstate = sdev->sdev_state; if (state == oldstate) return 0; switch (state) { case SDEV_CREATED: switch (oldstate) { case SDEV_CREATED_BLOCK: break; default: goto illegal; } break; case SDEV_RUNNING: switch (oldstate) { case SDEV_CREATED: case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: case SDEV_QUIESCE: case SDEV_BLOCK: break; default: goto illegal; } break; case SDEV_QUIESCE: switch (oldstate) { case SDEV_RUNNING: case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: break; default: goto illegal; } break; case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: switch (oldstate) { case SDEV_CREATED: case SDEV_RUNNING: case SDEV_QUIESCE: case SDEV_BLOCK: break; default: goto illegal; } break; case SDEV_BLOCK: switch (oldstate) { case SDEV_RUNNING: case SDEV_CREATED_BLOCK: case SDEV_QUIESCE: case SDEV_OFFLINE: break; default: goto illegal; } break; case SDEV_CREATED_BLOCK: switch (oldstate) { case SDEV_CREATED: break; default: goto illegal; } break; case SDEV_CANCEL: switch (oldstate) { case SDEV_CREATED: case SDEV_RUNNING: case SDEV_QUIESCE: case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: break; default: goto illegal; } break; case SDEV_DEL: switch (oldstate) { case SDEV_CREATED: case SDEV_RUNNING: case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: case SDEV_CANCEL: case SDEV_BLOCK: case SDEV_CREATED_BLOCK: break; default: goto illegal; } break; } sdev->offline_already = false; sdev->sdev_state = state; return 0; illegal: SCSI_LOG_ERROR_RECOVERY(1, sdev_printk(KERN_ERR, sdev, "Illegal state transition %s->%s", scsi_device_state_name(oldstate), scsi_device_state_name(state)) ); return -EINVAL; } EXPORT_SYMBOL(scsi_device_set_state); /** * scsi_evt_emit - emit a single SCSI device uevent * @sdev: associated SCSI device * @evt: event to emit * * Send a single uevent (scsi_event) to the associated scsi_device. */ static void scsi_evt_emit(struct scsi_device *sdev, struct scsi_event *evt) { int idx = 0; char *envp[3]; switch (evt->evt_type) { case SDEV_EVT_MEDIA_CHANGE: envp[idx++] = "SDEV_MEDIA_CHANGE=1"; break; case SDEV_EVT_INQUIRY_CHANGE_REPORTED: scsi_rescan_device(sdev); envp[idx++] = "SDEV_UA=INQUIRY_DATA_HAS_CHANGED"; break; case SDEV_EVT_CAPACITY_CHANGE_REPORTED: envp[idx++] = "SDEV_UA=CAPACITY_DATA_HAS_CHANGED"; break; case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED: envp[idx++] = "SDEV_UA=THIN_PROVISIONING_SOFT_THRESHOLD_REACHED"; break; case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED: envp[idx++] = "SDEV_UA=MODE_PARAMETERS_CHANGED"; break; case SDEV_EVT_LUN_CHANGE_REPORTED: envp[idx++] = "SDEV_UA=REPORTED_LUNS_DATA_HAS_CHANGED"; break; case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED: envp[idx++] = "SDEV_UA=ASYMMETRIC_ACCESS_STATE_CHANGED"; break; case SDEV_EVT_POWER_ON_RESET_OCCURRED: envp[idx++] = "SDEV_UA=POWER_ON_RESET_OCCURRED"; break; default: /* do nothing */ break; } envp[idx++] = NULL; kobject_uevent_env(&sdev->sdev_gendev.kobj, KOBJ_CHANGE, envp); } /** * scsi_evt_thread - send a uevent for each scsi event * @work: work struct for scsi_device * * Dispatch queued events to their associated scsi_device kobjects * as uevents. */ void scsi_evt_thread(struct work_struct *work) { struct scsi_device *sdev; enum scsi_device_event evt_type; LIST_HEAD(event_list); sdev = container_of(work, struct scsi_device, event_work); for (evt_type = SDEV_EVT_FIRST; evt_type <= SDEV_EVT_LAST; evt_type++) if (test_and_clear_bit(evt_type, sdev->pending_events)) sdev_evt_send_simple(sdev, evt_type, GFP_KERNEL); while (1) { struct scsi_event *evt; struct list_head *this, *tmp; unsigned long flags; spin_lock_irqsave(&sdev->list_lock, flags); list_splice_init(&sdev->event_list, &event_list); spin_unlock_irqrestore(&sdev->list_lock, flags); if (list_empty(&event_list)) break; list_for_each_safe(this, tmp, &event_list) { evt = list_entry(this, struct scsi_event, node); list_del(&evt->node); scsi_evt_emit(sdev, evt); kfree(evt); } } } /** * sdev_evt_send - send asserted event to uevent thread * @sdev: scsi_device event occurred on * @evt: event to send * * Assert scsi device event asynchronously. */ void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt) { unsigned long flags; #if 0 /* FIXME: currently this check eliminates all media change events * for polled devices. Need to update to discriminate between AN * and polled events */ if (!test_bit(evt->evt_type, sdev->supported_events)) { kfree(evt); return; } #endif spin_lock_irqsave(&sdev->list_lock, flags); list_add_tail(&evt->node, &sdev->event_list); schedule_work(&sdev->event_work); spin_unlock_irqrestore(&sdev->list_lock, flags); } EXPORT_SYMBOL_GPL(sdev_evt_send); /** * sdev_evt_alloc - allocate a new scsi event * @evt_type: type of event to allocate * @gfpflags: GFP flags for allocation * * Allocates and returns a new scsi_event. */ struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type, gfp_t gfpflags) { struct scsi_event *evt = kzalloc(sizeof(struct scsi_event), gfpflags); if (!evt) return NULL; evt->evt_type = evt_type; INIT_LIST_HEAD(&evt->node); /* evt_type-specific initialization, if any */ switch (evt_type) { case SDEV_EVT_MEDIA_CHANGE: case SDEV_EVT_INQUIRY_CHANGE_REPORTED: case SDEV_EVT_CAPACITY_CHANGE_REPORTED: case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED: case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED: case SDEV_EVT_LUN_CHANGE_REPORTED: case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED: case SDEV_EVT_POWER_ON_RESET_OCCURRED: default: /* do nothing */ break; } return evt; } EXPORT_SYMBOL_GPL(sdev_evt_alloc); /** * sdev_evt_send_simple - send asserted event to uevent thread * @sdev: scsi_device event occurred on * @evt_type: type of event to send * @gfpflags: GFP flags for allocation * * Assert scsi device event asynchronously, given an event type. */ void sdev_evt_send_simple(struct scsi_device *sdev, enum scsi_device_event evt_type, gfp_t gfpflags) { struct scsi_event *evt = sdev_evt_alloc(evt_type, gfpflags); if (!evt) { sdev_printk(KERN_ERR, sdev, "event %d eaten due to OOM\n", evt_type); return; } sdev_evt_send(sdev, evt); } EXPORT_SYMBOL_GPL(sdev_evt_send_simple); /** * scsi_device_quiesce - Block all commands except power management. * @sdev: scsi device to quiesce. * * This works by trying to transition to the SDEV_QUIESCE state * (which must be a legal transition). When the device is in this * state, only power management requests will be accepted, all others will * be deferred. * * Must be called with user context, may sleep. * * Returns zero if unsuccessful or an error if not. */ int scsi_device_quiesce(struct scsi_device *sdev) { struct request_queue *q = sdev->request_queue; int err; /* * It is allowed to call scsi_device_quiesce() multiple times from * the same context but concurrent scsi_device_quiesce() calls are * not allowed. */ WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current); if (sdev->quiesced_by == current) return 0; blk_set_pm_only(q); blk_mq_freeze_queue(q); /* * Ensure that the effect of blk_set_pm_only() will be visible * for percpu_ref_tryget() callers that occur after the queue * unfreeze even if the queue was already frozen before this function * was called. See also https://lwn.net/Articles/573497/. */ synchronize_rcu(); blk_mq_unfreeze_queue(q); mutex_lock(&sdev->state_mutex); err = scsi_device_set_state(sdev, SDEV_QUIESCE); if (err == 0) sdev->quiesced_by = current; else blk_clear_pm_only(q); mutex_unlock(&sdev->state_mutex); return err; } EXPORT_SYMBOL(scsi_device_quiesce); /** * scsi_device_resume - Restart user issued commands to a quiesced device. * @sdev: scsi device to resume. * * Moves the device from quiesced back to running and restarts the * queues. * * Must be called with user context, may sleep. */ void scsi_device_resume(struct scsi_device *sdev) { /* check if the device state was mutated prior to resume, and if * so assume the state is being managed elsewhere (for example * device deleted during suspend) */ mutex_lock(&sdev->state_mutex); if (sdev->sdev_state == SDEV_QUIESCE) scsi_device_set_state(sdev, SDEV_RUNNING); if (sdev->quiesced_by) { sdev->quiesced_by = NULL; blk_clear_pm_only(sdev->request_queue); } mutex_unlock(&sdev->state_mutex); } EXPORT_SYMBOL(scsi_device_resume); static void device_quiesce_fn(struct scsi_device *sdev, void *data) { scsi_device_quiesce(sdev); } void scsi_target_quiesce(struct scsi_target *starget) { starget_for_each_device(starget, NULL, device_quiesce_fn); } EXPORT_SYMBOL(scsi_target_quiesce); static void device_resume_fn(struct scsi_device *sdev, void *data) { scsi_device_resume(sdev); } void scsi_target_resume(struct scsi_target *starget) { starget_for_each_device(starget, NULL, device_resume_fn); } EXPORT_SYMBOL(scsi_target_resume); static int __scsi_internal_device_block_nowait(struct scsi_device *sdev) { if (scsi_device_set_state(sdev, SDEV_BLOCK)) return scsi_device_set_state(sdev, SDEV_CREATED_BLOCK); return 0; } void scsi_start_queue(struct scsi_device *sdev) { if (cmpxchg(&sdev->queue_stopped, 1, 0)) blk_mq_unquiesce_queue(sdev->request_queue); } static void scsi_stop_queue(struct scsi_device *sdev) { /* * The atomic variable of ->queue_stopped covers that * blk_mq_quiesce_queue* is balanced with blk_mq_unquiesce_queue. * * The caller needs to wait until quiesce is done. */ if (!cmpxchg(&sdev->queue_stopped, 0, 1)) blk_mq_quiesce_queue_nowait(sdev->request_queue); } /** * scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state * @sdev: device to block * * Pause SCSI command processing on the specified device. Does not sleep. * * Returns zero if successful or a negative error code upon failure. * * Notes: * This routine transitions the device to the SDEV_BLOCK state (which must be * a legal transition). When the device is in this state, command processing * is paused until the device leaves the SDEV_BLOCK state. See also * scsi_internal_device_unblock_nowait(). */ int scsi_internal_device_block_nowait(struct scsi_device *sdev) { int ret = __scsi_internal_device_block_nowait(sdev); /* * The device has transitioned to SDEV_BLOCK. Stop the * block layer from calling the midlayer with this device's * request queue. */ if (!ret) scsi_stop_queue(sdev); return ret; } EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait); /** * scsi_device_block - try to transition to the SDEV_BLOCK state * @sdev: device to block * @data: dummy argument, ignored * * Pause SCSI command processing on the specified device. Callers must wait * until all ongoing scsi_queue_rq() calls have finished after this function * returns. * * Note: * This routine transitions the device to the SDEV_BLOCK state (which must be * a legal transition). When the device is in this state, command processing * is paused until the device leaves the SDEV_BLOCK state. See also * scsi_internal_device_unblock(). */ static void scsi_device_block(struct scsi_device *sdev, void *data) { int err; enum scsi_device_state state; mutex_lock(&sdev->state_mutex); err = __scsi_internal_device_block_nowait(sdev); state = sdev->sdev_state; if (err == 0) /* * scsi_stop_queue() must be called with the state_mutex * held. Otherwise a simultaneous scsi_start_queue() call * might unquiesce the queue before we quiesce it. */ scsi_stop_queue(sdev); mutex_unlock(&sdev->state_mutex); WARN_ONCE(err, "%s: failed to block %s in state %d\n", __func__, dev_name(&sdev->sdev_gendev), state); } /** * scsi_internal_device_unblock_nowait - resume a device after a block request * @sdev: device to resume * @new_state: state to set the device to after unblocking * * Restart the device queue for a previously suspended SCSI device. Does not * sleep. * * Returns zero if successful or a negative error code upon failure. * * Notes: * This routine transitions the device to the SDEV_RUNNING state or to one of * the offline states (which must be a legal transition) allowing the midlayer * to goose the queue for this device. */ int scsi_internal_device_unblock_nowait(struct scsi_device *sdev, enum scsi_device_state new_state) { switch (new_state) { case SDEV_RUNNING: case SDEV_TRANSPORT_OFFLINE: break; default: return -EINVAL; } /* * Try to transition the scsi device to SDEV_RUNNING or one of the * offlined states and goose the device queue if successful. */ switch (sdev->sdev_state) { case SDEV_BLOCK: case SDEV_TRANSPORT_OFFLINE: sdev->sdev_state = new_state; break; case SDEV_CREATED_BLOCK: if (new_state == SDEV_TRANSPORT_OFFLINE || new_state == SDEV_OFFLINE) sdev->sdev_state = new_state; else sdev->sdev_state = SDEV_CREATED; break; case SDEV_CANCEL: case SDEV_OFFLINE: break; default: return -EINVAL; } scsi_start_queue(sdev); return 0; } EXPORT_SYMBOL_GPL(scsi_internal_device_unblock_nowait); /** * scsi_internal_device_unblock - resume a device after a block request * @sdev: device to resume * @new_state: state to set the device to after unblocking * * Restart the device queue for a previously suspended SCSI device. May sleep. * * Returns zero if successful or a negative error code upon failure. * * Notes: * This routine transitions the device to the SDEV_RUNNING state or to one of * the offline states (which must be a legal transition) allowing the midlayer * to goose the queue for this device. */ static int scsi_internal_device_unblock(struct scsi_device *sdev, enum scsi_device_state new_state) { int ret; mutex_lock(&sdev->state_mutex); ret = scsi_internal_device_unblock_nowait(sdev, new_state); mutex_unlock(&sdev->state_mutex); return ret; } static int target_block(struct device *dev, void *data) { if (scsi_is_target_device(dev)) starget_for_each_device(to_scsi_target(dev), NULL, scsi_device_block); return 0; } /** * scsi_block_targets - transition all SCSI child devices to SDEV_BLOCK state * @dev: a parent device of one or more scsi_target devices * @shost: the Scsi_Host to which this device belongs * * Iterate over all children of @dev, which should be scsi_target devices, * and switch all subordinate scsi devices to SDEV_BLOCK state. Wait for * ongoing scsi_queue_rq() calls to finish. May sleep. * * Note: * @dev must not itself be a scsi_target device. */ void scsi_block_targets(struct Scsi_Host *shost, struct device *dev) { WARN_ON_ONCE(scsi_is_target_device(dev)); device_for_each_child(dev, NULL, target_block); blk_mq_wait_quiesce_done(&shost->tag_set); } EXPORT_SYMBOL_GPL(scsi_block_targets); static void device_unblock(struct scsi_device *sdev, void *data) { scsi_internal_device_unblock(sdev, *(enum scsi_device_state *)data); } static int target_unblock(struct device *dev, void *data) { if (scsi_is_target_device(dev)) starget_for_each_device(to_scsi_target(dev), data, device_unblock); return 0; } void scsi_target_unblock(struct device *dev, enum scsi_device_state new_state) { if (scsi_is_target_device(dev)) starget_for_each_device(to_scsi_target(dev), &new_state, device_unblock); else device_for_each_child(dev, &new_state, target_unblock); } EXPORT_SYMBOL_GPL(scsi_target_unblock); /** * scsi_host_block - Try to transition all logical units to the SDEV_BLOCK state * @shost: device to block * * Pause SCSI command processing for all logical units associated with the SCSI * host and wait until pending scsi_queue_rq() calls have finished. * * Returns zero if successful or a negative error code upon failure. */ int scsi_host_block(struct Scsi_Host *shost) { struct scsi_device *sdev; int ret; /* * Call scsi_internal_device_block_nowait so we can avoid * calling synchronize_rcu() for each LUN. */ shost_for_each_device(sdev, shost) { mutex_lock(&sdev->state_mutex); ret = scsi_internal_device_block_nowait(sdev); mutex_unlock(&sdev->state_mutex); if (ret) { scsi_device_put(sdev); return ret; } } /* Wait for ongoing scsi_queue_rq() calls to finish. */ blk_mq_wait_quiesce_done(&shost->tag_set); return 0; } EXPORT_SYMBOL_GPL(scsi_host_block); int scsi_host_unblock(struct Scsi_Host *shost, int new_state) { struct scsi_device *sdev; int ret = 0; shost_for_each_device(sdev, shost) { ret = scsi_internal_device_unblock(sdev, new_state); if (ret) { scsi_device_put(sdev); break; } } return ret; } EXPORT_SYMBOL_GPL(scsi_host_unblock); /** * scsi_kmap_atomic_sg - find and atomically map an sg-elemnt * @sgl: scatter-gather list * @sg_count: number of segments in sg * @offset: offset in bytes into sg, on return offset into the mapped area * @len: bytes to map, on return number of bytes mapped * * Returns virtual address of the start of the mapped page */ void *scsi_kmap_atomic_sg(struct scatterlist *sgl, int sg_count, size_t *offset, size_t *len) { int i; size_t sg_len = 0, len_complete = 0; struct scatterlist *sg; struct page *page; WARN_ON(!irqs_disabled()); for_each_sg(sgl, sg, sg_count, i) { len_complete = sg_len; /* Complete sg-entries */ sg_len += sg->length; if (sg_len > *offset) break; } if (unlikely(i == sg_count)) { printk(KERN_ERR "%s: Bytes in sg: %zu, requested offset %zu, " "elements %d\n", __func__, sg_len, *offset, sg_count); WARN_ON(1); return NULL; } /* Offset starting from the beginning of first page in this sg-entry */ *offset = *offset - len_complete + sg->offset; /* Assumption: contiguous pages can be accessed as "page + i" */ page = nth_page(sg_page(sg), (*offset >> PAGE_SHIFT)); *offset &= ~PAGE_MASK; /* Bytes in this sg-entry from *offset to the end of the page */ sg_len = PAGE_SIZE - *offset; if (*len > sg_len) *len = sg_len; return kmap_atomic(page); } EXPORT_SYMBOL(scsi_kmap_atomic_sg); /** * scsi_kunmap_atomic_sg - atomically unmap a virtual address, previously mapped with scsi_kmap_atomic_sg * @virt: virtual address to be unmapped */ void scsi_kunmap_atomic_sg(void *virt) { kunmap_atomic(virt); } EXPORT_SYMBOL(scsi_kunmap_atomic_sg); void sdev_disable_disk_events(struct scsi_device *sdev) { atomic_inc(&sdev->disk_events_disable_depth); } EXPORT_SYMBOL(sdev_disable_disk_events); void sdev_enable_disk_events(struct scsi_device *sdev) { if (WARN_ON_ONCE(atomic_read(&sdev->disk_events_disable_depth) <= 0)) return; atomic_dec(&sdev->disk_events_disable_depth); } EXPORT_SYMBOL(sdev_enable_disk_events); static unsigned char designator_prio(const unsigned char *d) { if (d[1] & 0x30) /* not associated with LUN */ return 0; if (d[3] == 0) /* invalid length */ return 0; /* * Order of preference for lun descriptor: * - SCSI name string * - NAA IEEE Registered Extended * - EUI-64 based 16-byte * - EUI-64 based 12-byte * - NAA IEEE Registered * - NAA IEEE Extended * - EUI-64 based 8-byte * - SCSI name string (truncated) * - T10 Vendor ID * as longer descriptors reduce the likelyhood * of identification clashes. */ switch (d[1] & 0xf) { case 8: /* SCSI name string, variable-length UTF-8 */ return 9; case 3: switch (d[4] >> 4) { case 6: /* NAA registered extended */ return 8; case 5: /* NAA registered */ return 5; case 4: /* NAA extended */ return 4; case 3: /* NAA locally assigned */ return 1; default: break; } break; case 2: switch (d[3]) { case 16: /* EUI64-based, 16 byte */ return 7; case 12: /* EUI64-based, 12 byte */ return 6; case 8: /* EUI64-based, 8 byte */ return 3; default: break; } break; case 1: /* T10 vendor ID */ return 1; default: break; } return 0; } /** * scsi_vpd_lun_id - return a unique device identification * @sdev: SCSI device * @id: buffer for the identification * @id_len: length of the buffer * * Copies a unique device identification into @id based * on the information in the VPD page 0x83 of the device. * The string will be formatted as a SCSI name string. * * Returns the length of the identification or error on failure. * If the identifier is longer than the supplied buffer the actual * identifier length is returned and the buffer is not zero-padded. */ int scsi_vpd_lun_id(struct scsi_device *sdev, char *id, size_t id_len) { u8 cur_id_prio = 0; u8 cur_id_size = 0; const unsigned char *d, *cur_id_str; const struct scsi_vpd *vpd_pg83; int id_size = -EINVAL; rcu_read_lock(); vpd_pg83 = rcu_dereference(sdev->vpd_pg83); if (!vpd_pg83) { rcu_read_unlock(); return -ENXIO; } /* The id string must be at least 20 bytes + terminating NULL byte */ if (id_len < 21) { rcu_read_unlock(); return -EINVAL; } memset(id, 0, id_len); for (d = vpd_pg83->data + 4; d < vpd_pg83->data + vpd_pg83->len; d += d[3] + 4) { u8 prio = designator_prio(d); if (prio == 0 || cur_id_prio > prio) continue; switch (d[1] & 0xf) { case 0x1: /* T10 Vendor ID */ if (cur_id_size > d[3]) break; cur_id_prio = prio; cur_id_size = d[3]; if (cur_id_size + 4 > id_len) cur_id_size = id_len - 4; cur_id_str = d + 4; id_size = snprintf(id, id_len, "t10.%*pE", cur_id_size, cur_id_str); break; case 0x2: /* EUI-64 */ cur_id_prio = prio; cur_id_size = d[3]; cur_id_str = d + 4; switch (cur_id_size) { case 8: id_size = snprintf(id, id_len, "eui.%8phN", cur_id_str); break; case 12: id_size = snprintf(id, id_len, "eui.%12phN", cur_id_str); break; case 16: id_size = snprintf(id, id_len, "eui.%16phN", cur_id_str); break; default: break; } break; case 0x3: /* NAA */ cur_id_prio = prio; cur_id_size = d[3]; cur_id_str = d + 4; switch (cur_id_size) { case 8: id_size = snprintf(id, id_len, "naa.%8phN", cur_id_str); break; case 16: id_size = snprintf(id, id_len, "naa.%16phN", cur_id_str); break; default: break; } break; case 0x8: /* SCSI name string */ if (cur_id_size > d[3]) break; /* Prefer others for truncated descriptor */ if (d[3] > id_len) { prio = 2; if (cur_id_prio > prio) break; } cur_id_prio = prio; cur_id_size = id_size = d[3]; cur_id_str = d + 4; if (cur_id_size >= id_len) cur_id_size = id_len - 1; memcpy(id, cur_id_str, cur_id_size); break; default: break; } } rcu_read_unlock(); return id_size; } EXPORT_SYMBOL(scsi_vpd_lun_id); /* * scsi_vpd_tpg_id - return a target port group identifier * @sdev: SCSI device * * Returns the Target Port Group identifier from the information * froom VPD page 0x83 of the device. * * Returns the identifier or error on failure. */ int scsi_vpd_tpg_id(struct scsi_device *sdev, int *rel_id) { const unsigned char *d; const struct scsi_vpd *vpd_pg83; int group_id = -EAGAIN, rel_port = -1; rcu_read_lock(); vpd_pg83 = rcu_dereference(sdev->vpd_pg83); if (!vpd_pg83) { rcu_read_unlock(); return -ENXIO; } d = vpd_pg83->data + 4; while (d < vpd_pg83->data + vpd_pg83->len) { switch (d[1] & 0xf) { case 0x4: /* Relative target port */ rel_port = get_unaligned_be16(&d[6]); break; case 0x5: /* Target port group */ group_id = get_unaligned_be16(&d[6]); break; default: break; } d += d[3] + 4; } rcu_read_unlock(); if (group_id >= 0 && rel_id && rel_port != -1) *rel_id = rel_port; return group_id; } EXPORT_SYMBOL(scsi_vpd_tpg_id); /** * scsi_build_sense - build sense data for a command * @scmd: scsi command for which the sense should be formatted * @desc: Sense format (non-zero == descriptor format, * 0 == fixed format) * @key: Sense key * @asc: Additional sense code * @ascq: Additional sense code qualifier * **/ void scsi_build_sense(struct scsi_cmnd *scmd, int desc, u8 key, u8 asc, u8 ascq) { scsi_build_sense_buffer(desc, scmd->sense_buffer, key, asc, ascq); scmd->result = SAM_STAT_CHECK_CONDITION; } EXPORT_SYMBOL_GPL(scsi_build_sense); |
144 144 144 142 142 144 144 144 144 144 142 4 4 4 4 142 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 * * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar * (C) Copyright 2003 Red Hat Inc, All Rights Reserved * * Removed page pinning, fix privately mapped COW pages and other cleanups * (C) Copyright 2003, 2004 Jamie Lokier * * Robust futex support started by Ingo Molnar * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * * PI-futex support started by Ingo Molnar and Thomas Gleixner * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> * Copyright (C) IBM Corporation, 2009 * Thanks to Thomas Gleixner for conceptual design and careful reviews. * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. * * "The futexes are also cursed." * "But they come in a choice of three flavours!" */ #include <linux/compat.h> #include <linux/jhash.h> #include <linux/pagemap.h> #include <linux/plist.h> #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> #include "futex.h" #include "../locking/rtmutex_common.h" /* * The base of the bucket array and its size are always used together * (after initialization only in futex_hash()), so ensure that they * reside in the same cacheline. */ static struct { struct futex_hash_bucket *queues; unsigned long hashsize; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_queues (__futex_data.queues) #define futex_hashsize (__futex_data.hashsize) /* * Fault injections for futexes. */ #ifdef CONFIG_FAIL_FUTEX static struct { struct fault_attr attr; bool ignore_private; } fail_futex = { .attr = FAULT_ATTR_INITIALIZER, .ignore_private = false, }; static int __init setup_fail_futex(char *str) { return setup_fault_attr(&fail_futex.attr, str); } __setup("fail_futex=", setup_fail_futex); bool should_fail_futex(bool fshared) { if (fail_futex.ignore_private && !fshared) return false; return should_fail(&fail_futex.attr, 1); } #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_futex_debugfs(void) { umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; dir = fault_create_debugfs_attr("fail_futex", NULL, &fail_futex.attr); if (IS_ERR(dir)) return PTR_ERR(dir); debugfs_create_bool("ignore-private", mode, dir, &fail_futex.ignore_private); return 0; } late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ #endif /* CONFIG_FAIL_FUTEX */ /** * futex_hash - Return the hash bucket in the global hash * @key: Pointer to the futex key for which the hash is calculated * * We hash on the keys returned from get_futex_key (see below) and return the * corresponding hash bucket in the global hash. */ struct futex_hash_bucket *futex_hash(union futex_key *key) { u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); return &futex_queues[hash & (futex_hashsize - 1)]; } /** * futex_setup_timer - set up the sleeping hrtimer. * @time: ptr to the given timeout value * @timeout: the hrtimer_sleeper structure to be set up * @flags: futex flags * @range_ns: optional range in ns * * Return: Initialized hrtimer_sleeper structure or NULL if no timeout * value given */ struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns) { if (!time) return NULL; hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is * effectively the same as calling hrtimer_set_expires(). */ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); return timeout; } /* * Generate a machine wide unique identifier for this inode. * * This relies on u64 not wrapping in the life-time of the machine; which with * 1ns resolution means almost 585 years. * * This further relies on the fact that a well formed program will not unmap * the file while it has a (shared) futex waiting on it. This mapping will have * a file reference which pins the mount and inode. * * If for some reason an inode gets evicted and read back in again, it will get * a new sequence number and will _NOT_ match, even though it is the exact same * file. * * It is important that futex_match() will never have a false-positive, esp. * for PI futexes that can mess up the state. The above argues that false-negatives * are only possible for malformed programs. */ static u64 get_inode_sequence_number(struct inode *inode) { static atomic64_t i_seq; u64 old; /* Does the inode already have a sequence number? */ old = atomic64_read(&inode->i_sequence); if (likely(old)) return old; for (;;) { u64 new = atomic64_add_return(1, &i_seq); if (WARN_ON_ONCE(!new)) continue; old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); if (old) return old; return new; } } /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @flags: FLAGS_* * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) * * Return: a negative error code or 0 * * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: * * ( inode->i_sequence, page->index, offset_within_page ) * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct page *page; struct folio *folio; struct address_space *mapping; int err, ro = 0; bool fshared; fshared = flags & FLAGS_SHARED; /* * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; if (unlikely((address % sizeof(u32)) != 0)) return -EINVAL; address -= key->both.offset; if (unlikely(!access_ok(uaddr, sizeof(u32)))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs * virtual address, we dont even have to find the underlying vma. * Note : We do have to check 'uaddr' is a valid user address, * but access_ok() should be faster than find_vma() */ if (!fshared) { /* * On no-MMU, shared futexes are treated as private, therefore * we must not include the current process in the key. Since * there is only one address space, the address is a unique key * on its own. */ if (IS_ENABLED(CONFIG_MMU)) key->private.mm = mm; else key->private.mm = NULL; key->private.address = address; return 0; } again: /* Ignore any VERIFY_READ mapping (futex common case) */ if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. */ if (err == -EFAULT && rw == FUTEX_READ) { err = get_user_pages_fast(address, 1, 0, &page); ro = 1; } if (err < 0) return err; else err = 0; /* * The treatment of mapping from this point on is critical. The folio * lock protects many things but in this context the folio lock * stabilizes mapping, prevents inode freeing in the shared * file-backed region case and guards against movement to swap cache. * * Strictly speaking the folio lock is not needed in all cases being * considered here and folio lock forces unnecessarily serialization. * From this point on, mapping will be re-verified if necessary and * folio lock will be acquired only if it is unavoidable * * Mapping checks require the folio so it is looked up now. For * anonymous pages, it does not matter if the folio is split * in the future as the key is based on the address. For * filesystem-backed pages, the precise page is required as the * index of the page determines the key. */ folio = page_folio(page); mapping = READ_ONCE(folio->mapping); /* * If folio->mapping is NULL, then it cannot be an anonymous * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast * found it, but truncated or holepunched or subjected to * invalidate_complete_page2 before we got the folio lock (also * cases which we are happy to fail). And we hold a reference, * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for folio->mapping. */ if (unlikely(!mapping)) { int shmem_swizzled; /* * Folio lock is required to identify which special case above * applies. If this is really a shmem page then the folio lock * will prevent unexpected transitions. */ folio_lock(folio); shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; folio_unlock(folio); folio_put(folio); if (shmem_swizzled) goto again; return -EFAULT; } /* * Private mappings are handled in a simple way. * * If the futex key is stored in anonymous memory, then the associated * object is the mm which is implicitly pinned by the calling process. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ if (folio_test_anon(folio)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { struct inode *inode; /* * The associated futex object in this case is the inode and * the folio->mapping must be traversed. Ordinarily this should * be stabilised under folio lock but it's not strictly * necessary in this case as we just want to pin the inode, not * update i_pages or anything like that. * * The RCU read lock is taken as the inode is finally freed * under RCU. If the mapping still matches expectations then the * mapping->host can be safely accessed as being a valid inode. */ rcu_read_lock(); if (READ_ONCE(folio->mapping) != mapping) { rcu_read_unlock(); folio_put(folio); goto again; } inode = READ_ONCE(mapping->host); if (!inode) { rcu_read_unlock(); folio_put(folio); goto again; } key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = folio->index + folio_page_idx(folio, page); rcu_read_unlock(); } out: folio_put(folio); return err; } /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address * * Slow path to fixup the fault we just took in the atomic write * access to @uaddr. * * We have no generic implementation of a non-destructive write to the * user address. We know that we faulted in the atomic pagefault * disabled section so we can as well avoid the #PF overhead by * calling get_user_pages() right away. */ int fault_in_user_writeable(u32 __user *uaddr) { struct mm_struct *mm = current->mm; int ret; mmap_read_lock(mm); ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(mm); return ret < 0 ? ret : 0; } /** * futex_top_waiter() - Return the highest priority waiter on a futex * @hb: the hash bucket the futex_q's reside in * @key: the futex key (to distinguish it from other futex futex_q's) * * Must be called with the hb lock held. */ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) { struct futex_q *this; plist_for_each_entry(this, &hb->chain, list) { if (futex_match(&this->key, key)) return this; } return NULL; } int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) { int ret; pagefault_disable(); ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); return ret; } int futex_get_value_locked(u32 *dest, u32 __user *from) { int ret; pagefault_disable(); ret = __get_user(*dest, from); pagefault_enable(); return ret ? -EFAULT : 0; } /** * wait_for_owner_exiting - Block until the owner has exited * @ret: owner's current futex lock status * @exiting: Pointer to the exiting task * * Caller must hold a refcount on @exiting. */ void wait_for_owner_exiting(int ret, struct task_struct *exiting) { if (ret != -EBUSY) { WARN_ON_ONCE(exiting); return; } if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) return; mutex_lock(&exiting->futex_exit_mutex); /* * No point in doing state checking here. If the waiter got here * while the task was in exec()->exec_futex_release() then it can * have any FUTEX_STATE_* value when the waiter has acquired the * mutex. OK, if running, EXITING or DEAD if it reached exit() * already. Highly unlikely and not a problem. Just one more round * through the futex maze. */ mutex_unlock(&exiting->futex_exit_mutex); put_task_struct(exiting); } /** * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be NULL and must be held by the caller. */ void __futex_unqueue(struct futex_q *q) { struct futex_hash_bucket *hb; if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) return; lockdep_assert_held(q->lock_ptr); hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); futex_hb_waiters_dec(hb); } /* The key must be already stored in q->key. */ struct futex_hash_bucket *futex_q_lock(struct futex_q *q) __acquires(&hb->lock) { struct futex_hash_bucket *hb; hb = futex_hash(&q->key); /* * Increment the counter before taking the lock so that * a potential waker won't miss a to-be-slept task that is * waiting for the spinlock. This is safe as all futex_q_lock() * users end up calling futex_queue(). Similarly, for housekeeping, * decrement the counter at futex_q_unlock() when some error has * occurred and we don't end up adding the task to the list. */ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ q->lock_ptr = &hb->lock; spin_lock(&hb->lock); return hb; } void futex_q_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { spin_unlock(&hb->lock); futex_hb_waiters_dec(hb); } void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; /* * The priority used to register this element is * - either the real thread-priority for the real-time threads * (i.e. threads with a priority lower than MAX_RT_PRIO) * - or MAX_RT_PRIO for non-RT threads. * Thus, all RT-threads are woken first in priority order, and * the others are woken last, in FIFO order. */ prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = current; } /** * futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must * be paired with exactly one earlier call to futex_queue(). * * Return: * - 1 - if the futex_q was still queued (and we removed unqueued it); * - 0 - if the futex_q was already removed by the waking thread */ int futex_unqueue(struct futex_q *q) { spinlock_t *lock_ptr; int ret = 0; /* In the common case we don't take the spinlock, which is nice. */ retry: /* * q->lock_ptr can change between this read and the following spin_lock. * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and * optimizing lock_ptr out of the logic below. */ lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* * q->lock_ptr can change between reading it and * spin_lock(), causing us to take the wrong lock. This * corrects the race condition. * * Reasoning goes like this: if we have the wrong lock, * q->lock_ptr must have changed (maybe several times) * between reading it and the spin_lock(). It can * change again after the spin_lock() but only if it was * already changed before the spin_lock(). It cannot, * however, change back to the original value. Therefore * we can detect whether we acquired the correct lock. */ if (unlikely(lock_ptr != q->lock_ptr)) { spin_unlock(lock_ptr); goto retry; } __futex_unqueue(q); BUG_ON(q->pi_state); spin_unlock(lock_ptr); ret = 1; } return ret; } /* * PI futexes can not be requeued and must remove themselves from the * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. */ void futex_unqueue_pi(struct futex_q *q) { __futex_unqueue(q); BUG_ON(!q->pi_state); put_pi_state(q->pi_state); q->pi_state = NULL; } /* Constants for the pending_op argument of handle_futex_death */ #define HANDLE_DEATH_PENDING true #define HANDLE_DEATH_LIST false /* * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { u32 uval, nval, mval; pid_t owner; int err; /* Futex address must be 32bit aligned */ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) return -1; retry: if (get_user(uval, uaddr)) return -1; /* * Special case for regular (non PI) futexes. The unlock path in * user space has two race scenarios: * * 1. The unlock path releases the user space futex value and * before it can execute the futex() syscall to wake up * waiters it is killed. * * 2. A woken up waiter is killed before it can acquire the * futex in user space. * * In the second case, the wake up notification could be generated * by the unlock path in user space after setting the futex value * to zero or by the kernel after setting the OWNER_DIED bit below. * * In both cases the TID validation below prevents a wakeup of * potential waiters which can cause these waiters to block * forever. * * In both cases the following conditions are met: * * 1) task->robust_list->list_op_pending != NULL * @pending_op == true * 2) The owner part of user space futex value == 0 * 3) Regular futex: @pi == false * * If these conditions are met, it is safe to attempt waking up a * potential waiter without touching the user space futex value and * trying to set the OWNER_DIED bit. If the futex value is zero, * the rest of the user space mutex state is consistent, so a woken * waiter will just take over the uncontended futex. Setting the * OWNER_DIED bit would create inconsistent state and malfunction * of the user space owner died handling. Otherwise, the OWNER_DIED * bit is already set, and the woken waiter is expected to deal with * this. */ owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); return 0; } if (owner != task_pid_vnr(curr)) return 0; /* * Ok, this dying thread is truly holding a futex * of interest. Set the OWNER_DIED bit atomically * via cmpxchg, and if the value had FUTEX_WAITERS * set, wake up a waiter (if any). (We have to do a * futex_wake() even if OWNER_DIED is already set - * to handle the rare but possible case of recursive * thread-death.) The rest of the cleanup is done in * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; /* * We are not holding a lock here, but we want to have * the pagefault_disable/enable() protection because * we want to handle the fault gracefully. If the * access fails we try to fault in the futex with R/W * verification via get_user_pages. get_user() above * does not guarantee R/W access. If that fails we * give up and leave the futex locked. */ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { switch (err) { case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; case -EAGAIN: cond_resched(); goto retry; default: WARN_ON_ONCE(1); return err; } } if (nval != uval) goto retry; /* * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, unsigned int *pi) { unsigned long uentry; if (get_user(uentry, (unsigned long __user *)head)) return -EFAULT; *entry = (void __user *)(uentry & ~1UL); *pi = uentry & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. */ static void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; unsigned long futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); /* * A pending lock might already be on the list, so * don't process it twice: */ if (entry != pending) { if (handle_futex_death((void __user *)entry + futex_offset, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { handle_futex_death((void __user *)pending + futex_offset, curr, pip, HANDLE_DEATH_PENDING); } } #ifdef CONFIG_COMPAT static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); void __user *uaddr = compat_ptr(base + futex_offset); return uaddr; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, compat_uptr_t __user *head, unsigned int *pi) { if (get_user(*uentry, head)) return -EFAULT; *entry = compat_ptr((*uentry) & ~1); *pi = (unsigned int)(*uentry) & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. */ static void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (compat_fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != (struct robust_list __user *) &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = compat_fetch_robust_entry(&next_uentry, &next_entry, (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: */ if (entry != pending) { void __user *uaddr = futex_uaddr(entry, futex_offset); if (handle_futex_death(uaddr, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; uentry = next_uentry; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { void __user *uaddr = futex_uaddr(pending, futex_offset); handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); } } #endif #ifdef CONFIG_FUTEX_PI /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. * (Robust-futex cleanup is separate and might save the day for userspace.) */ static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; struct futex_hash_bucket *hb; union futex_key key = FUTEX_KEY_INIT; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: */ raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; hb = futex_hash(&key); /* * We can race against put_pi_state() removing itself from the * list (a waiter going away). put_pi_state() will first * decrement the reference count and then modify the list, so * its possible to see the list entry but fail this reference * acquire. * * In that case; drop the locks to let put_pi_state() make * progress and retry the loop. */ if (!refcount_inc_not_zero(&pi_state->refcount)) { raw_spin_unlock_irq(&curr->pi_lock); cpu_relax(); raw_spin_lock_irq(&curr->pi_lock); continue; } raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); put_pi_state(pi_state); continue; } WARN_ON(pi_state->owner != curr); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; raw_spin_unlock(&curr->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); } #else static inline void exit_pi_state_list(struct task_struct *curr) { } #endif static void futex_cleanup(struct task_struct *tsk) { if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); tsk->robust_list = NULL; } #ifdef CONFIG_COMPAT if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); tsk->compat_robust_list = NULL; } #endif if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); } /** * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD * @tsk: task to set the state on * * Set the futex exit state of the task lockless. The futex waiter code * observes that state when a task is exiting and loops until the task has * actually finished the futex cleanup. The worst case for this is that the * waiter runs through the wait loop until the state becomes visible. * * This is called from the recursive fault handling path in make_task_dead(). * * This is best effort. Either the futex exit code has run already or * not. If the OWNER_DIED bit has been set on the futex then the waiter can * take it over. If not, the problem is pushed back to user space. If the * futex exit code did not run yet, then an already queued waiter might * block forever, but there is nothing which can be done about that. */ void futex_exit_recursive(struct task_struct *tsk) { /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ if (tsk->futex_state == FUTEX_STATE_EXITING) mutex_unlock(&tsk->futex_exit_mutex); tsk->futex_state = FUTEX_STATE_DEAD; } static void futex_cleanup_begin(struct task_struct *tsk) { /* * Prevent various race issues against a concurrent incoming waiter * including live locks by forcing the waiter to block on * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in * attach_to_pi_owner(). */ mutex_lock(&tsk->futex_exit_mutex); /* * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. * * This ensures that all subsequent checks of tsk->futex_state in * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with * tsk->pi_lock held. * * It guarantees also that a pi_state which was queued right before * the state change under tsk->pi_lock by a concurrent waiter must * be observed in exit_pi_state_list(). */ raw_spin_lock_irq(&tsk->pi_lock); tsk->futex_state = FUTEX_STATE_EXITING; raw_spin_unlock_irq(&tsk->pi_lock); } static void futex_cleanup_end(struct task_struct *tsk, int state) { /* * Lockless store. The only side effect is that an observer might * take another loop until it becomes visible. */ tsk->futex_state = state; /* * Drop the exit protection. This unblocks waiters which observed * FUTEX_STATE_EXITING to reevaluate the state. */ mutex_unlock(&tsk->futex_exit_mutex); } void futex_exec_release(struct task_struct *tsk) { /* * The state handling is done for consistency, but in the case of * exec() there is no way to prevent further damage as the PID stays * the same. But for the unlikely and arguably buggy case that a * futex is held on exec(), this provides at least as much state * consistency protection which is possible. */ futex_cleanup_begin(tsk); futex_cleanup(tsk); /* * Reset the state to FUTEX_STATE_OK. The task is alive and about * exec a new binary. */ futex_cleanup_end(tsk, FUTEX_STATE_OK); } void futex_exit_release(struct task_struct *tsk) { futex_cleanup_begin(tsk); futex_cleanup(tsk); futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } static int __init futex_init(void) { unsigned int futex_shift; unsigned long i; #if CONFIG_BASE_SMALL futex_hashsize = 16; #else futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); #endif futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), futex_hashsize, 0, 0, &futex_shift, NULL, futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; for (i = 0; i < futex_hashsize; i++) { atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } return 0; } core_initcall(futex_init); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 | // SPDX-License-Identifier: GPL-2.0 /* * Creating audit events from TTY input. * * Copyright (C) 2007 Red Hat, Inc. All rights reserved. * * Authors: Miloslav Trmac <mitr@redhat.com> */ #include <linux/audit.h> #include <linux/slab.h> #include <linux/tty.h> #include "tty.h" struct tty_audit_buf { struct mutex mutex; /* Protects all data below */ dev_t dev; /* The TTY which the data is from */ bool icanon; size_t valid; u8 *data; /* Allocated size N_TTY_BUF_SIZE */ }; static struct tty_audit_buf *tty_audit_buf_ref(void) { struct tty_audit_buf *buf; buf = current->signal->tty_audit_buf; WARN_ON(buf == ERR_PTR(-ESRCH)); return buf; } static struct tty_audit_buf *tty_audit_buf_alloc(void) { struct tty_audit_buf *buf; buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) goto err; buf->data = kmalloc(N_TTY_BUF_SIZE, GFP_KERNEL); if (!buf->data) goto err_buf; mutex_init(&buf->mutex); return buf; err_buf: kfree(buf); err: return NULL; } static void tty_audit_buf_free(struct tty_audit_buf *buf) { WARN_ON(buf->valid != 0); kfree(buf->data); kfree(buf); } static void tty_audit_log(const char *description, dev_t dev, const u8 *data, size_t size) { struct audit_buffer *ab; pid_t pid = task_pid_nr(current); uid_t uid = from_kuid(&init_user_ns, task_uid(current)); uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); unsigned int sessionid = audit_get_sessionid(current); char name[TASK_COMM_LEN]; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_TTY); if (!ab) return; audit_log_format(ab, "%s pid=%u uid=%u auid=%u ses=%u major=%d minor=%d comm=", description, pid, uid, loginuid, sessionid, MAJOR(dev), MINOR(dev)); get_task_comm(name, current); audit_log_untrustedstring(ab, name); audit_log_format(ab, " data="); audit_log_n_hex(ab, data, size); audit_log_end(ab); } /* * tty_audit_buf_push - Push buffered data out * * Generate an audit message from the contents of @buf, which is owned by * the current task. @buf->mutex must be locked. */ static void tty_audit_buf_push(struct tty_audit_buf *buf) { if (buf->valid == 0) return; if (audit_enabled == AUDIT_OFF) { buf->valid = 0; return; } tty_audit_log("tty", buf->dev, buf->data, buf->valid); buf->valid = 0; } /** * tty_audit_exit - Handle a task exit * * Make sure all buffered data is written out and deallocate the buffer. * Only needs to be called if current->signal->tty_audit_buf != %NULL. * * The process is single-threaded at this point; no other threads share * current->signal. */ void tty_audit_exit(void) { struct tty_audit_buf *buf; buf = xchg(¤t->signal->tty_audit_buf, ERR_PTR(-ESRCH)); if (!buf) return; tty_audit_buf_push(buf); tty_audit_buf_free(buf); } /* * tty_audit_fork - Copy TTY audit state for a new task * * Set up TTY audit state in @sig from current. @sig needs no locking. */ void tty_audit_fork(struct signal_struct *sig) { sig->audit_tty = current->signal->audit_tty; } /* * tty_audit_tiocsti - Log TIOCSTI */ void tty_audit_tiocsti(const struct tty_struct *tty, u8 ch) { dev_t dev; dev = MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; if (tty_audit_push()) return; if (audit_enabled) tty_audit_log("ioctl=TIOCSTI", dev, &ch, 1); } /* * tty_audit_push - Flush current's pending audit data * * Returns 0 if success, -EPERM if tty audit is disabled */ int tty_audit_push(void) { struct tty_audit_buf *buf; if (~current->signal->audit_tty & AUDIT_TTY_ENABLE) return -EPERM; buf = tty_audit_buf_ref(); if (!IS_ERR_OR_NULL(buf)) { mutex_lock(&buf->mutex); tty_audit_buf_push(buf); mutex_unlock(&buf->mutex); } return 0; } /* * tty_audit_buf_get - Get an audit buffer. * * Get an audit buffer, allocate it if necessary. Return %NULL * if out of memory or ERR_PTR(-ESRCH) if tty_audit_exit() has already * occurred. Otherwise, return a new reference to the buffer. */ static struct tty_audit_buf *tty_audit_buf_get(void) { struct tty_audit_buf *buf; buf = tty_audit_buf_ref(); if (buf) return buf; buf = tty_audit_buf_alloc(); if (buf == NULL) { audit_log_lost("out of memory in TTY auditing"); return NULL; } /* Race to use this buffer, free it if another wins */ if (cmpxchg(¤t->signal->tty_audit_buf, NULL, buf) != NULL) tty_audit_buf_free(buf); return tty_audit_buf_ref(); } /* * tty_audit_add_data - Add data for TTY auditing. * * Audit @data of @size from @tty, if necessary. */ void tty_audit_add_data(const struct tty_struct *tty, const void *data, size_t size) { struct tty_audit_buf *buf; unsigned int audit_tty; bool icanon = L_ICANON(tty); dev_t dev; audit_tty = READ_ONCE(current->signal->audit_tty); if (~audit_tty & AUDIT_TTY_ENABLE) return; if (unlikely(size == 0)) return; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) return; if ((~audit_tty & AUDIT_TTY_LOG_PASSWD) && icanon && !L_ECHO(tty)) return; buf = tty_audit_buf_get(); if (IS_ERR_OR_NULL(buf)) return; mutex_lock(&buf->mutex); dev = MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; if (buf->dev != dev || buf->icanon != icanon) { tty_audit_buf_push(buf); buf->dev = dev; buf->icanon = icanon; } do { size_t run; run = N_TTY_BUF_SIZE - buf->valid; if (run > size) run = size; memcpy(buf->data + buf->valid, data, run); buf->valid += run; data += run; size -= run; if (buf->valid == N_TTY_BUF_SIZE) tty_audit_buf_push(buf); } while (size != 0); mutex_unlock(&buf->mutex); } |
45 10 45 45 45 45 24 7 24 24 24 91 91 91 124 113 109 111 110 109 128 128 112 109 88 24 23 57 9 15 20 2 44 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) 2007 Alan Stern * Copyright (C) 2009 IBM Corporation * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com> * * Authors: Alan Stern <stern@rowland.harvard.edu> * K.Prasad <prasad@linux.vnet.ibm.com> * Frederic Weisbecker <fweisbec@gmail.com> */ /* * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, * using the CPU's debug registers. */ #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/irqflags.h> #include <linux/notifier.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/percpu.h> #include <linux/kdebug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/smp.h> #include <asm/hw_breakpoint.h> #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/user.h> #include <asm/desc.h> #include <asm/tlbflush.h> /* Per cpu debug control register value */ DEFINE_PER_CPU(unsigned long, cpu_dr7); EXPORT_PER_CPU_SYMBOL(cpu_dr7); /* Per cpu debug address registers values */ static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); /* * Stores the breakpoints currently in use on each breakpoint address * register for each cpus */ static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); static inline unsigned long __encode_dr7(int drnum, unsigned int len, unsigned int type) { unsigned long bp_info; bp_info = (len | type) & 0xf; bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); return bp_info; } /* * Encode the length, type, Exact, and Enable bits for a particular breakpoint * as stored in debug register 7. */ unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) { return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN; } /* * Decode the length and type bits for a particular breakpoint as * stored in debug register 7. Return the "enabled" status. */ int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) { int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); *len = (bp_info & 0xc) | 0x40; *type = (bp_info & 0x3) | 0x80; return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; } /* * Install a perf counter breakpoint. * * We seek a free debug address register and use it for this * breakpoint. Eventually we enable it in the debug control register. * * Atomic: we hold the counter->ctx->lock and we only handle variables * and registers local to this cpu. */ int arch_install_hw_breakpoint(struct perf_event *bp) { struct arch_hw_breakpoint *info = counter_arch_bp(bp); unsigned long *dr7; int i; lockdep_assert_irqs_disabled(); for (i = 0; i < HBP_NUM; i++) { struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); if (!*slot) { *slot = bp; break; } } if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) return -EBUSY; set_debugreg(info->address, i); __this_cpu_write(cpu_debugreg[i], info->address); dr7 = this_cpu_ptr(&cpu_dr7); *dr7 |= encode_dr7(i, info->len, info->type); /* * Ensure we first write cpu_dr7 before we set the DR7 register. * This ensures an NMI never see cpu_dr7 0 when DR7 is not. */ barrier(); set_debugreg(*dr7, 7); if (info->mask) amd_set_dr_addr_mask(info->mask, i); return 0; } /* * Uninstall the breakpoint contained in the given counter. * * First we search the debug address register it uses and then we disable * it. * * Atomic: we hold the counter->ctx->lock and we only handle variables * and registers local to this cpu. */ void arch_uninstall_hw_breakpoint(struct perf_event *bp) { struct arch_hw_breakpoint *info = counter_arch_bp(bp); unsigned long dr7; int i; lockdep_assert_irqs_disabled(); for (i = 0; i < HBP_NUM; i++) { struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); if (*slot == bp) { *slot = NULL; break; } } if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) return; dr7 = this_cpu_read(cpu_dr7); dr7 &= ~__encode_dr7(i, info->len, info->type); set_debugreg(dr7, 7); if (info->mask) amd_set_dr_addr_mask(0, i); /* * Ensure the write to cpu_dr7 is after we've set the DR7 register. * This ensures an NMI never see cpu_dr7 0 when DR7 is not. */ barrier(); this_cpu_write(cpu_dr7, dr7); } static int arch_bp_generic_len(int x86_len) { switch (x86_len) { case X86_BREAKPOINT_LEN_1: return HW_BREAKPOINT_LEN_1; case X86_BREAKPOINT_LEN_2: return HW_BREAKPOINT_LEN_2; case X86_BREAKPOINT_LEN_4: return HW_BREAKPOINT_LEN_4; #ifdef CONFIG_X86_64 case X86_BREAKPOINT_LEN_8: return HW_BREAKPOINT_LEN_8; #endif default: return -EINVAL; } } int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { int len; /* Type */ switch (x86_type) { case X86_BREAKPOINT_EXECUTE: if (x86_len != X86_BREAKPOINT_LEN_X) return -EINVAL; *gen_type = HW_BREAKPOINT_X; *gen_len = sizeof(long); return 0; case X86_BREAKPOINT_WRITE: *gen_type = HW_BREAKPOINT_W; break; case X86_BREAKPOINT_RW: *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; break; default: return -EINVAL; } /* Len */ len = arch_bp_generic_len(x86_len); if (len < 0) return -EINVAL; *gen_len = len; return 0; } /* * Check for virtual address in kernel space. */ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) { unsigned long va; int len; va = hw->address; len = arch_bp_generic_len(hw->len); WARN_ON_ONCE(len < 0); /* * We don't need to worry about va + len - 1 overflowing: * we already require that va is aligned to a multiple of len. */ return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); } /* * Checks whether the range [addr, end], overlaps the area [base, base + size). */ static inline bool within_area(unsigned long addr, unsigned long end, unsigned long base, unsigned long size) { return end >= base && addr < (base + size); } /* * Checks whether the range from addr to end, inclusive, overlaps the fixed * mapped CPU entry area range or other ranges used for CPU entry. */ static inline bool within_cpu_entry(unsigned long addr, unsigned long end) { int cpu; /* CPU entry erea is always used for CPU entry */ if (within_area(addr, end, CPU_ENTRY_AREA_BASE, CPU_ENTRY_AREA_MAP_SIZE)) return true; /* * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU * GSBASE value via __per_cpu_offset or pcpu_unit_offsets. */ #ifdef CONFIG_SMP if (within_area(addr, end, (unsigned long)__per_cpu_offset, sizeof(unsigned long) * nr_cpu_ids)) return true; #else if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets, sizeof(pcpu_unit_offsets))) return true; #endif for_each_possible_cpu(cpu) { /* The original rw GDT is being used after load_direct_gdt() */ if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu), GDT_SIZE)) return true; /* * cpu_tss_rw is not directly referenced by hardware, but * cpu_tss_rw is also used in CPU entry code, */ if (within_area(addr, end, (unsigned long)&per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct))) return true; /* * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry. * If a data breakpoint on it, it will cause an unwanted #DB. * Protect the full cpu_tlbstate structure to be sure. */ if (within_area(addr, end, (unsigned long)&per_cpu(cpu_tlbstate, cpu), sizeof(struct tlb_state))) return true; /* * When in guest (X86_FEATURE_HYPERVISOR), local_db_save() * will read per-cpu cpu_dr7 before clear dr7 register. */ if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu), sizeof(cpu_dr7))) return true; } return false; } static int arch_build_bp_info(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { unsigned long bp_end; bp_end = attr->bp_addr + attr->bp_len - 1; if (bp_end < attr->bp_addr) return -EINVAL; /* * Prevent any breakpoint of any type that overlaps the CPU * entry area and data. This protects the IST stacks and also * reduces the chance that we ever find out what happens if * there's a data breakpoint on the GDT, IDT, or TSS. */ if (within_cpu_entry(attr->bp_addr, bp_end)) return -EINVAL; hw->address = attr->bp_addr; hw->mask = 0; /* Type */ switch (attr->bp_type) { case HW_BREAKPOINT_W: hw->type = X86_BREAKPOINT_WRITE; break; case HW_BREAKPOINT_W | HW_BREAKPOINT_R: hw->type = X86_BREAKPOINT_RW; break; case HW_BREAKPOINT_X: /* * We don't allow kernel breakpoints in places that are not * acceptable for kprobes. On non-kprobes kernels, we don't * allow kernel breakpoints at all. */ if (attr->bp_addr >= TASK_SIZE_MAX) { if (within_kprobe_blacklist(attr->bp_addr)) return -EINVAL; } hw->type = X86_BREAKPOINT_EXECUTE; /* * x86 inst breakpoints need to have a specific undefined len. * But we still need to check userspace is not trying to setup * an unsupported length, to get a range breakpoint for example. */ if (attr->bp_len == sizeof(long)) { hw->len = X86_BREAKPOINT_LEN_X; return 0; } fallthrough; default: return -EINVAL; } /* Len */ switch (attr->bp_len) { case HW_BREAKPOINT_LEN_1: hw->len = X86_BREAKPOINT_LEN_1; break; case HW_BREAKPOINT_LEN_2: hw->len = X86_BREAKPOINT_LEN_2; break; case HW_BREAKPOINT_LEN_4: hw->len = X86_BREAKPOINT_LEN_4; break; #ifdef CONFIG_X86_64 case HW_BREAKPOINT_LEN_8: hw->len = X86_BREAKPOINT_LEN_8; break; #endif default: /* AMD range breakpoint */ if (!is_power_of_2(attr->bp_len)) return -EINVAL; if (attr->bp_addr & (attr->bp_len - 1)) return -EINVAL; if (!boot_cpu_has(X86_FEATURE_BPEXT)) return -EOPNOTSUPP; /* * It's impossible to use a range breakpoint to fake out * user vs kernel detection because bp_len - 1 can't * have the high bit set. If we ever allow range instruction * breakpoints, then we'll have to check for kprobe-blacklisted * addresses anywhere in the range. */ hw->mask = attr->bp_len - 1; hw->len = X86_BREAKPOINT_LEN_1; } return 0; } /* * Validate the arch-specific HW Breakpoint register settings */ int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { unsigned int align; int ret; ret = arch_build_bp_info(bp, attr, hw); if (ret) return ret; switch (hw->len) { case X86_BREAKPOINT_LEN_1: align = 0; if (hw->mask) align = hw->mask; break; case X86_BREAKPOINT_LEN_2: align = 1; break; case X86_BREAKPOINT_LEN_4: align = 3; break; #ifdef CONFIG_X86_64 case X86_BREAKPOINT_LEN_8: align = 7; break; #endif default: WARN_ON_ONCE(1); return -EINVAL; } /* * Check that the low-order bits of the address are appropriate * for the alignment implied by len. */ if (hw->address & align) return -EINVAL; return 0; } /* * Release the user breakpoints used by ptrace */ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { int i; struct thread_struct *t = &tsk->thread; for (i = 0; i < HBP_NUM; i++) { unregister_hw_breakpoint(t->ptrace_bps[i]); t->ptrace_bps[i] = NULL; } t->virtual_dr6 = 0; t->ptrace_dr7 = 0; } void hw_breakpoint_restore(void) { set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0); set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1); set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2); set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3); set_debugreg(DR6_RESERVED, 6); set_debugreg(__this_cpu_read(cpu_dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); /* * Handle debug exception notifications. * * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. * * NOTIFY_DONE returned if one of the following conditions is true. * i) When the causative address is from user-space and the exception * is a valid one, i.e. not triggered as a result of lazy debug register * switching * ii) When there are more bits than trap<n> set in DR6 register (such * as BD, BS or BT) indicating that more than one debug condition is * met and requires some more action in do_debug(). * * NOTIFY_STOP returned for all other cases * */ static int hw_breakpoint_handler(struct die_args *args) { int i, rc = NOTIFY_STOP; struct perf_event *bp; unsigned long *dr6_p; unsigned long dr6; bool bpx; /* The DR6 value is pointed by args->err */ dr6_p = (unsigned long *)ERR_PTR(args->err); dr6 = *dr6_p; /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; /* Handle all the breakpoints that were triggered */ for (i = 0; i < HBP_NUM; ++i) { if (likely(!(dr6 & (DR_TRAP0 << i)))) continue; bp = this_cpu_read(bp_per_reg[i]); if (!bp) continue; bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE; /* * TF and data breakpoints are traps and can be merged, however * instruction breakpoints are faults and will be raised * separately. * * However DR6 can indicate both TF and instruction * breakpoints. In that case take TF as that has precedence and * delay the instruction breakpoint for the next exception. */ if (bpx && (dr6 & DR_STEP)) continue; /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling */ (*dr6_p) &= ~(DR_TRAP0 << i); perf_bp_event(bp, args->regs); /* * Set up resume flag to avoid breakpoint recursion when * returning back to origin. */ if (bpx) args->regs->flags |= X86_EFLAGS_RF; } /* * Further processing in do_debug() is needed for a) user-space * breakpoints (to generate signals) and b) when the system has * taken exception due to multiple causes */ if ((current->thread.virtual_dr6 & DR_TRAP_BITS) || (dr6 & (~DR_TRAP_BITS))) rc = NOTIFY_DONE; return rc; } /* * Handle debug exception notifications. */ int hw_breakpoint_exceptions_notify( struct notifier_block *unused, unsigned long val, void *data) { if (val != DIE_DEBUG) return NOTIFY_DONE; return hw_breakpoint_handler(data); } void hw_breakpoint_pmu_read(struct perf_event *bp) { /* TODO */ } |
56 56 56 56 56 55 55 41 41 41 41 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 | // SPDX-License-Identifier: GPL-2.0 #include <linux/memcontrol.h> #include <linux/rwsem.h> #include <linux/shrinker.h> #include <linux/rculist.h> #include <trace/events/vmscan.h> #include "internal.h" LIST_HEAD(shrinker_list); DEFINE_MUTEX(shrinker_mutex); #ifdef CONFIG_MEMCG static int shrinker_nr_max; static inline int shrinker_unit_size(int nr_items) { return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *)); } static inline void shrinker_unit_free(struct shrinker_info *info, int start) { struct shrinker_info_unit **unit; int nr, i; if (!info) return; unit = info->unit; nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS); for (i = start; i < nr; i++) { if (!unit[i]) break; kfree(unit[i]); unit[i] = NULL; } } static inline int shrinker_unit_alloc(struct shrinker_info *new, struct shrinker_info *old, int nid) { struct shrinker_info_unit *unit; int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS); int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0; int i; for (i = start; i < nr; i++) { unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid); if (!unit) { shrinker_unit_free(new, start); return -ENOMEM; } new->unit[i] = unit; } return 0; } void free_shrinker_info(struct mem_cgroup *memcg) { struct mem_cgroup_per_node *pn; struct shrinker_info *info; int nid; for_each_node(nid) { pn = memcg->nodeinfo[nid]; info = rcu_dereference_protected(pn->shrinker_info, true); shrinker_unit_free(info, 0); kvfree(info); rcu_assign_pointer(pn->shrinker_info, NULL); } } int alloc_shrinker_info(struct mem_cgroup *memcg) { struct shrinker_info *info; int nid, ret = 0; int array_size = 0; mutex_lock(&shrinker_mutex); array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); if (!info) goto err; info->map_nr_max = shrinker_nr_max; if (shrinker_unit_alloc(info, NULL, nid)) goto err; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } mutex_unlock(&shrinker_mutex); return ret; err: mutex_unlock(&shrinker_mutex); free_shrinker_info(memcg); return -ENOMEM; } static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, lockdep_is_held(&shrinker_mutex)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size, int old_size, int new_nr_max) { struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; for_each_node(nid) { pn = memcg->nodeinfo[nid]; old = shrinker_info_protected(memcg, nid); /* Not yet online memcg */ if (!old) return 0; /* Already expanded this shrinker_info */ if (new_nr_max <= old->map_nr_max) continue; new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid); if (!new) return -ENOMEM; new->map_nr_max = new_nr_max; memcpy(new->unit, old->unit, old_size); if (shrinker_unit_alloc(new, old, nid)) { kvfree(new); return -ENOMEM; } rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); } return 0; } static int expand_shrinker_info(int new_id) { int ret = 0; int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS); int new_size, old_size = 0; struct mem_cgroup *memcg; if (!root_mem_cgroup) goto out; lockdep_assert_held(&shrinker_mutex); new_size = shrinker_unit_size(new_nr_max); old_size = shrinker_unit_size(shrinker_nr_max); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { ret = expand_one_shrinker_info(memcg, new_size, old_size, new_nr_max); if (ret) { mem_cgroup_iter_break(NULL, memcg); goto out; } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); out: if (!ret) shrinker_nr_max = new_nr_max; return ret; } static inline int shrinker_id_to_index(int shrinker_id) { return shrinker_id / SHRINKER_UNIT_BITS; } static inline int shrinker_id_to_offset(int shrinker_id) { return shrinker_id % SHRINKER_UNIT_BITS; } static inline int calc_shrinker_id(int index, int offset) { return index * SHRINKER_UNIT_BITS + offset; } void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; struct shrinker_info_unit *unit; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker_id)]; if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id_to_offset(shrinker_id), unit->map); } rcu_read_unlock(); } } static DEFINE_IDR(shrinker_idr); static int shrinker_memcg_alloc(struct shrinker *shrinker) { int id, ret = -ENOMEM; if (mem_cgroup_disabled()) return -ENOSYS; mutex_lock(&shrinker_mutex); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; if (id >= shrinker_nr_max) { if (expand_shrinker_info(id)) { idr_remove(&shrinker_idr, id); goto unlock; } } shrinker->id = id; ret = 0; unlock: mutex_unlock(&shrinker_mutex); return ret; } static void shrinker_memcg_remove(struct shrinker *shrinker) { int id = shrinker->id; BUG_ON(id < 0); lockdep_assert_held(&shrinker_mutex); idr_remove(&shrinker_idr, id); } static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; struct shrinker_info_unit *unit; long nr_deferred; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker->id)]; nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0); rcu_read_unlock(); return nr_deferred; } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; struct shrinker_info_unit *unit; long nr_deferred; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker->id)]; nr_deferred = atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]); rcu_read_unlock(); return nr_deferred; } void reparent_shrinker_deferred(struct mem_cgroup *memcg) { int nid, index, offset; long nr; struct mem_cgroup *parent; struct shrinker_info *child_info, *parent_info; struct shrinker_info_unit *child_unit, *parent_unit; parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ mutex_lock(&shrinker_mutex); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) { child_unit = child_info->unit[index]; parent_unit = parent_info->unit[index]; for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) { nr = atomic_long_read(&child_unit->nr_deferred[offset]); atomic_long_add(nr, &parent_unit->nr_deferred[offset]); } } } mutex_unlock(&shrinker_mutex); } #else static int shrinker_memcg_alloc(struct shrinker *shrinker) { return -ENOSYS; } static void shrinker_memcg_remove(struct shrinker *shrinker) { } static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { return 0; } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { return 0; } #endif /* CONFIG_MEMCG */ static long xchg_nr_deferred(struct shrinker *shrinker, struct shrink_control *sc) { int nid = sc->nid; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) nid = 0; if (sc->memcg && (shrinker->flags & SHRINKER_MEMCG_AWARE)) return xchg_nr_deferred_memcg(nid, shrinker, sc->memcg); return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); } static long add_nr_deferred(long nr, struct shrinker *shrinker, struct shrink_control *sc) { int nid = sc->nid; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) nid = 0; if (sc->memcg && (shrinker->flags & SHRINKER_MEMCG_AWARE)) return add_nr_deferred_memcg(nr, nid, shrinker, sc->memcg); return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); } #define SHRINK_BATCH 128 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, struct shrinker *shrinker, int priority) { unsigned long freed = 0; unsigned long long delta; long total_scan; long freeable; long nr; long new_nr; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; long scanned = 0, next_deferred; freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0 || freeable == SHRINK_EMPTY) return freeable; /* * copy the current shrinker scan count into a local variable * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ nr = xchg_nr_deferred(shrinker, shrinkctl); if (shrinker->seeks) { delta = freeable >> priority; delta *= 4; do_div(delta, shrinker->seeks); } else { /* * These objects don't require any IO to create. Trim * them aggressively under memory pressure to keep * them from causing refetches in the IO caches. */ delta = freeable / 2; } total_scan = nr >> priority; total_scan += delta; total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority); /* * Normally, we should not scan less than batch_size objects in one * pass to avoid too frequent shrinker calls, but if the slab has less * than batch_size objects in total and we are really tight on memory, * we will try to reclaim all available objects, otherwise we can end * up failing allocations although there are plenty of reclaimable * objects spread over several slabs with usage less than the * batch_size. * * We detect the "tight on memory" situations by looking at the total * number of objects we want to scan (total_scan). If it is greater * than the total number of objects on slab (freeable), we must be * scanning at high prio and therefore should try to reclaim as much as * possible. */ while (total_scan >= batch_size || total_scan >= freeable) { unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); shrinkctl->nr_to_scan = nr_to_scan; shrinkctl->nr_scanned = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); total_scan -= shrinkctl->nr_scanned; scanned += shrinkctl->nr_scanned; cond_resched(); } /* * The deferred work is increased by any new work (delta) that wasn't * done, decreased by old deferred work that was done now. * * And it is capped to two times of the freeable items. */ next_deferred = max_t(long, (nr + delta - scanned), 0); next_deferred = min(next_deferred, (2 * freeable)); /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. */ new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed; } #ifdef CONFIG_MEMCG static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { struct shrinker_info *info; unsigned long ret, freed = 0; int offset, index = 0; if (!mem_cgroup_online(memcg)) return 0; /* * lockless algorithm of memcg shrink. * * The shrinker_info may be freed asynchronously via RCU in the * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used * to ensure the existence of the shrinker_info. * * The shrinker_info_unit is never freed unless its corresponding memcg * is destroyed. Here we already hold the refcount of memcg, so the * memcg will not be destroyed, and of course shrinker_info_unit will * not be freed. * * So in the memcg shrink: * step 1: use rcu_read_lock() to guarantee existence of the * shrinker_info. * step 2: after getting shrinker_info_unit we can safely release the * RCU lock. * step 3: traverse the bitmap and calculate shrinker_id * step 4: use rcu_read_lock() to guarantee existence of the shrinker. * step 5: use shrinker_id to find the shrinker, then use * shrinker_try_get() to guarantee existence of the shrinker, * then we can release the RCU lock to do do_shrink_slab() that * may sleep. * step 6: do shrinker_put() paired with step 5 to put the refcount, * if the refcount reaches 0, then wake up the waiter in * shrinker_free() by calling complete(). * Note: here is different from the global shrink, we don't * need to acquire the RCU lock to guarantee existence of * the shrinker, because we don't need to use this * shrinker to traverse the next shrinker in the bitmap. * step 7: we have already exited the read-side of rcu critical section * before calling do_shrink_slab(), the shrinker_info may be * released in expand_one_shrinker_info(), so go back to step 1 * to reacquire the shrinker_info. */ again: rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (unlikely(!info)) goto unlock; if (index < shrinker_id_to_index(info->map_nr_max)) { struct shrinker_info_unit *unit; unit = info->unit[index]; rcu_read_unlock(); for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, .memcg = memcg, }; struct shrinker *shrinker; int shrinker_id = calc_shrinker_id(index, offset); rcu_read_lock(); shrinker = idr_find(&shrinker_idr, shrinker_id); if (unlikely(!shrinker || !shrinker_try_get(shrinker))) { clear_bit(offset, unit->map); rcu_read_unlock(); continue; } rcu_read_unlock(); /* Call non-slab shrinkers even though kmem is disabled */ if (!memcg_kmem_online() && !(shrinker->flags & SHRINKER_NONSLAB)) continue; ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { clear_bit(offset, unit->map); /* * After the shrinker reported that it had no objects to * free, but before we cleared the corresponding bit in * the memcg shrinker map, a new object might have been * added. To make sure, we have the bit set in this * case, we invoke the shrinker one more time and reset * the bit if it reports that it is not empty anymore. * The memory barrier here pairs with the barrier in * set_shrinker_bit(): * * list_lru_add() shrink_slab_memcg() * list_add_tail() clear_bit() * <MB> <MB> * set_bit() do_shrink_slab() */ smp_mb__after_atomic(); ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) ret = 0; else set_shrinker_bit(memcg, nid, shrinker_id); } freed += ret; shrinker_put(shrinker); } index++; goto again; } unlock: rcu_read_unlock(); return freed; } #else /* !CONFIG_MEMCG */ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { return 0; } #endif /* CONFIG_MEMCG */ /** * shrink_slab - shrink slab caches * @gfp_mask: allocation context * @nid: node whose slab caches to target * @memcg: memory cgroup whose slab caches to target * @priority: the reclaim priority * * Call the shrink functions to age shrinkable caches. * * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, * unaware shrinkers will receive a node id of 0 instead. * * @memcg specifies the memory cgroup to target. Unaware shrinkers * are called only if it is the root cgroup. * * @priority is sc->priority, we take the number of objects and >> by priority * in order to get the scan target. * * Returns the number of reclaimed slab objects. */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { unsigned long ret, freed = 0; struct shrinker *shrinker; /* * The root memcg might be allocated even though memcg is disabled * via "cgroup_disable=memory" boot parameter. This could make * mem_cgroup_is_root() return false, then just run memcg slab * shrink, but skip global shrink. This may result in premature * oom. */ if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); /* * lockless algorithm of global shrink. * * In the unregistration setp, the shrinker will be freed asynchronously * via RCU after its refcount reaches 0. So both rcu_read_lock() and * shrinker_try_get() can be used to ensure the existence of the shrinker. * * So in the global shrink: * step 1: use rcu_read_lock() to guarantee existence of the shrinker * and the validity of the shrinker_list walk. * step 2: use shrinker_try_get() to try get the refcount, if successful, * then the existence of the shrinker can also be guaranteed, * so we can release the RCU lock to do do_shrink_slab() that * may sleep. * step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(), * which ensures that neither this shrinker nor the next shrinker * will be freed in the next traversal operation. * step 4: do shrinker_put() paired with step 2 to put the refcount, * if the refcount reaches 0, then wake up the waiter in * shrinker_free() by calling complete(). */ rcu_read_lock(); list_for_each_entry_rcu(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, .memcg = memcg, }; if (!shrinker_try_get(shrinker)) continue; rcu_read_unlock(); ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) ret = 0; freed += ret; rcu_read_lock(); shrinker_put(shrinker); } rcu_read_unlock(); cond_resched(); return freed; } struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) { struct shrinker *shrinker; unsigned int size; va_list ap; int err; shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL); if (!shrinker) return NULL; va_start(ap, fmt); err = shrinker_debugfs_name_alloc(shrinker, fmt, ap); va_end(ap); if (err) goto err_name; shrinker->flags = flags | SHRINKER_ALLOCATED; shrinker->seeks = DEFAULT_SEEKS; if (flags & SHRINKER_MEMCG_AWARE) { err = shrinker_memcg_alloc(shrinker); if (err == -ENOSYS) { /* Memcg is not supported, fallback to non-memcg-aware shrinker. */ shrinker->flags &= ~SHRINKER_MEMCG_AWARE; goto non_memcg; } if (err) goto err_flags; return shrinker; } non_memcg: /* * The nr_deferred is available on per memcg level for memcg aware * shrinkers, so only allocate nr_deferred in the following cases: * - non-memcg-aware shrinkers * - !CONFIG_MEMCG * - memcg is disabled by kernel command line */ size = sizeof(*shrinker->nr_deferred); if (flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); if (!shrinker->nr_deferred) goto err_flags; return shrinker; err_flags: shrinker_debugfs_name_free(shrinker); err_name: kfree(shrinker); return NULL; } EXPORT_SYMBOL_GPL(shrinker_alloc); void shrinker_register(struct shrinker *shrinker) { if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) { pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker"); return; } mutex_lock(&shrinker_mutex); list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); mutex_unlock(&shrinker_mutex); init_completion(&shrinker->done); /* * Now the shrinker is fully set up, take the first reference to it to * indicate that lookup operations are now allowed to use it via * shrinker_try_get(). */ refcount_set(&shrinker->refcount, 1); } EXPORT_SYMBOL_GPL(shrinker_register); static void shrinker_free_rcu_cb(struct rcu_head *head) { struct shrinker *shrinker = container_of(head, struct shrinker, rcu); kfree(shrinker->nr_deferred); kfree(shrinker); } void shrinker_free(struct shrinker *shrinker) { struct dentry *debugfs_entry = NULL; int debugfs_id; if (!shrinker) return; if (shrinker->flags & SHRINKER_REGISTERED) { /* drop the initial refcount */ shrinker_put(shrinker); /* * Wait for all lookups of the shrinker to complete, after that, * no shrinker is running or will run again, then we can safely * free it asynchronously via RCU and safely free the structure * where the shrinker is located, such as super_block etc. */ wait_for_completion(&shrinker->done); } mutex_lock(&shrinker_mutex); if (shrinker->flags & SHRINKER_REGISTERED) { /* * Now we can safely remove it from the shrinker_list and then * free it. */ list_del_rcu(&shrinker->list); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); shrinker->flags &= ~SHRINKER_REGISTERED; } shrinker_debugfs_name_free(shrinker); if (shrinker->flags & SHRINKER_MEMCG_AWARE) shrinker_memcg_remove(shrinker); mutex_unlock(&shrinker_mutex); if (debugfs_entry) shrinker_debugfs_remove(debugfs_entry, debugfs_id); call_rcu(&shrinker->rcu, shrinker_free_rcu_cb); } EXPORT_SYMBOL_GPL(shrinker_free); |
918 835 67 918 877 830 67 67 22 22 22 22 918 850 918 856 302 302 302 302 856 766 83 741 712 81 113 742 766 675 12 675 746 193 193 193 765 766 766 675 765 17 765 669 668 668 668 668 668 668 17 17 17 765 150 765 745 192 192 765 150 747 851 851 850 851 3 3 3 3 851 845 729 729 729 845 766 766 766 766 766 219 766 219 766 766 765 765 765 218 765 3 598 696 765 851 805 805 851 851 811 152 18 18 20 20 20 7 20 20 18 18 4 20 2 2 20 152 152 152 152 152 152 152 320 134 320 320 152 130 130 33 320 152 152 320 134 134 134 125 125 125 124 124 124 124 124 12 12 134 311 311 311 311 304 11 1 1 11 11 11 311 304 116 308 7 311 311 7 12 12 12 12 1 11 6 2 2 2 8 8 8 6 3 1 1 1 1 3 3 3 3 3 2 2 1 1 2 2 11 4 4 11 9 9 8 6 5 5 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/indirect.c * * from * * linux/fs/ext4/inode.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Goal-directed block allocation by Stephen Tweedie * (sct@redhat.com), 1993, 1998 */ #include "ext4_jbd2.h" #include "truncate.h" #include <linux/dax.h> #include <linux/uio.h> #include <trace/events/ext4.h> typedef struct { __le32 *p; __le32 key; struct buffer_head *bh; } Indirect; static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) { p->key = *(p->p = v); p->bh = bh; } /** * ext4_block_to_path - parse the block number into array of offsets * @inode: inode in question (we are only interested in its superblock) * @i_block: block number to be parsed * @offsets: array to store the offsets in * @boundary: set this non-zero if the referred-to block is likely to be * followed (on disk) by an indirect block. * * To store the locations of file's data ext4 uses a data structure common * for UNIX filesystems - tree of pointers anchored in the inode, with * data blocks at leaves and indirect blocks in intermediate nodes. * This function translates the block number into path in that tree - * return value is the path length and @offsets[n] is the offset of * pointer to (n+1)th node in the nth one. If @block is out of range * (negative or too large) warning is printed and zero returned. * * Note: function doesn't find node addresses, so no IO is needed. All * we need to know is the capacity of indirect blocks (taken from the * inode->i_sb). */ /* * Portability note: the last comparison (check that we fit into triple * indirect block) is spelled differently, because otherwise on an * architecture with 32-bit longs and 8Kb pages we might get into trouble * if our filesystem had 8Kb blocks. We might use long long, but that would * kill us on x86. Oh, well, at least the sign propagation does not matter - * i_block would have to be negative in the very beginning, so we would not * get there at all. */ static int ext4_block_to_path(struct inode *inode, ext4_lblk_t i_block, ext4_lblk_t offsets[4], int *boundary) { int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); const long direct_blocks = EXT4_NDIR_BLOCKS, indirect_blocks = ptrs, double_blocks = (1 << (ptrs_bits * 2)); int n = 0; int final = 0; if (i_block < direct_blocks) { offsets[n++] = i_block; final = direct_blocks; } else if ((i_block -= direct_blocks) < indirect_blocks) { offsets[n++] = EXT4_IND_BLOCK; offsets[n++] = i_block; final = ptrs; } else if ((i_block -= indirect_blocks) < double_blocks) { offsets[n++] = EXT4_DIND_BLOCK; offsets[n++] = i_block >> ptrs_bits; offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { offsets[n++] = EXT4_TIND_BLOCK; offsets[n++] = i_block >> (ptrs_bits * 2); offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { ext4_warning(inode->i_sb, "block %lu > max in inode %lu", i_block + direct_blocks + indirect_blocks + double_blocks, inode->i_ino); } if (boundary) *boundary = final - 1 - (i_block & (ptrs - 1)); return n; } /** * ext4_get_branch - read the chain of indirect blocks leading to data * @inode: inode in question * @depth: depth of the chain (1 - direct pointer, etc.) * @offsets: offsets of pointers in inode/indirect blocks * @chain: place to store the result * @err: here we store the error value * * Function fills the array of triples <key, p, bh> and returns %NULL * if everything went OK or the pointer to the last filled triple * (incomplete one) otherwise. Upon the return chain[i].key contains * the number of (i+1)-th block in the chain (as it is stored in memory, * i.e. little-endian 32-bit), chain[i].p contains the address of that * number (it points into struct inode for i==0 and into the bh->b_data * for i>0) and chain[i].bh points to the buffer_head of i-th indirect * block for i>0 and NULL for i==0. In other words, it holds the block * numbers of the chain, addresses they were taken from (and where we can * verify that chain did not change) and buffer_heads hosting these * numbers. * * Function stops when it stumbles upon zero pointer (absent block) * (pointer to last triple returned, *@err == 0) * or when it gets an IO error reading an indirect block * (ditto, *@err == -EIO) * or when it reads all @depth-1 indirect blocks successfully and finds * the whole chain, all way to the data (returns %NULL, *err == 0). * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) */ static Indirect *ext4_get_branch(struct inode *inode, int depth, ext4_lblk_t *offsets, Indirect chain[4], int *err) { struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; unsigned int key; int ret = -EIO; *err = 0; /* i_data is not going away, no lock needed */ add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); if (!p->key) goto no_block; while (--depth) { key = le32_to_cpu(p->key); if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) { /* the block was out of range */ ret = -EFSCORRUPTED; goto failure; } bh = sb_getblk(sb, key); if (unlikely(!bh)) { ret = -ENOMEM; goto failure; } if (!bh_uptodate_or_lock(bh)) { if (ext4_read_bh(bh, 0, NULL) < 0) { put_bh(bh); goto failure; } /* validate block references */ if (ext4_check_indirect_blockref(inode, bh)) { put_bh(bh); goto failure; } } add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) goto no_block; } return NULL; failure: *err = ret; no_block: return p; } /** * ext4_find_near - find a place for allocation with sufficient locality * @inode: owner * @ind: descriptor of indirect block. * * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. * + if pointer will live in indirect block - allocate near that block. * + if pointer will live in inode - allocate in the same * cylinder group. * * In the latter case we colour the starting block by the callers PID to * prevent it from clashing with concurrent allocations for a different inode * in the same block group. The PID is used here so that functionally related * files will be close-by on-disk. * * Caller must make sure that @ind is valid and will stay that way. */ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; __le32 *p; /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { if (*p) return le32_to_cpu(*p); } /* No such thing, so let's try location of indirect block */ if (ind->bh) return ind->bh->b_blocknr; /* * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ return ext4_inode_to_goal_block(inode); } /** * ext4_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain * * Normally this function find the preferred place for block allocation, * returns it. * Because this is only used for non-extent files, we limit the block nr * to 32 bits. */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, Indirect *partial) { ext4_fsblk_t goal; /* * XXX need to get goal block from mballoc's data structures */ goal = ext4_find_near(inode, partial); goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; return goal; } /** * ext4_blks_to_allocate - Look up the block map and count the number * of direct blocks need to be allocated for the given branch. * * @branch: chain of indirect blocks * @k: number of blocks need for indirect blocks * @blks: number of data blocks to be mapped. * @blocks_to_boundary: the offset in the indirect block * * return the total number of blocks to be allocate, including the * direct and indirect blocks. */ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, int blocks_to_boundary) { unsigned int count = 0; /* * Simple case, [t,d]Indirect block(s) has not allocated yet * then it's clear blocks on that path have not allocated */ if (k > 0) { /* right now we don't handle cross boundary allocation */ if (blks < blocks_to_boundary + 1) count += blks; else count += blocks_to_boundary + 1; return count; } count++; while (count < blks && count <= blocks_to_boundary && le32_to_cpu(*(branch[0].p + count)) == 0) { count++; } return count; } /** * ext4_alloc_branch() - allocate and set up a chain of blocks * @handle: handle for this transaction * @ar: structure describing the allocation request * @indirect_blks: number of allocated indirect blocks * @offsets: offsets (in the blocks) to store the pointers to next. * @branch: place to store the chain in. * * This function allocates blocks, zeroes out all but the last one, * links them into chain and (if we are synchronous) writes them to disk. * In other words, it prepares a branch that can be spliced onto the * inode. It stores the information about that chain in the branch[], in * the same format as ext4_get_branch() would do. We are calling it after * we had read the existing part of chain and partial points to the last * triple of that (one with zero ->key). Upon the exit we have the same * picture as after the successful ext4_get_block(), except that in one * place chain is disconnected - *branch->p is still zero (we did not * set the last link), but branch->key contains the number that should * be placed into *branch->p to fill that gap. * * If allocation fails we free all blocks we've allocated (and forget * their buffer_heads) and return the error value the from failed * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain * as described above and return 0. */ static int ext4_alloc_branch(handle_t *handle, struct ext4_allocation_request *ar, int indirect_blks, ext4_lblk_t *offsets, Indirect *branch) { struct buffer_head * bh; ext4_fsblk_t b, new_blocks[4]; __le32 *p; int i, j, err, len = 1; for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); } else { ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, ar->inode, ar->goal, ar->flags & EXT4_MB_DELALLOC_RESERVED, NULL, &err); /* Simplify error cleanup... */ branch[i+1].bh = NULL; } if (err) { i--; goto failed; } branch[i].key = cpu_to_le32(new_blocks[i]); if (i == 0) continue; bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; } lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, ar->inode->i_sb, bh, EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto failed; } memset(bh->b_data, 0, bh->b_size); p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; b = new_blocks[i]; if (i == indirect_blks) len = ar->len; for (j = 0; j < len; j++) *p++ = cpu_to_le32(b++); BUFFER_TRACE(bh, "marking uptodate"); set_buffer_uptodate(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, ar->inode, bh); if (err) goto failed; } return 0; failed: if (i == indirect_blks) { /* Free data blocks */ ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], ar->len, 0); i--; } for (; i >= 0; i--) { /* * We want to ext4_forget() only freshly allocated indirect * blocks. Buffer for new_blocks[i] is at branch[i+1].bh * (buffer at branch[0].bh is indirect block / inode already * existing before ext4_alloc_branch() was called). Also * because blocks are freshly allocated, we don't need to * revoke them which is why we don't set * EXT4_FREE_BLOCKS_METADATA. */ ext4_free_blocks(handle, ar->inode, branch[i+1].bh, new_blocks[i], 1, branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0); } return err; } /** * ext4_splice_branch() - splice the allocated branch onto inode. * @handle: handle for this transaction * @ar: structure describing the allocation request * @where: location of missing link * @num: number of indirect blocks we are adding * * This function fills the missing link and does all housekeeping needed in * inode (->i_blocks, etc.). In case of success we end up with the full * chain to new block and return 0. */ static int ext4_splice_branch(handle_t *handle, struct ext4_allocation_request *ar, Indirect *where, int num) { int i; int err = 0; ext4_fsblk_t current_block; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block * before the splice. */ if (where->bh) { BUFFER_TRACE(where->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, ar->inode->i_sb, where->bh, EXT4_JTR_NONE); if (err) goto err_out; } /* That's it */ *where->p = where->key; /* * Update the host buffer_head or inode to point to more just allocated * direct blocks blocks */ if (num == 0 && ar->len > 1) { current_block = le32_to_cpu(where->key) + 1; for (i = 1; i < ar->len; i++) *(where->p + i) = cpu_to_le32(current_block++); } /* We are done with atomic stuff, now do the rest of housekeeping */ /* had we spliced it onto indirect block? */ if (where->bh) { /* * If we spliced it onto an indirect block, we haven't * altered the inode. Note however that if it is being spliced * onto an indirect block at the very end of the file (the * file is growing) then we *will* alter the inode to reflect * the new i_size. But that is not done here - it is done in * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. */ ext4_debug("splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); if (err) goto err_out; } else { /* * OK, we spliced it into the inode itself on a direct block. */ err = ext4_mark_inode_dirty(handle, ar->inode); if (unlikely(err)) goto err_out; ext4_debug("splicing direct\n"); } return err; err_out: for (i = 1; i <= num; i++) { /* * branch[i].bh is newly allocated, so there is no * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. */ ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, EXT4_FREE_BLOCKS_FORGET); } ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), ar->len, 0); return err; } /* * The ext4_ind_map_blocks() function handles non-extents inodes * (i.e., using the traditional indirect/double-indirect i_blocks * scheme) for ext4_map_blocks(). * * Allocation strategy is simple: if we have to allocate something, we will * have to go the whole way to leaf. So let's do it before attaching anything * to tree, set linkage between the newborn blocks, write them if sync is * required, recheck the path, free and repeat if check fails, otherwise * set the last missing link (that will protect us from any truncate-generated * removals - all blocks on the path are immune now) and possibly force the * write on the parent block. * That has a nice additional property: no special recovery from the failed * allocations is needed - we simply release blocks and do not touch anything * reachable from inode. * * `handle' can be NULL if create == 0. * * return > 0, # of blocks mapped or allocated. * return = 0, if plain lookup failed. * return < 0, error case. * * The ext4_ind_get_blocks() function should be called with * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system * blocks. */ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_allocation_request ar; int err = -EIO; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; int indirect_blks; int blocks_to_boundary = 0; int depth; int count = 0; ext4_fsblk_t first_block = 0; trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, map->m_lblk, offsets, &blocks_to_boundary); if (depth == 0) goto out; partial = ext4_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); count++; /*map more blocks*/ while (count < map->m_len && count <= blocks_to_boundary) { ext4_fsblk_t blk; blk = le32_to_cpu(*(chain[depth-1].p + count)); if (blk == first_block + count) count++; else break; } goto got_it; } /* Next simple case - plain lookup failed */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); int i; /* * Count number blocks in a subtree under 'partial'. At each * level we count number of complete empty subtrees beyond * current offset and then descend into the subtree only * partially beyond current offset. */ count = 0; for (i = partial - chain + 1; i < depth; i++) count = count * epb + (epb - offsets[i] - 1); count++; /* Fill in size of a hole we found */ map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, count); goto cleanup; } /* Failed read of indirect block */ if (err == -EIO) goto cleanup; /* * Okay, we need to do block allocation. */ if (ext4_has_feature_bigalloc(inode->i_sb)) { EXT4_ERROR_INODE(inode, "Can't allocate blocks for " "non-extent mapped inodes with bigalloc"); err = -EFSCORRUPTED; goto out; } /* Set up for the direct block allocation */ memset(&ar, 0, sizeof(ar)); ar.inode = inode; ar.logical = map->m_lblk; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) ar.flags |= EXT4_MB_USE_RESERVED; ar.goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; /* * Next look up the indirect map to count the totoal number of * direct blocks to allocate for this branch. */ ar.len = ext4_blks_to_allocate(partial, indirect_blks, map->m_len, blocks_to_boundary); /* * Block out ext4_truncate while we alter the tree */ err = ext4_alloc_branch(handle, &ar, indirect_blks, offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers * on the new chain if there is a failure, but that risks using * up transaction credits, especially for bitmaps where the * credits cannot be returned. Can we handle this somehow? We * may need to return -EAGAIN upwards in the worst case. --sct */ if (!err) err = ext4_splice_branch(handle, &ar, partial, indirect_blks); if (err) goto cleanup; map->m_flags |= EXT4_MAP_NEW; ext4_update_inode_fsync_trans(handle, inode, 1); count = ar.len; /* * Update reserved blocks/metadata blocks after successful block * allocation which had been deferred till now. */ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ext4_da_update_reserve_space(inode, count, 1); got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); map->m_len = count; if (count > blocks_to_boundary) map->m_flags |= EXT4_MAP_BOUNDARY; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ cleanup: while (partial > chain) { BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } out: trace_ext4_ind_map_blocks_exit(inode, flags, map, err); return err; } /* * Calculate number of indirect blocks touched by mapping @nrblocks logically * contiguous blocks */ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) { /* * With N contiguous data blocks, we need at most * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, * 2 dindirect blocks, and 1 tindirect block */ return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh, int *dropped) { int err; if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) return err; } err = ext4_mark_inode_dirty(handle, inode); if (unlikely(err)) return err; /* * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode, 0); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; } /* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a convenient checkpoint to make * sure we don't overflow the journal. * * Try to extend this transaction for the purposes of truncation. If * extend fails, we restart transaction. */ static int ext4_ind_truncate_ensure_credits(handle_t *handle, struct inode *inode, struct buffer_head *bh, int revoke_creds) { int ret; int dropped = 0; ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_blocks_for_truncate(inode), revoke_creds, ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped)); if (dropped) down_write(&EXT4_I(inode)->i_data_sem); if (ret <= 0) return ret; if (bh) { BUFFER_TRACE(bh, "retaking write access"); ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(ret)) return ret; } return 0; } /* * Probably it should be a library function... search for first non-zero word * or memcmp with zero_page, whatever is better for particular architecture. * Linus? */ static inline int all_zeroes(__le32 *p, __le32 *q) { while (p < q) if (*p++) return 0; return 1; } /** * ext4_find_shared - find the indirect blocks for partial truncation. * @inode: inode in question * @depth: depth of the affected branch * @offsets: offsets of pointers in that branch (see ext4_block_to_path) * @chain: place to store the pointers to partial indirect blocks * @top: place to the (detached) top of branch * * This is a helper function used by ext4_truncate(). * * When we do truncate() we may have to clean the ends of several * indirect blocks but leave the blocks themselves alive. Block is * partially truncated if some data below the new i_size is referred * from it (and it is on the path to the first completely truncated * data block, indeed). We have to free the top of that path along * with everything to the right of the path. Since no allocation * past the truncation point is possible until ext4_truncate() * finishes, we may safely do the latter, but top of branch may * require special attention - pageout below the truncation point * might try to populate it. * * We atomically detach the top of branch from the tree, store the * block number of its root in *@top, pointers to buffer_heads of * partially truncated blocks - in @chain[].bh and pointers to * their last elements that should not be removed - in * @chain[].p. Return value is the pointer to last filled element * of @chain. * * The work left to caller to do the actual freeing of subtrees: * a) free the subtree starting from *@top * b) free the subtrees whose roots are stored in * (@chain[i].p+1 .. end of @chain[i].bh->b_data) * c) free the subtrees growing from the inode past the @chain[0]. * (no partially truncated stuff there). */ static Indirect *ext4_find_shared(struct inode *inode, int depth, ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) { Indirect *partial, *p; int k, err; *top = 0; /* Make k index the deepest non-null offset + 1 */ for (k = depth; k > 1 && !offsets[k-1]; k--) ; partial = ext4_get_branch(inode, k, offsets, chain, &err); /* Writer: pointers */ if (!partial) partial = chain + k-1; /* * If the branch acquired continuation since we've looked at it - * fine, it should all survive and (new) top doesn't belong to us. */ if (!partial->key && *partial->p) /* Writer: end */ goto no_top; for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) ; /* * OK, we've found the last block that must survive. The rest of our * branch should be detached before unlocking. However, if that rest * of branch is all ours and does not grow immediately from the inode * it's easier to cheat and just decrement partial->p. */ if (p == chain + k - 1 && p > chain) { p->p--; } else { *top = *p->p; /* Nope, don't do this in ext4. Must leave the tree intact */ #if 0 *p->p = 0; #endif } /* Writer: end */ while (partial > p) { brelse(partial->bh); partial--; } no_top: return partial; } /* * Zero a number of block pointers in either an inode or an indirect block. * If we restart the transaction we must again get write access to the * indirect block for further modification. * * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. * * Return 0 on success, 1 on invalid block range * and < 0 on fatal error. */ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block_to_free, unsigned long count, __le32 *first, __le32 *last) { __le32 *p; int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; if (!ext4_inode_block_valid(inode, block_to_free, count)) { EXT4_ERROR_INODE(inode, "attempt to clear invalid " "blocks %llu len %lu", (unsigned long long) block_to_free, count); return 1; } err = ext4_ind_truncate_ensure_credits(handle, inode, bh, ext4_free_data_revoke_credits(inode, count)); if (err < 0) goto out_err; for (p = first; p < last; p++) *p = 0; ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); return 0; out_err: ext4_std_error(inode->i_sb, err); return err; } /** * ext4_free_data - free a list of data blocks * @handle: handle for this transaction * @inode: inode we are dealing with * @this_bh: indirect buffer_head which contains *@first and *@last * @first: array of block numbers * @last: points immediately past the end of array * * We are freeing all blocks referred from that array (numbers are stored as * little-endian 32-bit) and updating @inode->i_blocks appropriately. * * We accumulate contiguous runs of blocks to free. Conveniently, if these * blocks are contiguous then releasing them at one time will only affect one * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't * actually use a lot of journal space. * * @this_bh will be %NULL if @first and @last point into the inode's direct * block pointers. */ static void ext4_free_data(handle_t *handle, struct inode *inode, struct buffer_head *this_bh, __le32 *first, __le32 *last) { ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ unsigned long count = 0; /* Number of blocks in the run */ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind corresponding to block_to_free */ ext4_fsblk_t nr; /* Current block # */ __le32 *p; /* Pointer into inode/ind for current block */ int err = 0; if (this_bh) { /* For indirect block */ BUFFER_TRACE(this_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, this_bh, EXT4_JTR_NONE); /* Important: if we can't update the indirect pointers * to the blocks, we can't free them. */ if (err) return; } for (p = first; p < last; p++) { nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ if (count == 0) { block_to_free = nr; block_to_free_p = p; count = 1; } else if (nr == block_to_free + count) { count++; } else { err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); if (err) break; block_to_free = nr; block_to_free_p = p; count = 1; } } } if (!err && count > 0) err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); if (err < 0) /* fatal error */ return; if (this_bh) { BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); /* * The buffer head should have an attached journal head at this * point. However, if the data is corrupted and an indirect * block pointed to itself, it would have been detached when * the block was cleared. Check for this instead of OOPSing. */ if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else EXT4_ERROR_INODE(inode, "circular indirect block detected at " "block %llu", (unsigned long long) this_bh->b_blocknr); } } /** * ext4_free_branches - free an array of branches * @handle: JBD handle for this transaction * @inode: inode we are dealing with * @parent_bh: the buffer_head which contains *@first and *@last * @first: array of block numbers * @last: pointer immediately past the end of array * @depth: depth of the branches to free * * We are freeing all blocks referred from these branches (numbers are * stored as little-endian 32-bit) and updating @inode->i_blocks * appropriately. */ static void ext4_free_branches(handle_t *handle, struct inode *inode, struct buffer_head *parent_bh, __le32 *first, __le32 *last, int depth) { ext4_fsblk_t nr; __le32 *p; if (ext4_handle_is_aborted(handle)) return; if (depth--) { struct buffer_head *bh; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); p = last; while (--p >= first) { nr = le32_to_cpu(*p); if (!nr) continue; /* A hole */ if (!ext4_inode_block_valid(inode, nr, 1)) { EXT4_ERROR_INODE(inode, "invalid indirect mapped " "block %lu (level %d)", (unsigned long) nr, depth); break; } /* Go read the buffer for the next level down */ bh = ext4_sb_bread(inode->i_sb, nr, 0); /* * A read failure? Report error and clear slot * (should be rare). */ if (IS_ERR(bh)) { ext4_error_inode_block(inode, nr, -PTR_ERR(bh), "Read failure"); continue; } /* This zaps the entire block. Bottom up. */ BUFFER_TRACE(bh, "free child branches"); ext4_free_branches(handle, inode, bh, (__le32 *) bh->b_data, (__le32 *) bh->b_data + addr_per_block, depth); brelse(bh); /* * Everything below this pointer has been * released. Now let this top-of-subtree go. * * We want the freeing of this indirect block to be * atomic in the journal with the updating of the * bitmap block which owns it. So make some room in * the journal. * * We zero the parent pointer *after* freeing its * pointee in the bitmaps, so if extend_transaction() * for some reason fails to put the bitmap changes and * the release into the same transaction, recovery * will merely complain about releasing a free block, * rather than leaking blocks. */ if (ext4_handle_is_aborted(handle)) return; if (ext4_ind_truncate_ensure_credits(handle, inode, NULL, ext4_free_metadata_revoke_credits( inode->i_sb, 1)) < 0) return; /* * The forget flag here is critical because if * we are journaling (and not doing data * journaling), we have to make sure a revoke * record is written to prevent the journal * replay from overwriting the (former) * indirect block if it gets reallocated as a * data block. This must happen in the same * transaction where the data blocks are * actually freed. */ ext4_free_blocks(handle, inode, NULL, nr, 1, EXT4_FREE_BLOCKS_METADATA| EXT4_FREE_BLOCKS_FORGET); if (parent_bh) { /* * The block which we have just freed is * pointed to by an indirect block: journal it */ BUFFER_TRACE(parent_bh, "get_write_access"); if (!ext4_journal_get_write_access(handle, inode->i_sb, parent_bh, EXT4_JTR_NONE)) { *p = 0; BUFFER_TRACE(parent_bh, "call ext4_handle_dirty_metadata"); ext4_handle_dirty_metadata(handle, inode, parent_bh); } } } } else { /* We have reached the bottom of the tree. */ BUFFER_TRACE(parent_bh, "free data blocks"); ext4_free_data(handle, inode, parent_bh, first, last); } } void ext4_ind_truncate(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; unsigned blocksize = inode->i_sb->s_blocksize; last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) return; } ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate * the new, shorter inode size (held for now in i_size) into the * on-disk inode. We do this via i_disksize, which is the value which * ext4 *really* writes onto the disk inode. */ ei->i_disksize = inode->i_size; if (last_block == max_block) { /* * It is unnecessary to free any data blocks if last_block is * equal to the indirect block limit. */ return; } else if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); goto do_indirects; } partial = ext4_find_shared(inode, n, offsets, chain, &nr); /* Kill the top of shared branch (not detached) */ if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; /* * We mark the inode dirty prior to restart, * and prior to stop. No need for it here. */ } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } /* Clear the ends of indirect blocks on the shared branch */ while (partial > chain) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32*)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { default: nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } fallthrough; case EXT4_IND_BLOCK: nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } fallthrough; case EXT4_DIND_BLOCK: nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } fallthrough; case EXT4_TIND_BLOCK: ; } } /** * ext4_ind_remove_space - remove space from the range * @handle: JBD handle for this transaction * @inode: inode we are dealing with * @start: First block to remove * @end: One block after the last block to remove (exclusive) * * Free the blocks in the defined range (end is exclusive endpoint of * range). This is used by ext4_punch_hole(). */ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); ext4_lblk_t offsets[4], offsets2[4]; Indirect chain[4], chain2[4]; Indirect *partial, *partial2; Indirect *p = NULL, *p2 = NULL; ext4_lblk_t max_block; __le32 nr = 0, nr2 = 0; int n = 0, n2 = 0; unsigned blocksize = inode->i_sb->s_blocksize; max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); if (end >= max_block) end = max_block; if ((start >= end) || (start > max_block)) return 0; n = ext4_block_to_path(inode, start, offsets, NULL); n2 = ext4_block_to_path(inode, end, offsets2, NULL); BUG_ON(n > n2); if ((n == 1) && (n == n2)) { /* We're punching only within direct block range */ ext4_free_data(handle, inode, NULL, i_data + offsets[0], i_data + offsets2[0]); return 0; } else if (n2 > n) { /* * Start and end are on a different levels so we're going to * free partial block at start, and partial block at end of * the range. If there are some levels in between then * do_indirects label will take care of that. */ if (n == 1) { /* * Start is at the direct block level, free * everything to the end of the level. */ ext4_free_data(handle, inode, NULL, i_data + offsets[0], i_data + EXT4_NDIR_BLOCKS); goto end_range; } partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } /* * Clear the ends of indirect blocks on the shared branch * at the start of the range */ while (partial > chain) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); partial--; } end_range: partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); if (nr2) { if (partial2 == chain2) { /* * Remember, end is exclusive so here we're at * the start of the next level we're not going * to free. Everything was covered by the start * of the range. */ goto do_indirects; } } else { /* * ext4_find_shared returns Indirect structure which * points to the last element which should not be * removed by truncate. But this is end of the range * in punch_hole so we need to point to the next element */ partial2->p++; } /* * Clear the ends of indirect blocks on the shared branch * at the end of the range */ while (partial2 > chain2) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); partial2--; } goto do_indirects; } /* Punch happened within the same level (n == n2) */ partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); /* Free top, but only if partial2 isn't its subtree. */ if (nr) { int level = min(partial - chain, partial2 - chain2); int i; int subtree = 1; for (i = 0; i <= level; i++) { if (offsets[i] != offsets2[i]) { subtree = 0; break; } } if (!subtree) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } } if (!nr2) { /* * ext4_find_shared returns Indirect structure which * points to the last element which should not be * removed by truncate. But this is end of the range * in punch_hole so we need to point to the next element */ partial2->p++; } while (partial > chain || partial2 > chain2) { int depth = (chain+n-1) - partial; int depth2 = (chain2+n2-1) - partial2; if (partial > chain && partial2 > chain2 && partial->bh->b_blocknr == partial2->bh->b_blocknr) { /* * We've converged on the same block. Clear the range, * then we're done. */ ext4_free_branches(handle, inode, partial->bh, partial->p + 1, partial2->p, (chain+n-1) - partial); goto cleanup; } /* * The start and end partial branches may not be at the same * level even though the punch happened within one level. So, we * give them a chance to arrive at the same level, then walk * them in step with each other until we converge on the same * block. */ if (partial > chain && depth <= depth2) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); partial--; } if (partial2 > chain2 && depth2 <= depth) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); partial2--; } } cleanup: while (p && p > chain) { BUFFER_TRACE(p->bh, "call brelse"); brelse(p->bh); p--; } while (p2 && p2 > chain2) { BUFFER_TRACE(p2->bh, "call brelse"); brelse(p2->bh); p2--; } return 0; do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { default: if (++n >= n2) break; nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } fallthrough; case EXT4_IND_BLOCK: if (++n >= n2) break; nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } fallthrough; case EXT4_DIND_BLOCK: if (++n >= n2) break; nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } fallthrough; case EXT4_TIND_BLOCK: ; } goto cleanup; } |
279 280 280 15 14 1 1 1 15 15 15 253 175 279 189 15 15 15 253 1 1 1 1 1 1 148 280 280 280 15 15 280 15 15 15 280 253 253 253 1 253 1 1 1 1 1 253 253 280 254 254 1 253 253 252 253 253 1 16 13 13 13 9 9 1 16 424 425 425 15 280 280 280 277 280 19 19 19 19 425 425 425 15 425 254 785 785 254 253 253 148 147 148 208 208 208 201 201 25 26 208 1719 1719 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: Internet Group Management Protocol [IGMP] * * This code implements the IGMP protocol as defined in RFC1112. There has * been a further revision of this protocol since which is now supported. * * If you have trouble with this module be careful what gcc you have used, * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * * Authors: * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Fixes: * * Alan Cox : Added lots of __inline__ to optimise * the memory usage of all the tiny little * functions. * Alan Cox : Dumped the header building experiment. * Alan Cox : Minor tweaks ready for multicast routing * and extended IGMP protocol. * Alan Cox : Removed a load of inline directives. Gcc 2.5.8 * writes utterly bogus code otherwise (sigh) * fixed IGMP loopback to behave in the manner * desired by mrouted, fixed the fact it has been * broken since 1.3.6 and cleaned up a few minor * points. * * Chih-Jen Chang : Tried to revise IGMP to Version 2 * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu * The enhancements are mainly based on Steve Deering's * ipmulti-3.5 source code. * Chih-Jen Chang : Added the igmp_get_mrouter_info and * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of * the mrouted version on that device. * Chih-Jen Chang : Added the max_resp_time parameter to * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter * to identify the multicast router version * and do what the IGMP version 2 specified. * Chih-Jen Chang : Added a timer to revert to IGMP V2 router * Tsu-Sheng Tsao if the specified time expired. * Alan Cox : Stop IGMP from 0.0.0.0 being accepted. * Alan Cox : Use GFP_ATOMIC in the right places. * Christian Daudt : igmp timer wasn't set for local group * memberships but was being deleted, * which caused a "del_timer() called * from %p with timer not initialized\n" * message (960131). * Christian Daudt : removed del_timer from * igmp_timer_expire function (960205). * Christian Daudt : igmp_heard_report now only calls * igmp_timer_expire if tm->running is * true (960216). * Malcolm Beattie : ttl comparison wrong in igmp_rcv made * igmp_heard_query never trigger. Expiry * miscalculation fixed in igmp_heard_query * and random() made to return unsigned to * prevent negative expiry times. * Alexey Kuznetsov: Wrong group leaving behaviour, backport * fix from pending 2.1.x patches. * Alan Cox: Forget to enable FDDI support earlier. * Alexey Kuznetsov: Fixed leaving groups on device down. * Alexey Kuznetsov: Accordance to igmp-v2-06 draft. * David L Stevens: IGMPv3 support, with help from * Vinay Kulkarni */ #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> #include <linux/pkt_sched.h> #include <linux/byteorder/generic.h> #include <net/net_namespace.h> #include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/sock.h> #include <net/checksum.h> #include <net/inet_common.h> #include <linux/netfilter_ipv4.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> #endif #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ #define IGMP_QUERY_INTERVAL (125*HZ) #define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) #define IGMP_INITIAL_REPORT_DELAY (1) /* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs! * IGMP specs require to report membership immediately after * joining a group, but we delay the first report by a * small interval. It seems more natural and still does not * contradict to specs provided this delay is small enough. */ #define IGMP_V1_SEEN(in_dev) \ (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ ((in_dev)->mr_v1_seen && \ time_before(jiffies, (in_dev)->mr_v1_seen))) #define IGMP_V2_SEEN(in_dev) \ (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ ((in_dev)->mr_v2_seen && \ time_before(jiffies, (in_dev)->mr_v2_seen))) static int unsolicited_report_interval(struct in_device *in_dev) { int interval_ms, interval_jiffies; if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) interval_ms = IN_DEV_CONF_GET( in_dev, IGMPV2_UNSOLICITED_REPORT_INTERVAL); else /* v3 */ interval_ms = IN_DEV_CONF_GET( in_dev, IGMPV3_UNSOLICITED_REPORT_INTERVAL); interval_jiffies = msecs_to_jiffies(interval_ms); /* _timer functions can't handle a delay of 0 jiffies so ensure * we always return a positive value. */ if (interval_jiffies <= 0) interval_jiffies = 1; return interval_jiffies; } static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, gfp_t gfp); static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im); static void igmpv3_clear_delrec(struct in_device *in_dev); static int sf_setstate(struct ip_mc_list *pmc); static void sf_markstate(struct ip_mc_list *pmc); #endif static void ip_mc_clear_src(struct ip_mc_list *pmc); static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta); static void ip_ma_put(struct ip_mc_list *im) { if (refcount_dec_and_test(&im->refcnt)) { in_dev_put(im->interface); kfree_rcu(im, rcu); } } #define for_each_pmc_rcu(in_dev, pmc) \ for (pmc = rcu_dereference(in_dev->mc_list); \ pmc != NULL; \ pmc = rcu_dereference(pmc->next_rcu)) #define for_each_pmc_rtnl(in_dev, pmc) \ for (pmc = rtnl_dereference(in_dev->mc_list); \ pmc != NULL; \ pmc = rtnl_dereference(pmc->next_rcu)) static void ip_sf_list_clear_all(struct ip_sf_list *psf) { struct ip_sf_list *next; while (psf) { next = psf->sf_next; kfree(psf); psf = next; } } #ifdef CONFIG_IP_MULTICAST /* * Timer management */ static void igmp_stop_timer(struct ip_mc_list *im) { spin_lock_bh(&im->lock); if (del_timer(&im->timer)) refcount_dec(&im->refcnt); im->tm_running = 0; im->reporter = 0; im->unsolicit_count = 0; spin_unlock_bh(&im->lock); } /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { int tv = get_random_u32_below(max_delay); im->tm_running = 1; if (refcount_inc_not_zero(&im->refcnt)) { if (mod_timer(&im->timer, jiffies + tv + 2)) ip_ma_put(im); } } static void igmp_gq_start_timer(struct in_device *in_dev) { int tv = get_random_u32_below(in_dev->mr_maxdelay); unsigned long exp = jiffies + tv + 2; if (in_dev->mr_gq_running && time_after_eq(exp, (in_dev->mr_gq_timer).expires)) return; in_dev->mr_gq_running = 1; if (!mod_timer(&in_dev->mr_gq_timer, exp)) in_dev_hold(in_dev); } static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) { int tv = get_random_u32_below(delay); if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); } static void igmp_mod_timer(struct ip_mc_list *im, int max_delay) { spin_lock_bh(&im->lock); im->unsolicit_count = 0; if (del_timer(&im->timer)) { if ((long)(im->timer.expires-jiffies) < max_delay) { add_timer(&im->timer); im->tm_running = 1; spin_unlock_bh(&im->lock); return; } refcount_dec(&im->refcnt); } igmp_start_timer(im, max_delay); spin_unlock_bh(&im->lock); } /* * Send an IGMP report. */ #define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4) static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type, int gdeleted, int sdeleted) { switch (type) { case IGMPV3_MODE_IS_INCLUDE: case IGMPV3_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) return 0; if (!(pmc->gsquery && !psf->sf_gsresp)) { if (pmc->sfmode == MCAST_INCLUDE) return 1; /* don't include if this source is excluded * in all filters */ if (psf->sf_count[MCAST_INCLUDE]) return type == IGMPV3_MODE_IS_INCLUDE; return pmc->sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; } return 0; case IGMPV3_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) return 0; return psf->sf_count[MCAST_INCLUDE] != 0; case IGMPV3_CHANGE_TO_EXCLUDE: if (gdeleted || sdeleted) return 0; if (pmc->sfcount[MCAST_EXCLUDE] == 0 || psf->sf_count[MCAST_INCLUDE]) return 0; return pmc->sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; case IGMPV3_ALLOW_NEW_SOURCES: if (gdeleted || !psf->sf_crcount) return 0; return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted; case IGMPV3_BLOCK_OLD_SOURCES: if (pmc->sfmode == MCAST_INCLUDE) return gdeleted || (psf->sf_crcount && sdeleted); return psf->sf_crcount && !gdeleted && !sdeleted; } return 0; } static int igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct ip_sf_list *psf; int scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; } return scount; } /* source address selection per RFC 3376 section 4.2.13 */ static __be32 igmpv3_get_srcaddr(struct net_device *dev, const struct flowi4 *fl4) { struct in_device *in_dev = __in_dev_get_rcu(dev); const struct in_ifaddr *ifa; if (!in_dev) return htonl(INADDR_ANY); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (fl4->saddr == ifa->ifa_local) return fl4->saddr; } return htonl(INADDR_ANY); } static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; struct rtable *rt; struct iphdr *pip; struct igmpv3_report *pig; struct net *net = dev_net(dev); struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int size; size = min(mtu, IP_MAX_MTU); while (1) { skb = alloc_skb(size + hlen + tlen, GFP_ATOMIC | __GFP_NOWARN); if (skb) break; size >>= 1; if (size < 256) return NULL; } skb->priority = TC_PRIO_CONTROL; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) { kfree_skb(skb); return NULL; } skb_dst_set(skb, &rt->dst); skb->dev = dev; skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); skb_reset_network_header(skb); pip = ip_hdr(skb); skb_put(skb, sizeof(struct iphdr) + 4); pip->version = 4; pip->ihl = (sizeof(struct iphdr)+4)>>2; pip->tos = 0xc0; pip->frag_off = htons(IP_DF); pip->ttl = 1; pip->daddr = fl4.daddr; rcu_read_lock(); pip->saddr = igmpv3_get_srcaddr(dev, &fl4); rcu_read_unlock(); pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(net, skb, NULL); ((u8 *)&pip[1])[0] = IPOPT_RA; ((u8 *)&pip[1])[1] = 4; ((u8 *)&pip[1])[2] = 0; ((u8 *)&pip[1])[3] = 0; skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; skb_put(skb, sizeof(*pig)); pig = igmpv3_report_hdr(skb); pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT; pig->resv1 = 0; pig->csum = 0; pig->resv2 = 0; pig->ngrec = 0; return skb; } static int igmpv3_sendpack(struct sk_buff *skb) { struct igmphdr *pig = igmp_hdr(skb); const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb); pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) { return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc, type, gdel, sdel); } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, int type, struct igmpv3_grec **ppgr, unsigned int mtu) { struct net_device *dev = pmc->interface->dev; struct igmpv3_report *pih; struct igmpv3_grec *pgr; if (!skb) { skb = igmpv3_newpack(dev, mtu); if (!skb) return NULL; } pgr = skb_put(skb, sizeof(struct igmpv3_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->multiaddr; pih = igmpv3_report_hdr(skb); pih->ngrec = htons(ntohs(pih->ngrec)+1); *ppgr = pgr; return skb; } #define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct net_device *dev = pmc->interface->dev; struct net *net = dev_net(dev); struct igmpv3_report *pih; struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; unsigned int mtu; if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; if (ipv4_is_local_multicast(pmc->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return skb; mtu = READ_ONCE(dev->mtu); if (mtu < IPV4_MIN_MTU) return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; truncate = type == IGMPV3_MODE_IS_EXCLUDE || type == IGMPV3_CHANGE_TO_EXCLUDE; stotal = scount = 0; psf_list = sdeleted ? &pmc->tomb : &pmc->sources; if (!*psf_list) goto empty_source; pih = skb ? igmpv3_report_hdr(skb) : NULL; /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { if (pih && pih->ngrec && AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) igmpv3_sendpack(skb); skb = igmpv3_newpack(dev, mtu); } } first = 1; psf_prev = NULL; for (psf = *psf_list; psf; psf = psf_next) { __be32 *psrc; psf_next = psf->sf_next; if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { psf_prev = psf; continue; } /* Based on RFC3376 5.1. Should not send source-list change * records when there is a filter mode change. */ if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) || (!gdeleted && pmc->crcount)) && (type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) goto decrease_sf_crcount; /* clear marks on query responses */ if (isquery) psf->sf_gsresp = 0; if (AVAILABLE(skb) < sizeof(__be32) + first*sizeof(struct igmpv3_grec)) { if (truncate && !first) break; /* truncate these */ if (pgr) pgr->grec_nsrcs = htons(scount); if (skb) igmpv3_sendpack(skb); skb = igmpv3_newpack(dev, mtu); first = 1; scount = 0; } if (first) { skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) return NULL; psrc = skb_put(skb, sizeof(__be32)); *psrc = psf->sf_inaddr; scount++; stotal++; if ((type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) { decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) psf_prev->sf_next = psf->sf_next; else *psf_list = psf->sf_next; kfree(psf); continue; } } psf_prev = psf; } empty_source: if (!stotal) { if (type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) return skb; if (pmc->crcount || isquery) { /* make sure we have room for group header */ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) { igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) pgr->grec_nsrcs = htons(scount); if (isquery) pmc->gsquery = 0; /* clear query state on report */ return skb; } static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) { struct sk_buff *skb = NULL; struct net *net = dev_net(in_dev->dev); int type; if (!pmc) { rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(pmc->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; else type = IGMPV3_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); spin_unlock_bh(&pmc->lock); } rcu_read_unlock(); } else { spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; else type = IGMPV3_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); spin_unlock_bh(&pmc->lock); } if (!skb) return 0; return igmpv3_sendpack(skb); } /* * remove zero-count source records from a source filter list */ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) { struct ip_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; for (psf = *ppsf; psf; psf = psf_next) { psf_next = psf->sf_next; if (psf->sf_crcount == 0) { if (psf_prev) psf_prev->sf_next = psf->sf_next; else *ppsf = psf->sf_next; kfree(psf); } else psf_prev = psf; } } static void kfree_pmc(struct ip_mc_list *pmc) { ip_sf_list_clear_all(pmc->sources); ip_sf_list_clear_all(pmc->tomb); kfree(pmc); } static void igmpv3_send_cr(struct in_device *in_dev) { struct ip_mc_list *pmc, *pmc_prev, *pmc_next; struct sk_buff *skb = NULL; int type, dtype; rcu_read_lock(); spin_lock_bh(&in_dev->mc_tomb_lock); /* deleted MCA's */ pmc_prev = NULL; for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) { pmc_next = pmc->next; if (pmc->sfmode == MCAST_INCLUDE) { type = IGMPV3_BLOCK_OLD_SOURCES; dtype = IGMPV3_BLOCK_OLD_SOURCES; skb = add_grec(skb, pmc, type, 1, 0); skb = add_grec(skb, pmc, dtype, 1, 1); } if (pmc->crcount) { if (pmc->sfmode == MCAST_EXCLUDE) { type = IGMPV3_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 1, 0); } pmc->crcount--; if (pmc->crcount == 0) { igmpv3_clear_zeros(&pmc->tomb); igmpv3_clear_zeros(&pmc->sources); } } if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) { if (pmc_prev) pmc_prev->next = pmc_next; else in_dev->mc_tomb = pmc_next; in_dev_put(pmc->interface); kfree_pmc(pmc); } else pmc_prev = pmc; } spin_unlock_bh(&in_dev->mc_tomb_lock); /* change recs */ for_each_pmc_rcu(in_dev, pmc) { spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) { type = IGMPV3_BLOCK_OLD_SOURCES; dtype = IGMPV3_ALLOW_NEW_SOURCES; } else { type = IGMPV3_ALLOW_NEW_SOURCES; dtype = IGMPV3_BLOCK_OLD_SOURCES; } skb = add_grec(skb, pmc, type, 0, 0); skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ /* filter mode changes */ if (pmc->crcount) { if (pmc->sfmode == MCAST_EXCLUDE) type = IGMPV3_CHANGE_TO_EXCLUDE; else type = IGMPV3_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); pmc->crcount--; } spin_unlock_bh(&pmc->lock); } rcu_read_unlock(); if (!skb) return; (void) igmpv3_sendpack(skb); } static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, int type) { struct sk_buff *skb; struct iphdr *iph; struct igmphdr *ih; struct rtable *rt; struct net_device *dev = in_dev->dev; struct net *net = dev_net(dev); __be32 group = pmc ? pmc->multiaddr : 0; struct flowi4 fl4; __be32 dst; int hlen, tlen; if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); if (ipv4_is_local_multicast(group) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return 0; if (type == IGMP_HOST_LEAVE_MESSAGE) dst = IGMP_ALL_ROUTER; else dst = group; rt = ip_route_output_ports(net, &fl4, NULL, dst, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) return -1; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC); if (!skb) { ip_rt_put(rt); return -1; } skb->priority = TC_PRIO_CONTROL; skb_dst_set(skb, &rt->dst); skb_reserve(skb, hlen); skb_reset_network_header(skb); iph = ip_hdr(skb); skb_put(skb, sizeof(struct iphdr) + 4); iph->version = 4; iph->ihl = (sizeof(struct iphdr)+4)>>2; iph->tos = 0xc0; iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; ip_select_ident(net, skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; ((u8 *)&iph[1])[3] = 0; ih = skb_put(skb, sizeof(struct igmphdr)); ih->type = type; ih->code = 0; ih->csum = 0; ih->group = group; ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); return ip_local_out(net, skb->sk, skb); } static void igmp_gq_timer_expire(struct timer_list *t) { struct in_device *in_dev = from_timer(in_dev, t, mr_gq_timer); in_dev->mr_gq_running = 0; igmpv3_send_report(in_dev, NULL); in_dev_put(in_dev); } static void igmp_ifc_timer_expire(struct timer_list *t) { struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer); u32 mr_ifc_count; igmpv3_send_cr(in_dev); restart: mr_ifc_count = READ_ONCE(in_dev->mr_ifc_count); if (mr_ifc_count) { if (cmpxchg(&in_dev->mr_ifc_count, mr_ifc_count, mr_ifc_count - 1) != mr_ifc_count) goto restart; igmp_ifc_start_timer(in_dev, unsolicited_report_interval(in_dev)); } in_dev_put(in_dev); } static void igmp_ifc_event(struct in_device *in_dev) { struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv)); igmp_ifc_start_timer(in_dev, 1); } static void igmp_timer_expire(struct timer_list *t) { struct ip_mc_list *im = from_timer(im, t, timer); struct in_device *in_dev = im->interface; spin_lock(&im->lock); im->tm_running = 0; if (im->unsolicit_count && --im->unsolicit_count) igmp_start_timer(im, unsolicited_report_interval(in_dev)); im->reporter = 1; spin_unlock(&im->lock); if (IGMP_V1_SEEN(in_dev)) igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT); else if (IGMP_V2_SEEN(in_dev)) igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT); else igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT); ip_ma_put(im); } /* mark EXCLUDE-mode sources */ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) { struct ip_sf_list *psf; int i, scount; scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) break; if (srcs[i] == psf->sf_inaddr) { scount++; break; } } } pmc->gsquery = 0; if (scount == nsrcs) /* all sources excluded */ return 0; return 1; } static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) { struct ip_sf_list *psf; int i, scount; if (pmc->sfmode == MCAST_EXCLUDE) return igmp_xmarksources(pmc, nsrcs, srcs); /* mark INCLUDE-mode sources */ scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) if (srcs[i] == psf->sf_inaddr) { psf->sf_gsresp = 1; scount++; break; } } if (!scount) { pmc->gsquery = 0; return 0; } pmc->gsquery = 1; return 1; } /* return true if packet was dropped */ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) { struct ip_mc_list *im; struct net *net = dev_net(in_dev->dev); /* Timers are only set for non-local groups */ if (group == IGMP_ALL_HOSTS) return false; if (ipv4_is_local_multicast(group) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return false; rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == group) { igmp_stop_timer(im); break; } } rcu_read_unlock(); return false; } /* return true if packet was dropped */ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int len) { struct igmphdr *ih = igmp_hdr(skb); struct igmpv3_query *ih3 = igmpv3_query_hdr(skb); struct ip_mc_list *im; __be32 group = ih->group; int max_delay; int mark = 0; struct net *net = dev_net(in_dev->dev); if (len == 8) { if (ih->code == 0) { /* Alas, old v1 router presents here. */ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + (in_dev->mr_qrv * in_dev->mr_qi) + in_dev->mr_qri; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + (in_dev->mr_qrv * in_dev->mr_qi) + in_dev->mr_qri; } /* cancel the interface change timer */ WRITE_ONCE(in_dev->mr_ifc_count, 0); if (del_timer(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); /* clear deleted report items */ igmpv3_clear_delrec(in_dev); } else if (len < 12) { return true; /* ignore bogus packet; freed by caller */ } else if (IGMP_V1_SEEN(in_dev)) { /* This is a v3 query with v1 queriers present */ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; group = 0; } else if (IGMP_V2_SEEN(in_dev)) { /* this is a v3 query with v2 queriers present; * Interpretation of the max_delay code is problematic here. * A real v2 host would use ih_code directly, while v3 has a * different encoding. We use the v3 encoding as more likely * to be intended in a v3 query. */ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ } else { /* v3 */ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return true; ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) { if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) + ntohs(ih3->nsrcs)*sizeof(__be32))) return true; ih3 = igmpv3_query_hdr(skb); } max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ in_dev->mr_maxdelay = max_delay; /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently * received value was zero, use the default or statically * configured value. */ in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; /* RFC3376, 8.3. Query Response Interval: * The number of seconds represented by the [Query Response * Interval] must be less than the [Query Interval]. */ if (in_dev->mr_qri >= in_dev->mr_qi) in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ; if (!group) { /* general query */ if (ih3->nsrcs) return true; /* no sources allowed */ igmp_gq_start_timer(in_dev); return false; } /* mark sources to include, if group & source-specific */ mark = ih3->nsrcs != 0; } /* * - Start the timers in all of our membership records * that the query applies to for the interface on * which the query arrived excl. those that belong * to a "local" group (224.0.0.X) * - For timers already running check if they need to * be reset. * - Use the igmp->igmp_code field as the maximum * delay possible */ rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { int changed; if (group && group != im->multiaddr) continue; if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&im->lock); if (im->tm_running) im->gsquery = im->gsquery && mark; else im->gsquery = mark; changed = !im->gsquery || igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs); spin_unlock_bh(&im->lock); if (changed) igmp_mod_timer(im, max_delay); } rcu_read_unlock(); return false; } /* called in rcu_read_lock() section */ int igmp_rcv(struct sk_buff *skb) { /* This basically follows the spec line by line -- see RFC1112 */ struct igmphdr *ih; struct net_device *dev = skb->dev; struct in_device *in_dev; int len = skb->len; bool dropped = true; if (netif_is_l3_master(dev)) { dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif); if (!dev) goto drop; } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto drop; if (!pskb_may_pull(skb, sizeof(struct igmphdr))) goto drop; if (skb_checksum_simple_validate(skb)) goto drop; ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_QUERY: dropped = igmp_heard_query(in_dev, skb, len); break; case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? */ if (rt_is_output_route(skb_rtable(skb))) break; /* don't rely on MC router hearing unicast reports */ if (skb->pkt_type == PACKET_MULTICAST || skb->pkt_type == PACKET_BROADCAST) dropped = igmp_heard_report(in_dev, ih->group); break; case IGMP_PIM: #ifdef CONFIG_IP_PIMSM_V1 return pim_rcv_v1(skb); #endif case IGMPV3_HOST_MEMBERSHIP_REPORT: case IGMP_DVMRP: case IGMP_TRACE: case IGMP_HOST_LEAVE_MESSAGE: case IGMP_MTRACE: case IGMP_MTRACE_RESP: break; default: break; } drop: if (dropped) kfree_skb(skb); else consume_skb(skb); return 0; } #endif /* * Add a filter to a device */ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr) { char buf[MAX_ADDR_LEN]; struct net_device *dev = in_dev->dev; /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. We will get multicast token leakage, when IFF_MULTICAST is changed. This check should be done in ndo_set_rx_mode routine. Something sort of: if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } --ANK */ if (arp_mc_map(addr, buf, dev, 0) == 0) dev_mc_add(dev, buf); } /* * Remove a filter from a device */ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr) { char buf[MAX_ADDR_LEN]; struct net_device *dev = in_dev->dev; if (arp_mc_map(addr, buf, dev, 0) == 0) dev_mc_del(dev, buf); } #ifdef CONFIG_IP_MULTICAST /* * deleted ip_mc_list manipulation */ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, gfp_t gfp) { struct ip_mc_list *pmc; struct net *net = dev_net(in_dev->dev); /* this is an "ip_mc_list" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not * used for management of the delete list. Using the same structure * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ pmc = kzalloc(sizeof(*pmc), gfp); if (!pmc) return; spin_lock_init(&pmc->lock); spin_lock_bh(&im->lock); pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; pmc->tomb = im->tomb; pmc->sources = im->sources; im->tomb = im->sources = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = pmc->crcount; } spin_unlock_bh(&im->lock); spin_lock_bh(&in_dev->mc_tomb_lock); pmc->next = in_dev->mc_tomb; in_dev->mc_tomb = pmc; spin_unlock_bh(&in_dev->mc_tomb_lock); } /* * restore ip_mc_list deleted records */ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list *pmc, *pmc_prev; struct ip_sf_list *psf; struct net *net = dev_net(in_dev->dev); __be32 multiaddr = im->multiaddr; spin_lock_bh(&in_dev->mc_tomb_lock); pmc_prev = NULL; for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) { if (pmc->multiaddr == multiaddr) break; pmc_prev = pmc; } if (pmc) { if (pmc_prev) pmc_prev->next = pmc->next; else in_dev->mc_tomb = pmc->next; } spin_unlock_bh(&in_dev->mc_tomb_lock); spin_lock_bh(&im->lock); if (pmc) { im->interface = pmc->interface; if (im->sfmode == MCAST_INCLUDE) { swap(im->tomb, pmc->tomb); swap(im->sources, pmc->sources); for (psf = im->sources; psf; psf = psf->sf_next) psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); } else { im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); } in_dev_put(pmc->interface); kfree_pmc(pmc); } spin_unlock_bh(&im->lock); } /* * flush ip_mc_list deleted records */ static void igmpv3_clear_delrec(struct in_device *in_dev) { struct ip_mc_list *pmc, *nextpmc; spin_lock_bh(&in_dev->mc_tomb_lock); pmc = in_dev->mc_tomb; in_dev->mc_tomb = NULL; spin_unlock_bh(&in_dev->mc_tomb_lock); for (; pmc; pmc = nextpmc) { nextpmc = pmc->next; ip_mc_clear_src(pmc); in_dev_put(pmc->interface); kfree_pmc(pmc); } /* clear dead sources, too */ rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { struct ip_sf_list *psf; spin_lock_bh(&pmc->lock); psf = pmc->tomb; pmc->tomb = NULL; spin_unlock_bh(&pmc->lock); ip_sf_list_clear_all(psf); } rcu_read_unlock(); } #endif static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); int reporter; #endif if (im->loaded) { im->loaded = 0; ip_mc_filter_del(in_dev, im->multiaddr); } #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; reporter = im->reporter; igmp_stop_timer(im); if (!in_dev->dead) { if (IGMP_V1_SEEN(in_dev)) return; if (IGMP_V2_SEEN(in_dev)) { if (reporter) igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); return; } /* IGMPv3 */ igmpv3_add_delrec(in_dev, im, gfp); igmp_ifc_event(in_dev); } #endif } static void igmp_group_dropped(struct ip_mc_list *im) { __igmp_group_dropped(im, GFP_KERNEL); } static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); #endif if (im->loaded == 0) { im->loaded = 1; ip_mc_filter_add(in_dev, im->multiaddr); } #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; if (in_dev->dead) return; im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); spin_unlock_bh(&im->lock); return; } /* else, v3 */ /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should * not send filter-mode change record as the mode should be from * IN() to IN(A). */ if (im->sfmode == MCAST_EXCLUDE) im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); igmp_ifc_event(in_dev); #endif } /* * Multicast list managers */ static u32 ip_mc_hash(const struct ip_mc_list *im) { return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG); } static void ip_mc_hash_add(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list __rcu **mc_hash; u32 hash; mc_hash = rtnl_dereference(in_dev->mc_hash); if (mc_hash) { hash = ip_mc_hash(im); im->next_hash = mc_hash[hash]; rcu_assign_pointer(mc_hash[hash], im); return; } /* do not use a hash table for small number of items */ if (in_dev->mc_count < 4) return; mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG, GFP_KERNEL); if (!mc_hash) return; for_each_pmc_rtnl(in_dev, im) { hash = ip_mc_hash(im); im->next_hash = mc_hash[hash]; RCU_INIT_POINTER(mc_hash[hash], im); } rcu_assign_pointer(in_dev->mc_hash, mc_hash); } static void ip_mc_hash_remove(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash); struct ip_mc_list *aux; if (!mc_hash) return; mc_hash += ip_mc_hash(im); while ((aux = rtnl_dereference(*mc_hash)) != im) mc_hash = &aux->next_hash; *mc_hash = im->next_hash; } /* * A socket has joined a multicast group on device dev. */ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode, gfp_t gfp) { struct ip_mc_list *im; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == addr) { im->users++; ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); goto out; } } im = kzalloc(sizeof(*im), gfp); if (!im) goto out; im->users = 1; im->interface = in_dev; in_dev_hold(in_dev); im->multiaddr = addr; /* initial mode is (EX, empty) */ im->sfmode = mode; im->sfcount[mode] = 1; refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST timer_setup(&im->timer, igmp_timer_expire, 0); #endif im->next_rcu = in_dev->mc_list; in_dev->mc_count++; rcu_assign_pointer(in_dev->mc_list, im); ip_mc_hash_add(in_dev, im); #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im); #endif igmp_group_added(im); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: return; } void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) { ____ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE, gfp); } EXPORT_SYMBOL(__ip_mc_inc_group); void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { __ip_mc_inc_group(in_dev, addr, GFP_KERNEL); } EXPORT_SYMBOL(ip_mc_inc_group); static int ip_mc_check_iphdr(struct sk_buff *skb) { const struct iphdr *iph; unsigned int len; unsigned int offset = skb_network_offset(skb) + sizeof(*iph); if (!pskb_may_pull(skb, offset)) return -EINVAL; iph = ip_hdr(skb); if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) return -EINVAL; offset += ip_hdrlen(skb) - sizeof(*iph); if (!pskb_may_pull(skb, offset)) return -EINVAL; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) return -EINVAL; len = skb_network_offset(skb) + ntohs(iph->tot_len); if (skb->len < len || len < offset) return -EINVAL; skb_set_transport_header(skb, offset); return 0; } static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb); len += sizeof(struct igmpv3_report); return ip_mc_may_pull(skb, len) ? 0 : -EINVAL; } static int ip_mc_check_igmp_query(struct sk_buff *skb) { unsigned int transport_len = ip_transport_len(skb); unsigned int len; /* IGMPv{1,2}? */ if (transport_len != sizeof(struct igmphdr)) { /* or IGMPv3? */ if (transport_len < sizeof(struct igmpv3_query)) return -EINVAL; len = skb_transport_offset(skb) + sizeof(struct igmpv3_query); if (!ip_mc_may_pull(skb, len)) return -EINVAL; } /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer * all-systems destination addresses (224.0.0.1) for general queries */ if (!igmp_hdr(skb)->group && ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) return -EINVAL; return 0; } static int ip_mc_check_igmp_msg(struct sk_buff *skb) { switch (igmp_hdr(skb)->type) { case IGMP_HOST_LEAVE_MESSAGE: case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: return 0; case IGMPV3_HOST_MEMBERSHIP_REPORT: return ip_mc_check_igmp_reportv3(skb); case IGMP_HOST_MEMBERSHIP_QUERY: return ip_mc_check_igmp_query(skb); default: return -ENOMSG; } } static __sum16 ip_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_simple_validate(skb); } static int ip_mc_check_igmp_csum(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); unsigned int transport_len = ip_transport_len(skb); struct sk_buff *skb_chk; if (!ip_mc_may_pull(skb, len)) return -EINVAL; skb_chk = skb_checksum_trimmed(skb, transport_len, ip_mc_validate_checksum); if (!skb_chk) return -EINVAL; if (skb_chk != skb) kfree_skb(skb_chk); return 0; } /** * ip_mc_check_igmp - checks whether this is a sane IGMP packet * @skb: the skb to validate * * Checks whether an IPv4 packet is a valid IGMP packet. If so sets * skb transport header accordingly and returns zero. * * -EINVAL: A broken packet was detected, i.e. it violates some internet * standard * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. * -ENOMEM: A memory allocation failure happened. * * Caller needs to set the skb network header and free any returned skb if it * differs from the provided skb. */ int ip_mc_check_igmp(struct sk_buff *skb) { int ret = ip_mc_check_iphdr(skb); if (ret < 0) return ret; if (ip_hdr(skb)->protocol != IPPROTO_IGMP) return -ENOMSG; ret = ip_mc_check_igmp_csum(skb); if (ret < 0) return ret; return ip_mc_check_igmp_msg(skb); } EXPORT_SYMBOL(ip_mc_check_igmp); /* * Resend IGMP JOIN report; used by netdev notifier. */ static void ip_mc_rejoin_groups(struct in_device *in_dev) { #ifdef CONFIG_IP_MULTICAST struct ip_mc_list *im; int type; struct net *net = dev_net(in_dev->dev); ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; /* a failover is happening and switches * must be notified immediately */ if (IGMP_V1_SEEN(in_dev)) type = IGMP_HOST_MEMBERSHIP_REPORT; else if (IGMP_V2_SEEN(in_dev)) type = IGMPV2_HOST_MEMBERSHIP_REPORT; else type = IGMPV3_HOST_MEMBERSHIP_REPORT; igmp_send_report(in_dev, im, type); } #endif } /* * A socket has left a multicast group on device dev */ void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) { struct ip_mc_list *i; struct ip_mc_list __rcu **ip; ASSERT_RTNL(); for (ip = &in_dev->mc_list; (i = rtnl_dereference(*ip)) != NULL; ip = &i->next_rcu) { if (i->multiaddr == addr) { if (--i->users == 0) { ip_mc_hash_remove(in_dev, i); *ip = i->next_rcu; in_dev->mc_count--; __igmp_group_dropped(i, gfp); ip_mc_clear_src(i); if (!in_dev->dead) ip_rt_multicast_event(in_dev); ip_ma_put(i); return; } break; } } } EXPORT_SYMBOL(__ip_mc_dec_group); /* Device changing type */ void ip_mc_unmap(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) igmp_group_dropped(pmc); } void ip_mc_remap(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) { #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif igmp_group_added(pmc); } } /* Device going down */ void ip_mc_down(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) igmp_group_dropped(pmc); #ifdef CONFIG_IP_MULTICAST WRITE_ONCE(in_dev->mr_ifc_count, 0); if (del_timer(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); in_dev->mr_gq_running = 0; if (del_timer(&in_dev->mr_gq_timer)) __in_dev_put(in_dev); #endif ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } #ifdef CONFIG_IP_MULTICAST static void ip_mc_reset(struct in_device *in_dev) { struct net *net = dev_net(in_dev->dev); in_dev->mr_qi = IGMP_QUERY_INTERVAL; in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv); } #else static void ip_mc_reset(struct in_device *in_dev) { } #endif void ip_mc_init_dev(struct in_device *in_dev) { ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0); timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0); #endif ip_mc_reset(in_dev); spin_lock_init(&in_dev->mc_tomb_lock); } /* Device going up */ void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); ip_mc_reset(in_dev); ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) { #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif igmp_group_added(pmc); } } /* * Device is about to be destroyed: clean up. */ void ip_mc_destroy_dev(struct in_device *in_dev) { struct ip_mc_list *i; ASSERT_RTNL(); /* Deactivate timers */ ip_mc_down(in_dev); #ifdef CONFIG_IP_MULTICAST igmpv3_clear_delrec(in_dev); #endif while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { in_dev->mc_list = i->next_rcu; in_dev->mc_count--; ip_mc_clear_src(i); ip_ma_put(i); } } /* RTNL is locked */ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) { struct net_device *dev = NULL; struct in_device *idev = NULL; if (imr->imr_ifindex) { idev = inetdev_by_index(net, imr->imr_ifindex); return idev; } if (imr->imr_address.s_addr) { dev = __ip_dev_find(net, imr->imr_address.s_addr, false); if (!dev) return NULL; } if (!dev) { struct rtable *rt = ip_route_output(net, imr->imr_multiaddr.s_addr, 0, 0, 0); if (!IS_ERR(rt)) { dev = rt->dst.dev; ip_rt_put(rt); } } if (dev) { imr->imr_ifindex = dev->ifindex; idev = __in_dev_get_rtnl(dev); } return idev; } /* * Join a socket to a group */ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) { struct ip_sf_list *psf, *psf_prev; int rv = 0; psf_prev = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; } if (!psf || psf->sf_count[sfmode] == 0) { /* source filter not found, or count wrong => bug */ return -ESRCH; } psf->sf_count[sfmode]--; if (psf->sf_count[sfmode] == 0) { ip_rt_multicast_event(pmc->interface); } if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct in_device *in_dev = pmc->interface; struct net *net = dev_net(in_dev->dev); #endif /* no more filters for this source */ if (psf_prev) psf_prev->sf_next = psf->sf_next; else pmc->sources = psf->sf_next; #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; } else #endif kfree(psf); } return rv; } #ifndef CONFIG_IP_MULTICAST #define igmp_ifc_event(x) do { } while (0) #endif static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta) { struct ip_mc_list *pmc; int changerec = 0; int i, err; if (!in_dev) return -ENODEV; rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (*pmca == pmc->multiaddr) break; } if (!pmc) { /* MCA not found?? bug */ rcu_read_unlock(); return -ESRCH; } spin_lock_bh(&pmc->lock); rcu_read_unlock(); #ifdef CONFIG_IP_MULTICAST sf_markstate(pmc); #endif if (!delta) { err = -EINVAL; if (!pmc->sfcount[sfmode]) goto out_unlock; pmc->sfcount[sfmode]--; } err = 0; for (i = 0; i < sfcount; i++) { int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; if (!err && rv < 0) err = rv; } if (pmc->sfmode == MCAST_EXCLUDE && pmc->sfcount[MCAST_EXCLUDE] == 0 && pmc->sfcount[MCAST_INCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; struct net *net = dev_net(in_dev->dev); #endif /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(pmc->interface); } else if (sf_setstate(pmc) || changerec) { igmp_ifc_event(pmc->interface); #endif } out_unlock: spin_unlock_bh(&pmc->lock); return err; } /* * Add multicast single-source filter to the interface list */ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) { struct ip_sf_list *psf, *psf_prev; psf_prev = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; } if (!psf) { psf = kzalloc(sizeof(*psf), GFP_ATOMIC); if (!psf) return -ENOBUFS; psf->sf_inaddr = *psfsrc; if (psf_prev) { psf_prev->sf_next = psf; } else pmc->sources = psf; } psf->sf_count[sfmode]++; if (psf->sf_count[sfmode] == 1) { ip_rt_multicast_event(pmc->interface); } return 0; } #ifdef CONFIG_IP_MULTICAST static void sf_markstate(struct ip_mc_list *pmc) { struct ip_sf_list *psf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; for (psf = pmc->sources; psf; psf = psf->sf_next) if (pmc->sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; } static int sf_setstate(struct ip_mc_list *pmc) { struct ip_sf_list *psf, *dpsf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; int qrv = pmc->interface->mr_qrv; int new_in, rv; rv = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (pmc->sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else new_in = psf->sf_count[MCAST_INCLUDE] != 0; if (new_in) { if (!psf->sf_oldin) { struct ip_sf_list *prev = NULL; for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) { if (dpsf->sf_inaddr == psf->sf_inaddr) break; prev = dpsf; } if (dpsf) { if (prev) prev->sf_next = dpsf->sf_next; else pmc->tomb = dpsf->sf_next; kfree(dpsf); } psf->sf_crcount = qrv; rv++; } } else if (psf->sf_oldin) { psf->sf_crcount = 0; /* * add or update "delete" records if an active filter * is now inactive */ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) if (dpsf->sf_inaddr == psf->sf_inaddr) break; if (!dpsf) { dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC); if (!dpsf) continue; *dpsf = *psf; /* pmc->lock held by callers */ dpsf->sf_next = pmc->tomb; pmc->tomb = dpsf; } dpsf->sf_crcount = qrv; rv++; } } return rv; } #endif /* * Add multicast source filter list to the interface list */ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta) { struct ip_mc_list *pmc; int isexclude; int i, err; if (!in_dev) return -ENODEV; rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (*pmca == pmc->multiaddr) break; } if (!pmc) { /* MCA not found?? bug */ rcu_read_unlock(); return -ESRCH; } spin_lock_bh(&pmc->lock); rcu_read_unlock(); #ifdef CONFIG_IP_MULTICAST sf_markstate(pmc); #endif isexclude = pmc->sfmode == MCAST_EXCLUDE; if (!delta) pmc->sfcount[sfmode]++; err = 0; for (i = 0; i < sfcount; i++) { err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } if (err) { int j; if (!delta) pmc->sfcount[sfmode]--; for (j = 0; j < i; j++) (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; struct net *net = dev_net(pmc->interface->dev); in_dev = pmc->interface; #endif /* filter mode change */ if (pmc->sfcount[MCAST_EXCLUDE]) pmc->sfmode = MCAST_EXCLUDE; else if (pmc->sfcount[MCAST_INCLUDE]) pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(in_dev); } else if (sf_setstate(pmc)) { igmp_ifc_event(in_dev); #endif } spin_unlock_bh(&pmc->lock); return err; } static void ip_mc_clear_src(struct ip_mc_list *pmc) { struct ip_sf_list *tomb, *sources; spin_lock_bh(&pmc->lock); tomb = pmc->tomb; pmc->tomb = NULL; sources = pmc->sources; pmc->sources = NULL; pmc->sfmode = MCAST_EXCLUDE; pmc->sfcount[MCAST_INCLUDE] = 0; pmc->sfcount[MCAST_EXCLUDE] = 1; spin_unlock_bh(&pmc->lock); ip_sf_list_clear_all(tomb); ip_sf_list_clear_all(sources); } /* Join a multicast group */ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, unsigned int mode) { __be32 addr = imr->imr_multiaddr.s_addr; struct ip_mc_socklist *iml, *i; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); int ifindex; int count = 0; int err; ASSERT_RTNL(); if (!ipv4_is_multicast(addr)) return -EINVAL; in_dev = ip_mc_find_dev(net, imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRINUSE; ifindex = imr->imr_ifindex; for_each_pmc_rtnl(inet, i) { if (i->multi.imr_multiaddr.s_addr == addr && i->multi.imr_ifindex == ifindex) goto done; count++; } err = -ENOBUFS; if (count >= READ_ONCE(net->ipv4.sysctl_igmp_max_memberships)) goto done; iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); if (!iml) goto done; memcpy(&iml->multi, imr, sizeof(*imr)); iml->next_rcu = inet->mc_list; iml->sflist = NULL; iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL); err = 0; done: return err; } /* Join ASM (Any-Source Multicast) group */ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) { return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ip_mc_join_group); /* Join SSM (Source-Specific Multicast) group */ int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, unsigned int mode) { return __ip_mc_join_group(sk, imr, mode); } static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist); int err; if (!psf) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); } err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, psf->sl_count, psf->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc); kfree_rcu(psf, rcu); return err; } int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; struct ip_mc_socklist __rcu **imlp; struct in_device *in_dev; struct net *net = sock_net(sk); __be32 group = imr->imr_multiaddr.s_addr; u32 ifindex; int ret = -EADDRNOTAVAIL; ASSERT_RTNL(); in_dev = ip_mc_find_dev(net, imr); if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) { ret = -ENODEV; goto out; } ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = rtnl_dereference(*imlp)) != NULL; imlp = &iml->next_rcu) { if (iml->multi.imr_multiaddr.s_addr != group) continue; if (ifindex) { if (iml->multi.imr_ifindex != ifindex) continue; } else if (imr->imr_address.s_addr && imr->imr_address.s_addr != iml->multi.imr_address.s_addr) continue; (void) ip_mc_leave_src(sk, iml, in_dev); *imlp = iml->next_rcu; if (in_dev) ip_mc_dec_group(in_dev, group); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } out: return ret; } EXPORT_SYMBOL(ip_mc_leave_group); int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex) { int err; struct ip_mreqn imr; __be32 addr = mreqs->imr_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev = NULL; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; struct net *net = sock_net(sk); int leavegroup = 0; int i, j, rv; if (!ipv4_is_multicast(addr)) return -EINVAL; ASSERT_RTNL(); imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; imr.imr_address.s_addr = mreqs->imr_interface; imr.imr_ifindex = ifindex; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) { if ((pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr) && (pmc->multi.imr_ifindex == imr.imr_ifindex)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } /* if a source filter was set, must be the same mode as before */ if (pmc->sflist) { if (pmc->sfmode != omode) { err = -EINVAL; goto done; } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0, NULL, 0); pmc->sfmode = omode; } psl = rtnl_dereference(pmc->sflist); if (!add) { if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) break; } if (rv) /* source not found */ goto done; /* err = -EADDRNOTAVAIL */ /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { leavegroup = 1; goto done; } /* update the interface filter */ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; goto done; } /* else, add a new source to the filter */ if (psl && psl->sl_count >= READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) { err = -ENOBUFS; goto done; } if (!psl || psl->sl_count == psl->sl_max) { struct ip_sf_socklist *newpsl; int count = IP_SFBLOCK; if (psl) count += psl->sl_max; newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = count; newpsl->sl_count = count - IP_SFBLOCK; if (psl) { for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } rcu_assign_pointer(pmc->sflist, newpsl); if (psl) kfree_rcu(psl, rcu); psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) break; } if (rv == 0) /* address already there is an error */ goto done; for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = mreqs->imr_sourceaddr; psl->sl_count++; err = 0; /* update the interface list */ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); done: if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; } int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) { int err = 0; struct ip_mreqn imr; __be32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *newpsl, *psl; struct net *net = sock_net(sk); int leavegroup = 0; if (!ipv4_is_multicast(addr)) return -EINVAL; if (msf->imsf_fmode != MCAST_INCLUDE && msf->imsf_fmode != MCAST_EXCLUDE) return -EINVAL; ASSERT_RTNL(); imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = ifindex; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) { leavegroup = 1; goto done; } for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } if (msf->imsf_numsrc) { newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, msf->imsf_numsrc), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc; memcpy(newpsl->sl_addr, msf->imsf_slist_flex, flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc)); err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, newpsl->sl_max)); goto done; } } else { newpsl = NULL; (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, 0, NULL, 0); } psl = rtnl_dereference(pmc->sflist); if (psl) { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } else { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); } rcu_assign_pointer(pmc->sflist, newpsl); if (psl) kfree_rcu(psl, rcu); pmc->sfmode = msf->imsf_fmode; err = 0; done: if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; } int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, sockptr_t optval, sockptr_t optlen) { int err, len, count, copycount, msf_size; struct ip_mreqn imr; __be32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; struct net *net = sock_net(sk); ASSERT_RTNL(); if (!ipv4_is_multicast(addr)) return -EINVAL; imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = 0; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } if (!pmc) /* must have a prior join */ goto done; msf->imsf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); if (!psl) { count = 0; } else { count = psl->sl_count; } copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc; len = flex_array_size(psl, sl_addr, copycount); msf->imsf_numsrc = count; msf_size = IP_MSFILTER_SIZE(copycount); if (copy_to_sockptr(optlen, &msf_size, sizeof(int)) || copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0))) { return -EFAULT; } if (len && copy_to_sockptr_offset(optval, offsetof(struct ip_msfilter, imsf_slist_flex), psl->sl_addr, len)) return -EFAULT; return 0; done: return err; } int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset) { int i, count, copycount; struct sockaddr_in *psin; __be32 addr; struct ip_mc_socklist *pmc; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; ASSERT_RTNL(); psin = (struct sockaddr_in *)&gsf->gf_group; if (psin->sin_family != AF_INET) return -EINVAL; addr = psin->sin_addr.s_addr; if (!ipv4_is_multicast(addr)) return -EINVAL; for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == addr && pmc->multi.imr_ifindex == gsf->gf_interface) break; } if (!pmc) /* must have a prior join */ return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; for (i = 0; i < copycount; i++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; memset(&ss, 0, sizeof(ss)); psin->sin_family = AF_INET; psin->sin_addr.s_addr = psl->sl_addr[i]; if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss))) return -EFAULT; ss_offset += sizeof(ss); } return 0; } /* * check if a multicast source filter allows delivery for a given <src,dst,intf> */ int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif, int sdif) { const struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *pmc; struct ip_sf_socklist *psl; int i; int ret; ret = 1; if (!ipv4_is_multicast(loc_addr)) goto out; rcu_read_lock(); for_each_pmc_rcu(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && (pmc->multi.imr_ifindex == dif || (sdif && pmc->multi.imr_ifindex == sdif))) break; } ret = inet_test_bit(MC_ALL, sk); if (!pmc) goto unlock; psl = rcu_dereference(pmc->sflist); ret = (pmc->sfmode == MCAST_EXCLUDE); if (!psl) goto unlock; for (i = 0; i < psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } ret = 0; if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) goto unlock; if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) goto unlock; ret = 1; unlock: rcu_read_unlock(); out: return ret; } /* * A socket is closing. */ void ip_mc_drop_socket(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; struct net *net = sock_net(sk); if (!inet->mc_list) return; rtnl_lock(); while ((iml = rtnl_dereference(inet->mc_list)) != NULL) { struct in_device *in_dev; inet->mc_list = iml->next_rcu; in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); if (in_dev) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); } rtnl_unlock(); } /* called with rcu_read_lock() */ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto) { struct ip_mc_list *im; struct ip_mc_list __rcu **mc_hash; struct ip_sf_list *psf; int rv = 0; mc_hash = rcu_dereference(in_dev->mc_hash); if (mc_hash) { u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG); for (im = rcu_dereference(mc_hash[hash]); im != NULL; im = rcu_dereference(im->next_hash)) { if (im->multiaddr == mc_addr) break; } } else { for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == mc_addr) break; } } if (im && proto == IPPROTO_IGMP) { rv = 1; } else if (im) { if (src_addr) { spin_lock_bh(&im->lock); for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; } if (psf) rv = psf->sf_count[MCAST_INCLUDE] || psf->sf_count[MCAST_EXCLUDE] != im->sfcount[MCAST_EXCLUDE]; else rv = im->sfcount[MCAST_EXCLUDE] != 0; spin_unlock_bh(&im->lock); } else rv = 1; /* unspecified source; tentatively allow */ } return rv; } #if defined(CONFIG_PROC_FS) struct igmp_mc_iter_state { struct seq_net_private p; struct net_device *dev; struct in_device *in_dev; }; #define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private) static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct ip_mc_list *im = NULL; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; for_each_netdev_rcu(net, state->dev) { struct in_device *in_dev; in_dev = __in_dev_get_rcu(state->dev); if (!in_dev) continue; im = rcu_dereference(in_dev->mc_list); if (im) { state->in_dev = in_dev; break; } } return im; } static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); im = rcu_dereference(im->next_rcu); while (!im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->in_dev = NULL; break; } state->in_dev = __in_dev_get_rcu(state->dev); if (!state->in_dev) continue; im = rcu_dereference(state->in_dev->mc_list); } return im; } static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) { struct ip_mc_list *im = igmp_mc_get_first(seq); if (im) while (pos && (im = igmp_mc_get_next(seq, im)) != NULL) --pos; return pos ? NULL : im; } static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_mc_list *im; if (v == SEQ_START_TOKEN) im = igmp_mc_get_first(seq); else im = igmp_mc_get_next(seq, v); ++*pos; return im; } static void igmp_mc_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp_mc_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); else { struct ip_mc_list *im = v; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); char *querier; long delta; #ifdef CONFIG_IP_MULTICAST querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : IGMP_V2_SEEN(state->in_dev) ? "V2" : "V3"; #else querier = "NONE"; #endif if (rcu_access_pointer(state->in_dev->mc_list) == im) { seq_printf(seq, "%d\t%-10s: %5d %7s\n", state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); } delta = im->timer.expires - jiffies; seq_printf(seq, "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", im->multiaddr, im->users, im->tm_running, im->tm_running ? jiffies_delta_to_clock_t(delta) : 0, im->reporter); } return 0; } static const struct seq_operations igmp_mc_seq_ops = { .start = igmp_mc_seq_start, .next = igmp_mc_seq_next, .stop = igmp_mc_seq_stop, .show = igmp_mc_seq_show, }; struct igmp_mcf_iter_state { struct seq_net_private p; struct net_device *dev; struct in_device *idev; struct ip_mc_list *im; }; #define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private) static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct ip_sf_list *psf = NULL; struct ip_mc_list *im = NULL; struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); state->idev = NULL; state->im = NULL; for_each_netdev_rcu(net, state->dev) { struct in_device *idev; idev = __in_dev_get_rcu(state->dev); if (unlikely(!idev)) continue; im = rcu_dereference(idev->mc_list); if (likely(im)) { spin_lock_bh(&im->lock); psf = im->sources; if (likely(psf)) { state->im = im; state->idev = idev; break; } spin_unlock_bh(&im->lock); } } return psf; } static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); psf = psf->sf_next; while (!psf) { spin_unlock_bh(&state->im->lock); state->im = state->im->next; while (!state->im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } state->idev = __in_dev_get_rcu(state->dev); if (!state->idev) continue; state->im = rcu_dereference(state->idev->mc_list); } spin_lock_bh(&state->im->lock); psf = state->im->sources; } out: return psf; } static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos) { struct ip_sf_list *psf = igmp_mcf_get_first(seq); if (psf) while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL) --pos; return pos ? NULL : psf; } static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_sf_list *psf; if (v == SEQ_START_TOKEN) psf = igmp_mcf_get_first(seq); else psf = igmp_mcf_get_next(seq, v); ++*pos; return psf; } static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (likely(state->im)) { spin_unlock_bh(&state->im->lock); state->im = NULL; } state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp_mcf_seq_show(struct seq_file *seq, void *v) { struct ip_sf_list *psf = v; struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Idx Device MCA SRC INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s 0x%08x " "0x%08x %6lu %6lu\n", state->dev->ifindex, state->dev->name, ntohl(state->im->multiaddr), ntohl(psf->sf_inaddr), psf->sf_count[MCAST_INCLUDE], psf->sf_count[MCAST_EXCLUDE]); } return 0; } static const struct seq_operations igmp_mcf_seq_ops = { .start = igmp_mcf_seq_start, .next = igmp_mcf_seq_next, .stop = igmp_mcf_seq_stop, .show = igmp_mcf_seq_show, }; static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; int err; pde = proc_create_net("igmp", 0444, net->proc_net, &igmp_mc_seq_ops, sizeof(struct igmp_mc_iter_state)); if (!pde) goto out_igmp; pde = proc_create_net("mcfilter", 0444, net->proc_net, &igmp_mcf_seq_ops, sizeof(struct igmp_mcf_iter_state)); if (!pde) goto out_mcfilter; err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET, SOCK_DGRAM, 0, net); if (err < 0) { pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n", err); goto out_sock; } return 0; out_sock: remove_proc_entry("mcfilter", net->proc_net); out_mcfilter: remove_proc_entry("igmp", net->proc_net); out_igmp: return -ENOMEM; } static void __net_exit igmp_net_exit(struct net *net) { remove_proc_entry("mcfilter", net->proc_net); remove_proc_entry("igmp", net->proc_net); inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk); } static struct pernet_operations igmp_net_ops = { .init = igmp_net_init, .exit = igmp_net_exit, }; #endif static int igmp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev; switch (event) { case NETDEV_RESEND_IGMP: in_dev = __in_dev_get_rtnl(dev); if (in_dev) ip_mc_rejoin_groups(in_dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block igmp_notifier = { .notifier_call = igmp_netdev_event, }; int __init igmp_mc_init(void) { #if defined(CONFIG_PROC_FS) int err; err = register_pernet_subsys(&igmp_net_ops); if (err) return err; err = register_netdevice_notifier(&igmp_notifier); if (err) goto reg_notif_fail; return 0; reg_notif_fail: unregister_pernet_subsys(&igmp_net_ops); return err; #else return register_netdevice_notifier(&igmp_notifier); #endif } |
12 12 11 4 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 16 8 7 5 5 8 9 2 2 1 2 4 3 2 2 1 4 2 2 1 1 1 1 2 16 4 13 3 3 1 2 10 10 9 9 8 6 2 1 1 6 10 1 1 1 2 2 1 1 1 3 3 2 2 3 9 8 1 2 2 8 8 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 | // SPDX-License-Identifier: GPL-2.0 /* * cfg80211 - wext compat code * * This is temporary code until all wireless functionality is migrated * into cfg80211, when that happens all the exports here go away and * we directly assign the wireless handlers of wireless interfaces. * * Copyright 2008-2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2019-2023 Intel Corporation */ #include <linux/export.h> #include <linux/wireless.h> #include <linux/nl80211.h> #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/slab.h> #include <net/iw_handler.h> #include <net/cfg80211.h> #include <net/cfg80211-wext.h> #include "wext-compat.h" #include "core.h" #include "rdev-ops.h" int cfg80211_wext_giwname(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { strcpy(wrqu->name, "IEEE 802.11"); return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwname); int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { __u32 *mode = &wrqu->mode; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; struct vif_params vifparams; enum nl80211_iftype type; int ret; rdev = wiphy_to_rdev(wdev->wiphy); switch (*mode) { case IW_MODE_INFRA: type = NL80211_IFTYPE_STATION; break; case IW_MODE_ADHOC: type = NL80211_IFTYPE_ADHOC; break; case IW_MODE_MONITOR: type = NL80211_IFTYPE_MONITOR; break; default: return -EINVAL; } if (type == wdev->iftype) return 0; memset(&vifparams, 0, sizeof(vifparams)); wiphy_lock(wdev->wiphy); ret = cfg80211_change_iface(rdev, dev, type, &vifparams); wiphy_unlock(wdev->wiphy); return ret; } EXPORT_WEXT_HANDLER(cfg80211_wext_siwmode); int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { __u32 *mode = &wrqu->mode; struct wireless_dev *wdev = dev->ieee80211_ptr; if (!wdev) return -EOPNOTSUPP; switch (wdev->iftype) { case NL80211_IFTYPE_AP: *mode = IW_MODE_MASTER; break; case NL80211_IFTYPE_STATION: *mode = IW_MODE_INFRA; break; case NL80211_IFTYPE_ADHOC: *mode = IW_MODE_ADHOC; break; case NL80211_IFTYPE_MONITOR: *mode = IW_MODE_MONITOR; break; case NL80211_IFTYPE_WDS: *mode = IW_MODE_REPEAT; break; case NL80211_IFTYPE_AP_VLAN: *mode = IW_MODE_SECOND; /* FIXME */ break; default: *mode = IW_MODE_AUTO; break; } return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwmode); int cfg80211_wext_giwrange(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_range *range = (struct iw_range *) extra; enum nl80211_band band; int i, c = 0; if (!wdev) return -EOPNOTSUPP; data->length = sizeof(struct iw_range); memset(range, 0, sizeof(struct iw_range)); range->we_version_compiled = WIRELESS_EXT; range->we_version_source = 21; range->retry_capa = IW_RETRY_LIMIT; range->retry_flags = IW_RETRY_LIMIT; range->min_retry = 0; range->max_retry = 255; range->min_rts = 0; range->max_rts = 2347; range->min_frag = 256; range->max_frag = 2346; range->max_encoding_tokens = 4; range->max_qual.updated = IW_QUAL_NOISE_INVALID; switch (wdev->wiphy->signal_type) { case CFG80211_SIGNAL_TYPE_NONE: break; case CFG80211_SIGNAL_TYPE_MBM: range->max_qual.level = (u8)-110; range->max_qual.qual = 70; range->avg_qual.qual = 35; range->max_qual.updated |= IW_QUAL_DBM; range->max_qual.updated |= IW_QUAL_QUAL_UPDATED; range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED; break; case CFG80211_SIGNAL_TYPE_UNSPEC: range->max_qual.level = 100; range->max_qual.qual = 100; range->avg_qual.qual = 50; range->max_qual.updated |= IW_QUAL_QUAL_UPDATED; range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED; break; } range->avg_qual.level = range->max_qual.level / 2; range->avg_qual.noise = range->max_qual.noise / 2; range->avg_qual.updated = range->max_qual.updated; for (i = 0; i < wdev->wiphy->n_cipher_suites; i++) { switch (wdev->wiphy->cipher_suites[i]) { case WLAN_CIPHER_SUITE_TKIP: range->enc_capa |= (IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_WPA); break; case WLAN_CIPHER_SUITE_CCMP: range->enc_capa |= (IW_ENC_CAPA_CIPHER_CCMP | IW_ENC_CAPA_WPA2); break; case WLAN_CIPHER_SUITE_WEP40: range->encoding_size[range->num_encoding_sizes++] = WLAN_KEY_LEN_WEP40; break; case WLAN_CIPHER_SUITE_WEP104: range->encoding_size[range->num_encoding_sizes++] = WLAN_KEY_LEN_WEP104; break; } } for (band = 0; band < NUM_NL80211_BANDS; band ++) { struct ieee80211_supported_band *sband; sband = wdev->wiphy->bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) { struct ieee80211_channel *chan = &sband->channels[i]; if (!(chan->flags & IEEE80211_CHAN_DISABLED)) { range->freq[c].i = ieee80211_frequency_to_channel( chan->center_freq); range->freq[c].m = chan->center_freq; range->freq[c].e = 6; c++; } } } range->num_channels = c; range->num_frequency = c; IW_EVENT_CAPA_SET_KERNEL(range->event_capa); IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP); IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN); if (wdev->wiphy->max_scan_ssids > 0) range->scan_capa |= IW_SCAN_CAPA_ESSID; return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwrange); /** * cfg80211_wext_freq - get wext frequency for non-"auto" * @freq: the wext freq encoding * * Returns: a frequency, or a negative error code, or 0 for auto. */ int cfg80211_wext_freq(struct iw_freq *freq) { /* * Parse frequency - return 0 for auto and * -EINVAL for impossible things. */ if (freq->e == 0) { enum nl80211_band band = NL80211_BAND_2GHZ; if (freq->m < 0) return 0; if (freq->m > 14) band = NL80211_BAND_5GHZ; return ieee80211_channel_to_frequency(freq->m, band); } else { int i, div = 1000000; for (i = 0; i < freq->e; i++) div /= 10; if (div <= 0) return -EINVAL; return freq->m / div; } } int cfg80211_wext_siwrts(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *rts = &wrqu->rts; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 orts = wdev->wiphy->rts_threshold; int err; wiphy_lock(&rdev->wiphy); if (rts->disabled || !rts->fixed) { wdev->wiphy->rts_threshold = (u32) -1; } else if (rts->value < 0) { err = -EINVAL; goto out; } else { wdev->wiphy->rts_threshold = rts->value; } err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD); if (err) wdev->wiphy->rts_threshold = orts; out: wiphy_unlock(&rdev->wiphy); return err; } EXPORT_WEXT_HANDLER(cfg80211_wext_siwrts); int cfg80211_wext_giwrts(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *rts = &wrqu->rts; struct wireless_dev *wdev = dev->ieee80211_ptr; rts->value = wdev->wiphy->rts_threshold; rts->disabled = rts->value == (u32) -1; rts->fixed = 1; return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwrts); int cfg80211_wext_siwfrag(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *frag = &wrqu->frag; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 ofrag = wdev->wiphy->frag_threshold; int err; wiphy_lock(&rdev->wiphy); if (frag->disabled || !frag->fixed) { wdev->wiphy->frag_threshold = (u32) -1; } else if (frag->value < 256) { err = -EINVAL; goto out; } else { /* Fragment length must be even, so strip LSB. */ wdev->wiphy->frag_threshold = frag->value & ~0x1; } err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD); if (err) wdev->wiphy->frag_threshold = ofrag; out: wiphy_unlock(&rdev->wiphy); return err; } EXPORT_WEXT_HANDLER(cfg80211_wext_siwfrag); int cfg80211_wext_giwfrag(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *frag = &wrqu->frag; struct wireless_dev *wdev = dev->ieee80211_ptr; frag->value = wdev->wiphy->frag_threshold; frag->disabled = frag->value == (u32) -1; frag->fixed = 1; return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwfrag); static int cfg80211_wext_siwretry(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *retry = &wrqu->retry; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 changed = 0; u8 olong = wdev->wiphy->retry_long; u8 oshort = wdev->wiphy->retry_short; int err; if (retry->disabled || retry->value < 1 || retry->value > 255 || (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT) return -EINVAL; wiphy_lock(&rdev->wiphy); if (retry->flags & IW_RETRY_LONG) { wdev->wiphy->retry_long = retry->value; changed |= WIPHY_PARAM_RETRY_LONG; } else if (retry->flags & IW_RETRY_SHORT) { wdev->wiphy->retry_short = retry->value; changed |= WIPHY_PARAM_RETRY_SHORT; } else { wdev->wiphy->retry_short = retry->value; wdev->wiphy->retry_long = retry->value; changed |= WIPHY_PARAM_RETRY_LONG; changed |= WIPHY_PARAM_RETRY_SHORT; } err = rdev_set_wiphy_params(rdev, changed); if (err) { wdev->wiphy->retry_short = oshort; wdev->wiphy->retry_long = olong; } wiphy_unlock(&rdev->wiphy); return err; } int cfg80211_wext_giwretry(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *retry = &wrqu->retry; struct wireless_dev *wdev = dev->ieee80211_ptr; retry->disabled = 0; if (retry->flags == 0 || (retry->flags & IW_RETRY_SHORT)) { /* * First return short value, iwconfig will ask long value * later if needed */ retry->flags |= IW_RETRY_LIMIT | IW_RETRY_SHORT; retry->value = wdev->wiphy->retry_short; if (wdev->wiphy->retry_long == wdev->wiphy->retry_short) retry->flags |= IW_RETRY_LONG; return 0; } if (retry->flags & IW_RETRY_LONG) { retry->flags = IW_RETRY_LIMIT | IW_RETRY_LONG; retry->value = wdev->wiphy->retry_long; } return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwretry); static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, struct net_device *dev, bool pairwise, const u8 *addr, bool remove, bool tx_key, int idx, struct key_params *params) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err, i; bool rejoin = false; if (wdev->valid_links) return -EINVAL; if (pairwise && !addr) return -EINVAL; /* * In many cases we won't actually need this, but it's better * to do it first in case the allocation fails. Don't use wext. */ if (!wdev->wext.keys) { wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys), GFP_KERNEL); if (!wdev->wext.keys) return -ENOMEM; for (i = 0; i < 4; i++) wdev->wext.keys->params[i].key = wdev->wext.keys->data[i]; } if (wdev->iftype != NL80211_IFTYPE_ADHOC && wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC) { if (!wdev->connected) return -ENOLINK; if (!rdev->ops->set_default_mgmt_key) return -EOPNOTSUPP; if (idx < 4 || idx > 5) return -EINVAL; } else if (idx < 0 || idx > 3) return -EINVAL; if (remove) { err = 0; if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) { /* * If removing the current TX key, we will need to * join a new IBSS without the privacy bit clear. */ if (idx == wdev->wext.default_key && wdev->iftype == NL80211_IFTYPE_ADHOC) { cfg80211_leave_ibss(rdev, wdev->netdev, true); rejoin = true; } if (!pairwise && addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) err = -ENOENT; else err = rdev_del_key(rdev, dev, -1, idx, pairwise, addr); } wdev->wext.connect.privacy = false; /* * Applications using wireless extensions expect to be * able to delete keys that don't exist, so allow that. */ if (err == -ENOENT) err = 0; if (!err) { if (!addr && idx < 4) { memset(wdev->wext.keys->data[idx], 0, sizeof(wdev->wext.keys->data[idx])); wdev->wext.keys->params[idx].key_len = 0; wdev->wext.keys->params[idx].cipher = 0; } if (idx == wdev->wext.default_key) wdev->wext.default_key = -1; else if (idx == wdev->wext.default_mgmt_key) wdev->wext.default_mgmt_key = -1; } if (!err && rejoin) err = cfg80211_ibss_wext_join(rdev, wdev); return err; } if (addr) tx_key = false; if (cfg80211_validate_key_settings(rdev, params, idx, pairwise, addr)) return -EINVAL; err = 0; if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) err = rdev_add_key(rdev, dev, -1, idx, pairwise, addr, params); else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 && params->cipher != WLAN_CIPHER_SUITE_WEP104) return -EINVAL; if (err) return err; /* * We only need to store WEP keys, since they're the only keys that * can be set before a connection is established and persist after * disconnecting. */ if (!addr && (params->cipher == WLAN_CIPHER_SUITE_WEP40 || params->cipher == WLAN_CIPHER_SUITE_WEP104)) { wdev->wext.keys->params[idx] = *params; memcpy(wdev->wext.keys->data[idx], params->key, params->key_len); wdev->wext.keys->params[idx].key = wdev->wext.keys->data[idx]; } if ((params->cipher == WLAN_CIPHER_SUITE_WEP40 || params->cipher == WLAN_CIPHER_SUITE_WEP104) && (tx_key || (!addr && wdev->wext.default_key == -1))) { if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) { /* * If we are getting a new TX key from not having * had one before we need to join a new IBSS with * the privacy bit set. */ if (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->wext.default_key == -1) { cfg80211_leave_ibss(rdev, wdev->netdev, true); rejoin = true; } err = rdev_set_default_key(rdev, dev, -1, idx, true, true); } if (!err) { wdev->wext.default_key = idx; if (rejoin) err = cfg80211_ibss_wext_join(rdev, wdev); } return err; } if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC && (tx_key || (!addr && wdev->wext.default_mgmt_key == -1))) { if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) err = rdev_set_default_mgmt_key(rdev, dev, -1, idx); if (!err) wdev->wext.default_mgmt_key = idx; return err; } return 0; } static int cfg80211_wext_siwencode(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *keybuf) { struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int idx, err; bool remove = false; struct key_params params; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; /* no use -- only MFP (set_default_mgmt_key) is optional */ if (!rdev->ops->del_key || !rdev->ops->add_key || !rdev->ops->set_default_key) return -EOPNOTSUPP; wiphy_lock(&rdev->wiphy); if (wdev->valid_links) { err = -EOPNOTSUPP; goto out; } idx = erq->flags & IW_ENCODE_INDEX; if (idx == 0) { idx = wdev->wext.default_key; if (idx < 0) idx = 0; } else if (idx < 1 || idx > 4) { err = -EINVAL; goto out; } else { idx--; } if (erq->flags & IW_ENCODE_DISABLED) remove = true; else if (erq->length == 0) { /* No key data - just set the default TX key index */ err = 0; if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) err = rdev_set_default_key(rdev, dev, -1, idx, true, true); if (!err) wdev->wext.default_key = idx; goto out; } memset(¶ms, 0, sizeof(params)); params.key = keybuf; params.key_len = erq->length; if (erq->length == 5) { params.cipher = WLAN_CIPHER_SUITE_WEP40; } else if (erq->length == 13) { params.cipher = WLAN_CIPHER_SUITE_WEP104; } else if (!remove) { err = -EINVAL; goto out; } err = cfg80211_set_encryption(rdev, dev, false, NULL, remove, wdev->wext.default_key == -1, idx, ¶ms); out: wiphy_unlock(&rdev->wiphy); return err; } static int cfg80211_wext_siwencodeext(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct iw_encode_ext *ext = (struct iw_encode_ext *) extra; const u8 *addr; int idx; bool remove = false; struct key_params params; u32 cipher; int ret; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; /* no use -- only MFP (set_default_mgmt_key) is optional */ if (!rdev->ops->del_key || !rdev->ops->add_key || !rdev->ops->set_default_key) return -EOPNOTSUPP; if (wdev->valid_links) return -EOPNOTSUPP; switch (ext->alg) { case IW_ENCODE_ALG_NONE: remove = true; cipher = 0; break; case IW_ENCODE_ALG_WEP: if (ext->key_len == 5) cipher = WLAN_CIPHER_SUITE_WEP40; else if (ext->key_len == 13) cipher = WLAN_CIPHER_SUITE_WEP104; else return -EINVAL; break; case IW_ENCODE_ALG_TKIP: cipher = WLAN_CIPHER_SUITE_TKIP; break; case IW_ENCODE_ALG_CCMP: cipher = WLAN_CIPHER_SUITE_CCMP; break; case IW_ENCODE_ALG_AES_CMAC: cipher = WLAN_CIPHER_SUITE_AES_CMAC; break; default: return -EOPNOTSUPP; } if (erq->flags & IW_ENCODE_DISABLED) remove = true; idx = erq->flags & IW_ENCODE_INDEX; if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) { if (idx < 4 || idx > 5) { idx = wdev->wext.default_mgmt_key; if (idx < 0) return -EINVAL; } else idx--; } else { if (idx < 1 || idx > 4) { idx = wdev->wext.default_key; if (idx < 0) return -EINVAL; } else idx--; } addr = ext->addr.sa_data; if (is_broadcast_ether_addr(addr)) addr = NULL; memset(¶ms, 0, sizeof(params)); params.key = ext->key; params.key_len = ext->key_len; params.cipher = cipher; if (ext->ext_flags & IW_ENCODE_EXT_RX_SEQ_VALID) { params.seq = ext->rx_seq; params.seq_len = 6; } wiphy_lock(wdev->wiphy); ret = cfg80211_set_encryption( rdev, dev, !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY), addr, remove, ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY, idx, ¶ms); wiphy_unlock(wdev->wiphy); return ret; } static int cfg80211_wext_giwencode(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *keybuf) { struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; int idx; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; idx = erq->flags & IW_ENCODE_INDEX; if (idx == 0) { idx = wdev->wext.default_key; if (idx < 0) idx = 0; } else if (idx < 1 || idx > 4) return -EINVAL; else idx--; erq->flags = idx + 1; if (!wdev->wext.keys || !wdev->wext.keys->params[idx].cipher) { erq->flags |= IW_ENCODE_DISABLED; erq->length = 0; return 0; } erq->length = min_t(size_t, erq->length, wdev->wext.keys->params[idx].key_len); memcpy(keybuf, wdev->wext.keys->params[idx].key, erq->length); erq->flags |= IW_ENCODE_ENABLED; return 0; } static int cfg80211_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_freq *wextfreq = &wrqu->freq; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_chan_def chandef = { .width = NL80211_CHAN_WIDTH_20_NOHT, }; int freq, ret; wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: ret = cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra); break; case NL80211_IFTYPE_ADHOC: ret = cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); break; case NL80211_IFTYPE_MONITOR: freq = cfg80211_wext_freq(wextfreq); if (freq < 0) { ret = freq; break; } if (freq == 0) { ret = -EINVAL; break; } chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); if (!chandef.chan) { ret = -EINVAL; break; } ret = cfg80211_set_monitor_channel(rdev, &chandef); break; case NL80211_IFTYPE_MESH_POINT: freq = cfg80211_wext_freq(wextfreq); if (freq < 0) { ret = freq; break; } if (freq == 0) { ret = -EINVAL; break; } chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); if (!chandef.chan) { ret = -EINVAL; break; } ret = cfg80211_set_mesh_channel(rdev, wdev, &chandef); break; default: ret = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_freq *freq = &wrqu->freq; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_chan_def chandef = {}; int ret; wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: ret = cfg80211_mgd_wext_giwfreq(dev, info, freq, extra); break; case NL80211_IFTYPE_ADHOC: ret = cfg80211_ibss_wext_giwfreq(dev, info, freq, extra); break; case NL80211_IFTYPE_MONITOR: if (!rdev->ops->get_channel) { ret = -EINVAL; break; } ret = rdev_get_channel(rdev, wdev, 0, &chandef); if (ret) break; freq->m = chandef.chan->center_freq; freq->e = 6; ret = 0; break; default: ret = -EINVAL; break; } wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_siwtxpower(struct net_device *dev, struct iw_request_info *info, union iwreq_data *data, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); enum nl80211_tx_power_setting type; int dbm = 0; int ret; if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) return -EINVAL; if (data->txpower.flags & IW_TXPOW_RANGE) return -EINVAL; if (!rdev->ops->set_tx_power) return -EOPNOTSUPP; /* only change when not disabling */ if (!data->txpower.disabled) { rfkill_set_sw_state(rdev->wiphy.rfkill, false); if (data->txpower.fixed) { /* * wext doesn't support negative values, see * below where it's for automatic */ if (data->txpower.value < 0) return -EINVAL; dbm = data->txpower.value; type = NL80211_TX_POWER_FIXED; /* TODO: do regulatory check! */ } else { /* * Automatic power level setting, max being the value * passed in from userland. */ if (data->txpower.value < 0) { type = NL80211_TX_POWER_AUTOMATIC; } else { dbm = data->txpower.value; type = NL80211_TX_POWER_LIMITED; } } } else { if (rfkill_set_sw_state(rdev->wiphy.rfkill, true)) schedule_work(&rdev->rfkill_block); return 0; } wiphy_lock(&rdev->wiphy); ret = rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_giwtxpower(struct net_device *dev, struct iw_request_info *info, union iwreq_data *data, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int err, val; if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) return -EINVAL; if (data->txpower.flags & IW_TXPOW_RANGE) return -EINVAL; if (!rdev->ops->get_tx_power) return -EOPNOTSUPP; wiphy_lock(&rdev->wiphy); err = rdev_get_tx_power(rdev, wdev, &val); wiphy_unlock(&rdev->wiphy); if (err) return err; /* well... oh well */ data->txpower.fixed = 1; data->txpower.disabled = rfkill_blocked(rdev->wiphy.rfkill); data->txpower.value = val; data->txpower.flags = IW_TXPOW_DBM; return 0; } static int cfg80211_set_auth_alg(struct wireless_dev *wdev, s32 auth_alg) { int nr_alg = 0; if (!auth_alg) return -EINVAL; if (auth_alg & ~(IW_AUTH_ALG_OPEN_SYSTEM | IW_AUTH_ALG_SHARED_KEY | IW_AUTH_ALG_LEAP)) return -EINVAL; if (auth_alg & IW_AUTH_ALG_OPEN_SYSTEM) { nr_alg++; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM; } if (auth_alg & IW_AUTH_ALG_SHARED_KEY) { nr_alg++; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_SHARED_KEY; } if (auth_alg & IW_AUTH_ALG_LEAP) { nr_alg++; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_NETWORK_EAP; } if (nr_alg > 1) wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; return 0; } static int cfg80211_set_wpa_version(struct wireless_dev *wdev, u32 wpa_versions) { if (wpa_versions & ~(IW_AUTH_WPA_VERSION_WPA | IW_AUTH_WPA_VERSION_WPA2| IW_AUTH_WPA_VERSION_DISABLED)) return -EINVAL; if ((wpa_versions & IW_AUTH_WPA_VERSION_DISABLED) && (wpa_versions & (IW_AUTH_WPA_VERSION_WPA| IW_AUTH_WPA_VERSION_WPA2))) return -EINVAL; if (wpa_versions & IW_AUTH_WPA_VERSION_DISABLED) wdev->wext.connect.crypto.wpa_versions &= ~(NL80211_WPA_VERSION_1|NL80211_WPA_VERSION_2); if (wpa_versions & IW_AUTH_WPA_VERSION_WPA) wdev->wext.connect.crypto.wpa_versions |= NL80211_WPA_VERSION_1; if (wpa_versions & IW_AUTH_WPA_VERSION_WPA2) wdev->wext.connect.crypto.wpa_versions |= NL80211_WPA_VERSION_2; return 0; } static int cfg80211_set_cipher_group(struct wireless_dev *wdev, u32 cipher) { if (cipher & IW_AUTH_CIPHER_WEP40) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_WEP40; else if (cipher & IW_AUTH_CIPHER_WEP104) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_WEP104; else if (cipher & IW_AUTH_CIPHER_TKIP) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_TKIP; else if (cipher & IW_AUTH_CIPHER_CCMP) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_CCMP; else if (cipher & IW_AUTH_CIPHER_AES_CMAC) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_AES_CMAC; else if (cipher & IW_AUTH_CIPHER_NONE) wdev->wext.connect.crypto.cipher_group = 0; else return -EINVAL; return 0; } static int cfg80211_set_cipher_pairwise(struct wireless_dev *wdev, u32 cipher) { int nr_ciphers = 0; u32 *ciphers_pairwise = wdev->wext.connect.crypto.ciphers_pairwise; if (cipher & IW_AUTH_CIPHER_WEP40) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_WEP40; nr_ciphers++; } if (cipher & IW_AUTH_CIPHER_WEP104) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_WEP104; nr_ciphers++; } if (cipher & IW_AUTH_CIPHER_TKIP) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_TKIP; nr_ciphers++; } if (cipher & IW_AUTH_CIPHER_CCMP) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_CCMP; nr_ciphers++; } if (cipher & IW_AUTH_CIPHER_AES_CMAC) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_AES_CMAC; nr_ciphers++; } BUILD_BUG_ON(NL80211_MAX_NR_CIPHER_SUITES < 5); wdev->wext.connect.crypto.n_ciphers_pairwise = nr_ciphers; return 0; } static int cfg80211_set_key_mgt(struct wireless_dev *wdev, u32 key_mgt) { int nr_akm_suites = 0; if (key_mgt & ~(IW_AUTH_KEY_MGMT_802_1X | IW_AUTH_KEY_MGMT_PSK)) return -EINVAL; if (key_mgt & IW_AUTH_KEY_MGMT_802_1X) { wdev->wext.connect.crypto.akm_suites[nr_akm_suites] = WLAN_AKM_SUITE_8021X; nr_akm_suites++; } if (key_mgt & IW_AUTH_KEY_MGMT_PSK) { wdev->wext.connect.crypto.akm_suites[nr_akm_suites] = WLAN_AKM_SUITE_PSK; nr_akm_suites++; } wdev->wext.connect.crypto.n_akm_suites = nr_akm_suites; return 0; } static int cfg80211_wext_siwauth(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *data = &wrqu->param; struct wireless_dev *wdev = dev->ieee80211_ptr; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; switch (data->flags & IW_AUTH_INDEX) { case IW_AUTH_PRIVACY_INVOKED: wdev->wext.connect.privacy = data->value; return 0; case IW_AUTH_WPA_VERSION: return cfg80211_set_wpa_version(wdev, data->value); case IW_AUTH_CIPHER_GROUP: return cfg80211_set_cipher_group(wdev, data->value); case IW_AUTH_KEY_MGMT: return cfg80211_set_key_mgt(wdev, data->value); case IW_AUTH_CIPHER_PAIRWISE: return cfg80211_set_cipher_pairwise(wdev, data->value); case IW_AUTH_80211_AUTH_ALG: return cfg80211_set_auth_alg(wdev, data->value); case IW_AUTH_WPA_ENABLED: case IW_AUTH_RX_UNENCRYPTED_EAPOL: case IW_AUTH_DROP_UNENCRYPTED: case IW_AUTH_MFP: return 0; default: return -EOPNOTSUPP; } } static int cfg80211_wext_giwauth(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { /* XXX: what do we need? */ return -EOPNOTSUPP; } static int cfg80211_wext_siwpower(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *wrq = &wrqu->power; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); bool ps; int timeout = wdev->ps_timeout; int err; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EINVAL; if (!rdev->ops->set_power_mgmt) return -EOPNOTSUPP; if (wrq->disabled) { ps = false; } else { switch (wrq->flags & IW_POWER_MODE) { case IW_POWER_ON: /* If not specified */ case IW_POWER_MODE: /* If set all mask */ case IW_POWER_ALL_R: /* If explicitely state all */ ps = true; break; default: /* Otherwise we ignore */ return -EINVAL; } if (wrq->flags & ~(IW_POWER_MODE | IW_POWER_TIMEOUT)) return -EINVAL; if (wrq->flags & IW_POWER_TIMEOUT) timeout = wrq->value / 1000; } wiphy_lock(&rdev->wiphy); err = rdev_set_power_mgmt(rdev, dev, ps, timeout); wiphy_unlock(&rdev->wiphy); if (err) return err; wdev->ps = ps; wdev->ps_timeout = timeout; return 0; } static int cfg80211_wext_giwpower(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *wrq = &wrqu->power; struct wireless_dev *wdev = dev->ieee80211_ptr; wrq->disabled = !wdev->ps; return 0; } static int cfg80211_wext_siwrate(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *rate = &wrqu->bitrate; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bitrate_mask mask; u32 fixed, maxrate; struct ieee80211_supported_band *sband; int band, ridx, ret; bool match = false; if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; memset(&mask, 0, sizeof(mask)); fixed = 0; maxrate = (u32)-1; if (rate->value < 0) { /* nothing */ } else if (rate->fixed) { fixed = rate->value / 100000; } else { maxrate = rate->value / 100000; } for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wdev->wiphy->bands[band]; if (sband == NULL) continue; for (ridx = 0; ridx < sband->n_bitrates; ridx++) { struct ieee80211_rate *srate = &sband->bitrates[ridx]; if (fixed == srate->bitrate) { mask.control[band].legacy = 1 << ridx; match = true; break; } if (srate->bitrate <= maxrate) { mask.control[band].legacy |= 1 << ridx; match = true; } } } if (!match) return -EINVAL; wiphy_lock(&rdev->wiphy); if (dev->ieee80211_ptr->valid_links) ret = -EOPNOTSUPP; else ret = rdev_set_bitrate_mask(rdev, dev, 0, NULL, &mask); wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_giwrate(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *rate = &wrqu->bitrate; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct station_info sinfo = {}; u8 addr[ETH_ALEN]; int err; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!rdev->ops->get_station) return -EOPNOTSUPP; err = 0; if (!wdev->valid_links && wdev->links[0].client.current_bss) memcpy(addr, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN); else err = -EOPNOTSUPP; if (err) return err; wiphy_lock(&rdev->wiphy); err = rdev_get_station(rdev, dev, addr, &sinfo); wiphy_unlock(&rdev->wiphy); if (err) return err; if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) { err = -EOPNOTSUPP; goto free; } rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate); free: cfg80211_sinfo_release_content(&sinfo); return err; } /* Get wireless statistics. Called by /proc/net/wireless and by SIOCGIWSTATS */ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); /* we are under RTNL - globally locked - so can use static structs */ static struct iw_statistics wstats; static struct station_info sinfo = {}; u8 bssid[ETH_ALEN]; int ret; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) return NULL; if (!rdev->ops->get_station) return NULL; /* Grab BSSID of current BSS, if any */ wiphy_lock(&rdev->wiphy); if (wdev->valid_links || !wdev->links[0].client.current_bss) { wiphy_unlock(&rdev->wiphy); return NULL; } memcpy(bssid, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN); memset(&sinfo, 0, sizeof(sinfo)); ret = rdev_get_station(rdev, dev, bssid, &sinfo); wiphy_unlock(&rdev->wiphy); if (ret) return NULL; memset(&wstats, 0, sizeof(wstats)); switch (rdev->wiphy.signal_type) { case CFG80211_SIGNAL_TYPE_MBM: if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) { int sig = sinfo.signal; wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; wstats.qual.updated |= IW_QUAL_DBM; wstats.qual.level = sig; if (sig < -110) sig = -110; else if (sig > -40) sig = -40; wstats.qual.qual = sig + 110; break; } fallthrough; case CFG80211_SIGNAL_TYPE_UNSPEC: if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) { wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; wstats.qual.level = sinfo.signal; wstats.qual.qual = sinfo.signal; break; } fallthrough; default: wstats.qual.updated |= IW_QUAL_LEVEL_INVALID; wstats.qual.updated |= IW_QUAL_QUAL_INVALID; } wstats.qual.updated |= IW_QUAL_NOISE_INVALID; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC)) wstats.discard.misc = sinfo.rx_dropped_misc; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED)) wstats.discard.retries = sinfo.tx_failed; cfg80211_sinfo_release_content(&sinfo); return &wstats; } static int cfg80211_wext_siwap(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: ret = cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); break; case NL80211_IFTYPE_STATION: ret = cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); break; default: ret = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_giwap(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: ret = cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); break; case NL80211_IFTYPE_STATION: ret = cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); break; default: ret = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_siwessid(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *ssid) { struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: ret = cfg80211_ibss_wext_siwessid(dev, info, data, ssid); break; case NL80211_IFTYPE_STATION: ret = cfg80211_mgd_wext_siwessid(dev, info, data, ssid); break; default: ret = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_giwessid(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *ssid) { struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; data->flags = 0; data->length = 0; wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: ret = cfg80211_ibss_wext_giwessid(dev, info, data, ssid); break; case NL80211_IFTYPE_STATION: ret = cfg80211_mgd_wext_giwessid(dev, info, data, ssid); break; default: ret = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_siwpmksa(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_pmksa cfg_pmksa; struct iw_pmksa *pmksa = (struct iw_pmksa *)extra; int ret; memset(&cfg_pmksa, 0, sizeof(struct cfg80211_pmksa)); if (wdev->iftype != NL80211_IFTYPE_STATION) return -EINVAL; cfg_pmksa.bssid = pmksa->bssid.sa_data; cfg_pmksa.pmkid = pmksa->pmkid; wiphy_lock(&rdev->wiphy); switch (pmksa->cmd) { case IW_PMKSA_ADD: if (!rdev->ops->set_pmksa) { ret = -EOPNOTSUPP; break; } ret = rdev_set_pmksa(rdev, dev, &cfg_pmksa); break; case IW_PMKSA_REMOVE: if (!rdev->ops->del_pmksa) { ret = -EOPNOTSUPP; break; } ret = rdev_del_pmksa(rdev, dev, &cfg_pmksa); break; case IW_PMKSA_FLUSH: if (!rdev->ops->flush_pmksa) { ret = -EOPNOTSUPP; break; } ret = rdev_flush_pmksa(rdev, dev); break; default: ret = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return ret; } static const iw_handler cfg80211_handlers[] = { IW_HANDLER(SIOCGIWNAME, cfg80211_wext_giwname), IW_HANDLER(SIOCSIWFREQ, cfg80211_wext_siwfreq), IW_HANDLER(SIOCGIWFREQ, cfg80211_wext_giwfreq), IW_HANDLER(SIOCSIWMODE, cfg80211_wext_siwmode), IW_HANDLER(SIOCGIWMODE, cfg80211_wext_giwmode), IW_HANDLER(SIOCGIWRANGE, cfg80211_wext_giwrange), IW_HANDLER(SIOCSIWAP, cfg80211_wext_siwap), IW_HANDLER(SIOCGIWAP, cfg80211_wext_giwap), IW_HANDLER(SIOCSIWMLME, cfg80211_wext_siwmlme), IW_HANDLER(SIOCSIWSCAN, cfg80211_wext_siwscan), IW_HANDLER(SIOCGIWSCAN, cfg80211_wext_giwscan), IW_HANDLER(SIOCSIWESSID, cfg80211_wext_siwessid), IW_HANDLER(SIOCGIWESSID, cfg80211_wext_giwessid), IW_HANDLER(SIOCSIWRATE, cfg80211_wext_siwrate), IW_HANDLER(SIOCGIWRATE, cfg80211_wext_giwrate), IW_HANDLER(SIOCSIWRTS, cfg80211_wext_siwrts), IW_HANDLER(SIOCGIWRTS, cfg80211_wext_giwrts), IW_HANDLER(SIOCSIWFRAG, cfg80211_wext_siwfrag), IW_HANDLER(SIOCGIWFRAG, cfg80211_wext_giwfrag), IW_HANDLER(SIOCSIWTXPOW, cfg80211_wext_siwtxpower), IW_HANDLER(SIOCGIWTXPOW, cfg80211_wext_giwtxpower), IW_HANDLER(SIOCSIWRETRY, cfg80211_wext_siwretry), IW_HANDLER(SIOCGIWRETRY, cfg80211_wext_giwretry), IW_HANDLER(SIOCSIWENCODE, cfg80211_wext_siwencode), IW_HANDLER(SIOCGIWENCODE, cfg80211_wext_giwencode), IW_HANDLER(SIOCSIWPOWER, cfg80211_wext_siwpower), IW_HANDLER(SIOCGIWPOWER, cfg80211_wext_giwpower), IW_HANDLER(SIOCSIWGENIE, cfg80211_wext_siwgenie), IW_HANDLER(SIOCSIWAUTH, cfg80211_wext_siwauth), IW_HANDLER(SIOCGIWAUTH, cfg80211_wext_giwauth), IW_HANDLER(SIOCSIWENCODEEXT, cfg80211_wext_siwencodeext), IW_HANDLER(SIOCSIWPMKSA, cfg80211_wext_siwpmksa), }; const struct iw_handler_def cfg80211_wext_handler = { .num_standard = ARRAY_SIZE(cfg80211_handlers), .standard = cfg80211_handlers, .get_wireless_stats = cfg80211_wireless_stats, }; |
2070 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 | /* SPDX-License-Identifier: GPL-2.0 */ /* * NUMA memory policies for Linux. * Copyright 2003,2004 Andi Kleen SuSE Labs */ #ifndef _LINUX_MEMPOLICY_H #define _LINUX_MEMPOLICY_H 1 #include <linux/sched.h> #include <linux/mmzone.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <uapi/linux/mempolicy.h> struct mm_struct; #define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */ #ifdef CONFIG_NUMA /* * Describe a memory policy. * * A mempolicy can be either associated with a process or with a VMA. * For VMA related allocations the VMA policy is preferred, otherwise * the process policy is used. Interrupts ignore the memory policy * of the current process. * * Locking policy for interleave: * In process context there is no locking because only the process accesses * its own state. All vma manipulation is somewhat protected by a down_read on * mmap_lock. * * Freeing policy: * Mempolicy objects are reference counted. A mempolicy will be freed when * mpol_put() decrements the reference count to zero. * * Duplicating policy objects: * mpol_dup() allocates a new mempolicy and copies the specified mempolicy * to the new storage. The reference count of the new object is initialized * to 1, representing the caller of mpol_dup(). */ struct mempolicy { atomic_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/perfer */ int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ } w; }; /* * Support for managing mempolicy data objects (clone, copy, destroy) * The default fast path of a NULL MPOL_DEFAULT policy is always inlined. */ extern void __mpol_put(struct mempolicy *pol); static inline void mpol_put(struct mempolicy *pol) { if (pol) __mpol_put(pol); } /* * Does mempolicy pol need explicit unref after use? * Currently only needed for shared policies. */ static inline int mpol_needs_cond_ref(struct mempolicy *pol) { return (pol && (pol->flags & MPOL_F_SHARED)); } static inline void mpol_cond_put(struct mempolicy *pol) { if (mpol_needs_cond_ref(pol)) __mpol_put(pol); } extern struct mempolicy *__mpol_dup(struct mempolicy *pol); static inline struct mempolicy *mpol_dup(struct mempolicy *pol) { if (pol) pol = __mpol_dup(pol); return pol; } static inline void mpol_get(struct mempolicy *pol) { if (pol) atomic_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (a == b) return true; return __mpol_equal(a, b); } /* * Tree of shared policies for a shared memory region. */ struct shared_policy { struct rb_root root; rwlock_t lock; }; struct sp_node { struct rb_node nd; pgoff_t start, end; struct mempolicy *policy; }; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *mpol); void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx); struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; static inline void check_highest_zone(enum zone_type k) { if (k > policy_zone && k != ZONE_MOVABLE) policy_zone = k; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags); #ifdef CONFIG_TMPFS extern int mpol_parse_str(char *str, struct mempolicy **mpol); #endif extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ extern bool vma_migratable(struct vm_area_struct *vma); int mpol_misplaced(struct folio *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return (pol->mode == MPOL_PREFERRED_MANY); } extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); #else struct mempolicy {}; static inline struct mempolicy *get_task_policy(struct task_struct *p) { return NULL; } static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; } static inline void mpol_put(struct mempolicy *pol) { } static inline void mpol_cond_put(struct mempolicy *pol) { } static inline void mpol_get(struct mempolicy *pol) { } struct shared_policy {}; static inline void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { } static inline void mpol_free_shared_policy(struct shared_policy *sp) { } static inline struct mempolicy * mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { return NULL; } static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { *ilx = 0; return NULL; } static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { return 0; } static inline void numa_policy_init(void) { } static inline void numa_default_policy(void) { } static inline void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { } static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } static inline int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { *mpol = NULL; *nodemask = NULL; return 0; } static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return 0; } static inline void check_highest_zone(int k) { } #ifdef CONFIG_TMPFS static inline int mpol_parse_str(char *str, struct mempolicy **mpol) { return 1; /* error */ } #endif static inline int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { return -1; /* no node preference */ } static inline void mpol_put_task_policy(struct task_struct *task) { } static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; } #endif /* CONFIG_NUMA */ #endif |
10621 57 1858 2277 16677 1277 2902 6980 11940 6440 3120 382 2884 11650 415 2932 3063 52 2 2 201 128 6522 11572 6404 1082 5333 14 2471 301 40 9660 12147 104 2497 121 121 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H #include <linux/container_of.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/poison.h> #include <linux/const.h> #include <asm/barrier.h> /* * Circular doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as * sometimes we already know the next/prev entries and we can * generate better code by using them directly rather than * using the generic single-entry routines. */ #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) /** * INIT_LIST_HEAD - Initialize a list_head structure * @list: list_head structure to be initialized. * * Initializes the list_head to point to itself. If it is a list header, * the result is an empty list. */ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } #ifdef CONFIG_LIST_HARDENED #ifdef CONFIG_DEBUG_LIST # define __list_valid_slowpath #else # define __list_valid_slowpath __cold __preserve_most #endif /* * Performs the full set of list corruption checks before __list_add(). * On list corruption reports a warning, and returns false. */ extern bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new, struct list_head *prev, struct list_head *next); /* * Performs list corruption checks before __list_add(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_add_valid_or_report(). */ static __always_inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { /* * With the hardening version, elide checking if next and prev * are NULL, since the immediate dereference of them below would * result in a fault if NULL. * * With the reduced set of checks, we can afford to inline the * checks, which also gives the compiler a chance to elide some * of them completely if they can be proven at compile-time. If * one of the pre-conditions does not hold, the slow-path will * show a report which pre-condition failed. */ if (likely(next->prev == prev && prev->next == next && new != prev && new != next)) return true; ret = false; } ret &= __list_add_valid_or_report(new, prev, next); return ret; } /* * Performs the full set of list corruption checks before __list_del_entry(). * On list corruption reports a warning, and returns false. */ extern bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); /* * Performs list corruption checks before __list_del_entry(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_del_entry_valid_or_report(). */ static __always_inline bool __list_del_entry_valid(struct list_head *entry) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { struct list_head *prev = entry->prev; struct list_head *next = entry->next; /* * With the hardening version, elide checking if next and prev * are NULL, LIST_POISON1 or LIST_POISON2, since the immediate * dereference of them below would result in a fault. */ if (likely(prev->next == entry && next->prev == entry)) return true; ret = false; } ret &= __list_del_entry_valid_or_report(entry); return ret; } #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { return true; } static inline bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; WRITE_ONCE(prev->next, next); } /* * Delete a list entry and clear the 'prev' pointer. * * This is a special-purpose list clearing method used in the networking code * for lists allocated as per-cpu, where we don't want to incur the extra * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this * needs to check the node 'prev' pointer instead of calling list_empty(). */ static inline void __list_del_clearprev(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->prev = NULL; } static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) return; __list_del(entry->prev, entry->next); } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } /** * list_replace_init - replace old entry by new one and initialize the old one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position * @entry1: the location to place entry2 * @entry2: the location to place entry1 */ static inline void list_swap(struct list_head *entry1, struct list_head *entry2) { struct list_head *pos = entry2->prev; list_del(entry2); list_replace(entry1, entry2); if (pos == entry1) pos = entry2; list_add(entry1, pos); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } /** * list_bulk_move_tail - move a subsection of a list to its tail * @head: the head that will follow our entry * @first: first entry to move * @last: last entry to move, can be the same as first * * Move all entries between @first and including @last before @head. * All three entries must belong to the same linked list. */ static inline void list_bulk_move_tail(struct list_head *head, struct list_head *first, struct list_head *last) { first->prev->next = last->next; last->next->prev = first->prev; head->prev->next = first; first->prev = head->prev; last->next = head; head->prev = last; } /** * list_is_first -- tests whether @list is the first entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_is_head - tests whether @list is the list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_head(const struct list_head *list, const struct list_head *head) { return list == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. */ static inline int list_empty(const struct list_head *head) { return READ_ONCE(head->next) == head; } /** * list_del_init_careful - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. * * This is the same as list_del_init(), except designed to be used * together with list_empty_careful() in a way to guarantee ordering * of other memory operations. * * Any memory operations done before a list_del_init_careful() are * guaranteed to be visible after a list_empty_careful() test. */ static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_rotate_to_front() - Rotate list to specific item. * @list: The desired new front of the list. * @head: The head of the list. * * Rotates list so that @list becomes the new front of the list. */ static inline void list_rotate_to_front(struct list_head *list, struct list_head *head) { /* * Deletes the list head from the list denoted by @head and * places it as the tail of @list, this effectively rotates the * list so that @list is at the front. */ list_move_tail(head, list); } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. */ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. * */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next)) return; if (list_is_head(entry, head)) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } /** * list_cut_before - cut a list into two, before given entry * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * * This helper moves the initial part of @head, up to but * excluding @entry, from @head to @list. You should pass * in @entry an element you know is on @head. @list should * be an empty list or a list you do not care about losing * its data. * If @entry == @head, all entries on @head are moved to * @list. */ static inline void list_cut_before(struct list_head *list, struct list_head *head, struct list_head *entry) { if (head->next == entry) { INIT_LIST_HEAD(list); return; } list->next = head->next; list->next->prev = list; list->prev = entry->prev; list->prev->next = list; head->next = entry; entry->prev = head; } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->next); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_next_entry - get the next element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_next_entry_circular - get the next element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the last element (return the first element). * Note, that list is expected to be not empty. */ #define list_next_entry_circular(pos, head, member) \ (list_is_last(&(pos)->member, head) ? \ list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member)) /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) /** * list_prev_entry_circular - get the prev element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the first element (return the last element). * Note, that list is expected to be not empty. */ #define list_prev_entry_circular(pos, head, member) \ (list_is_first(&(pos)->member, head) ? \ list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member)) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_reverse - iterate backwards over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_reverse(pos, head) \ for (pos = (head)->prev; pos != (head); pos = pos->prev) /** * list_for_each_rcu - Iterate over a list in an RCU-safe fashion * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_rcu(pos, head) \ for (pos = rcu_dereference((head)->next); \ !list_is_head(pos, (head)); \ pos = rcu_dereference(pos->next)) /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; \ !list_is_head(pos, (head)); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ !list_is_head(pos, (head)); \ pos = n, n = pos->prev) /** * list_count_nodes - count nodes in the list * @head: the head for your list. */ static inline size_t list_count_nodes(struct list_head *head) { struct list_head *pos; size_t count = 0; list_for_each(pos, head) count++; return count; } /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ (&pos->member == (head)) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_from_reverse - iterate backwards over list of given type * from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). */ #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } /** * hlist_unhashed - Has node been removed from list and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed * state. For example, hlist_nulls_del_init_rcu() does leave the * node in unhashed state, but hlist_nulls_del() does not. */ static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } /** * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use * @h: Node to be checked * * This variant of hlist_unhashed() must be used in lockless contexts * to avoid potential load-tearing. The READ_ONCE() is paired with the * various WRITE_ONCE() in hlist helpers that are defined below. */ static inline int hlist_unhashed_lockless(const struct hlist_node *h) { return !READ_ONCE(h->pprev); } /** * hlist_empty - Is the specified hlist_head structure an empty hlist? * @h: Structure to check. */ static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (next) WRITE_ONCE(next->pprev, pprev); } /** * hlist_del - Delete the specified hlist_node from its list * @n: Node to delete. * * Note that this function leaves the node in hashed state. Use * hlist_del_init() or similar instead to unhash @n. */ static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } /** * hlist_del_init - Delete the specified hlist_node from its list and initialize * @n: Node to delete. * * Note that this function leaves the node in unhashed state. */ static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } /** * hlist_add_head - add a new entry at the beginning of the hlist * @n: new entry to be added * @h: hlist head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; WRITE_ONCE(n->next, first); if (first) WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); WRITE_ONCE(n->pprev, &h->first); } /** * hlist_add_before - add a new entry before the one specified * @n: new entry to be added * @next: hlist node to add it before, which must be non-NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); WRITE_ONCE(n->next, next); WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); } /** * hlist_add_behind - add a new entry after the one specified * @n: new entry to be added * @prev: hlist node to add it after, which must be non-NULL */ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { WRITE_ONCE(n->next, prev->next); WRITE_ONCE(prev->next, n); WRITE_ONCE(n->pprev, &prev->next); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } /** * hlist_add_fake - create a fake hlist consisting of a single headless node * @n: Node to make a fake list out of * * This makes @n appear to be its own predecessor on a headless hlist. * The point of this is to allow things like hlist_del() to work correctly * in cases where there is no list. */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } /** * hlist_fake: Is this node a fake hlist? * @h: Node to check for being a self-referential fake hlist. */ static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } /** * hlist_is_singular_node - is node the only element of the specified hlist? * @n: Node to check for singularity. * @h: Header for potentially singular list. * * Check whether the node is the only node of the head without * accessing head, thus avoiding unnecessary cache misses. */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) { return !n->next && n->pprev == &h->first; } /** * hlist_move_list - Move an hlist * @old: hlist_head for old list. * @new: hlist_head for new list. * * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } /** * hlist_splice_init() - move all entries from one list to another * @from: hlist_head from which entries will be moved * @last: last entry on the @from list * @to: hlist_head to which entries will be moved * * @to can be empty, @from must contain at least @last. */ static inline void hlist_splice_init(struct hlist_head *from, struct hlist_node *last, struct hlist_head *to) { if (to->first) to->first->pprev = &last->next; last->next = to->first; to->first = from->first; from->first->pprev = &to->first; from->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ }) /** * hlist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: a &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) #endif |
2246 2246 2246 58 58 58 58 58 58 58 57 58 58 58 58 58 58 3 3 58 58 58 3 58 58 75 75 75 75 75 75 75 75 75 75 75 75 75 75 74 74 75 75 75 75 10 10 5 10 10 10 10 2124 2124 2124 2124 2124 2211 2211 2211 2195 2195 2211 2211 2211 2211 2211 2211 20 2195 2211 2211 2211 2211 2211 2195 2195 2195 2211 2195 20 20 2195 75 75 2211 2211 2211 2211 2210 2211 2211 2211 2211 2211 2211 2195 2211 2211 214 214 2211 2211 2211 2210 2211 2211 2211 2211 2211 2211 2211 2211 75 75 2211 2211 2210 2211 75 2211 2211 2211 75 2211 2211 2211 2210 2211 2211 2211 2211 2211 2211 2211 2211 2211 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/ialloc.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * BSD ufs-inspired inode and directory allocation by * Stephen Tweedie (sct@redhat.com), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/time.h> #include <linux/fs.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/random.h> #include <linux/bitops.h> #include <linux/blkdev.h> #include <linux/cred.h> #include <asm/byteorder.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include <trace/events/ext4.h> /* * ialloc.c contains the inodes allocation and deallocation routines */ /* * The free inodes are managed by bitmaps. A file system contains several * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap * block for inodes, N blocks for the inode table and data blocks. * * The file system contains group descriptors which are located after the * super block. Each descriptor contains the number of the bitmap block and * the free blocks count in the block. */ /* * To avoid calling the atomic setbit hundreds or thousands of times, we only * need to use it within a single byte (to ensure we get endianness right). * We can use memset for the rest of the bitmap as there are no other users. */ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) { int i; if (start_bit >= end_bit) return; ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) ext4_set_bit(i, bitmap); if (i < end_bit) memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); } void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) { if (uptodate) { set_buffer_uptodate(bh); set_bitmap_uptodate(bh); } unlock_buffer(bh); put_bh(bh); } static int ext4_validate_inode_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return 0; if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; blk = ext4_inode_bitmap(sb, desc); if (!ext4_inode_bitmap_csum_verify(sb, desc, bh, EXT4_INODES_PER_GROUP(sb) / 8) || ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); return 0; } /* * Read the inode allocation bitmap for a given block_group, reading * into the specified slot in the superblock's bitmap cache. * * Return buffer_head of bitmap on success, or an ERR_PTR on error. */ static struct buffer_head * ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; int err; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_inode_bitmap(sb, desc); if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid inode bitmap blk %llu in " "block_group %u", bitmap_blk, block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_warning(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } if (bitmap_uptodate(bh)) goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); goto verify; } ext4_lock_group(sb, block_group); if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { if (block_group == 0) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Inode bitmap for bg 0 marked " "uninitialized"); err = -EFSCORRUPTED; goto out; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, * bitmap is also uptodate */ set_bitmap_uptodate(bh); unlock_buffer(bh); goto verify; } /* * submit the buffer_head for reading */ trace_ext4_load_inode_bitmap(sb, block_group); ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read); ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error_err(sb, EIO, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EIO); } verify: err = ext4_validate_inode_bitmap(sb, desc, block_group, bh); if (err) goto out; return bh; out: put_bh(bh); return ERR_PTR(err); } /* * NOTE! When we get the inode, we're the only people * that have access to it, and as such there are no * race conditions we have to worry about. The inode * is not on the hash-lists, and it cannot be reached * through the filesystem because the directory entry * has been deleted earlier. * * HOWEVER: we must make sure that we get no aliases, * which means that we have to call "clear_inode()" * _before_ we mark the inode not in use in the inode * bitmaps. Otherwise a newly created file might use * the same inode number (not actually the same pointer * though), and then we'd have two inodes sharing the * same inode number and space on the harddisk. */ void ext4_free_inode(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; int is_directory; unsigned long ino; struct buffer_head *bitmap_bh = NULL; struct buffer_head *bh2; ext4_group_t block_group; unsigned long bit; struct ext4_group_desc *gdp; struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; struct ext4_group_info *grp; if (!sb) { printk(KERN_ERR "EXT4-fs: %s:%d: inode on " "nonexistent device\n", __func__, __LINE__); return; } if (atomic_read(&inode->i_count) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", __func__, __LINE__, inode->i_ino, atomic_read(&inode->i_count)); return; } if (inode->i_nlink) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", __func__, __LINE__, inode->i_ino, inode->i_nlink); return; } sbi = EXT4_SB(sb); ino = inode->i_ino; ext4_debug("freeing inode %lu\n", ino); trace_ext4_free_inode(inode); dquot_initialize(inode); dquot_free_inode(inode); is_directory = S_ISDIR(inode->i_mode); /* Do this BEFORE marking the inode not in use or returning an error */ ext4_clear_inode(inode); es = sbi->s_es; if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { ext4_error(sb, "reserved or nonexistent inode %lu", ino); goto error_return; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); /* Don't bother if the inode bitmap is corrupt. */ if (IS_ERR(bitmap_bh)) { fatal = PTR_ERR(bitmap_bh); bitmap_bh = NULL; goto error_return; } if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, block_group); if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { fatal = -EFSCORRUPTED; goto error_return; } } BUFFER_TRACE(bitmap_bh, "get_write_access"); fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh, EXT4_JTR_NONE); if (fatal) goto error_return; fatal = -ESRCH; gdp = ext4_get_group_desc(sb, block_group, &bh2); if (gdp) { BUFFER_TRACE(bh2, "get_write_access"); fatal = ext4_journal_get_write_access(handle, sb, bh2, EXT4_JTR_NONE); } ext4_lock_group(sb, block_group); cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); if (fatal || !cleared) { ext4_unlock_group(sb, block_group); goto out; } count = ext4_free_inodes_count(sb, gdp) + 1; ext4_free_inodes_set(sb, gdp, count); if (is_directory) { count = ext4_used_dirs_count(sb, gdp) - 1; ext4_used_dirs_set(sb, gdp, count); if (percpu_counter_initialized(&sbi->s_dirs_counter)) percpu_counter_dec(&sbi->s_dirs_counter); } ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) percpu_counter_inc(&sbi->s_freeinodes_counter); if (sbi->s_log_groups_per_flex) { struct flex_groups *fg; fg = sbi_array_rcu_deref(sbi, s_flex_groups, ext4_flex_group(sbi, block_group)); atomic_inc(&fg->free_inodes); if (is_directory) atomic_dec(&fg->used_dirs); } BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); out: if (cleared) { BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); } error_return: brelse(bitmap_bh); ext4_std_error(sb, fatal); } struct orlov_stats { __u64 free_clusters; __u32 free_inodes; __u32 used_dirs; }; /* * Helper function for Orlov's allocator; returns critical information * for a particular block group or flex_bg. If flex_size is 1, then g * is a block group number; otherwise it is flex_bg number. */ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, int flex_size, struct orlov_stats *stats) { struct ext4_group_desc *desc; if (flex_size > 1) { struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb), s_flex_groups, g); stats->free_inodes = atomic_read(&fg->free_inodes); stats->free_clusters = atomic64_read(&fg->free_clusters); stats->used_dirs = atomic_read(&fg->used_dirs); return; } desc = ext4_get_group_desc(sb, g, NULL); if (desc) { stats->free_inodes = ext4_free_inodes_count(sb, desc); stats->free_clusters = ext4_free_group_clusters(sb, desc); stats->used_dirs = ext4_used_dirs_count(sb, desc); } else { stats->free_inodes = 0; stats->free_clusters = 0; stats->used_dirs = 0; } } /* * Orlov's allocator for directories. * * We always try to spread first-level directories. * * If there are blockgroups with both free inodes and free clusters counts * not worse than average we return one with smallest directory count. * Otherwise we simply return a random group. * * For the rest rules look so: * * It's OK to put directory into a group unless * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or * it has too few free clusters left (min_clusters) or * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). */ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4_group_t *group, umode_t mode, const struct qstr *qstr) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei, grp_free; ext4_fsblk_t freec, avefreec; unsigned int ndirs; int max_dirs, min_inodes; ext4_grpblk_t min_clusters; ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; int flex_size = ext4_flex_bg_size(sbi); struct dx_hash_info hinfo; ngroups = real_ngroups; if (flex_size > 1) { ngroups = (real_ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; parent_group >>= sbi->s_log_groups_per_flex; } freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter); avefreec = freec; do_div(avefreec, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if (S_ISDIR(mode) && ((parent == d_inode(sb->s_root)) || (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { int best_ndir = inodes_per_group; int ret = -1; if (qstr) { hinfo.hash_version = DX_HASH_HALF_MD4; hinfo.seed = sbi->s_hash_seed; ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); parent_group = hinfo.hash % ngroups; } else parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); if (!stats.free_inodes) continue; if (stats.used_dirs >= best_ndir) continue; if (stats.free_inodes < avefreei) continue; if (stats.free_clusters < avefreec) continue; grp = g; ret = 0; best_ndir = stats.used_dirs; } if (ret) goto fallback; found_flex_bg: if (flex_size == 1) { *group = grp; return 0; } /* * We pack inodes at the beginning of the flexgroup's * inode tables. Block allocation decisions will do * something similar, although regular files will * start at 2nd block group of the flexgroup. See * ext4_ext_find_goal() and ext4_find_near(). */ grp *= flex_size; for (i = 0; i < flex_size; i++) { if (grp+i >= real_ngroups) break; desc = ext4_get_group_desc(sb, grp+i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { *group = grp+i; return 0; } } goto fallback; } max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; min_inodes = avefreei - inodes_per_group*flex_size / 4; if (min_inodes < 1) min_inodes = 1; min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; /* * Start looking in the flex group where we last allocated an * inode for this parent directory */ if (EXT4_I(parent)->i_last_alloc_group != ~0) { parent_group = EXT4_I(parent)->i_last_alloc_group; if (flex_size > 1) parent_group >>= sbi->s_log_groups_per_flex; } for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; get_orlov_stats(sb, grp, flex_size, &stats); if (stats.used_dirs >= max_dirs) continue; if (stats.free_inodes < min_inodes) continue; if (stats.free_clusters < min_clusters) continue; goto found_flex_bg; } fallback: ngroups = real_ngroups; avefreei = freei / ngroups; fallback_retry: parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); if (desc) { grp_free = ext4_free_inodes_count(sb, desc); if (grp_free && grp_free >= avefreei) { *group = grp; return 0; } } } if (avefreei) { /* * The free-inodes counter is approximate, and for really small * filesystems the above test can fail to find any blockgroups */ avefreei = 0; goto fallback_retry; } return -1; } static int find_group_other(struct super_block *sb, struct inode *parent, ext4_group_t *group, umode_t mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); /* * Try to place the inode is the same flex group as its * parent. If we can't find space, use the Orlov algorithm to * find another flex group, and store that information in the * parent directory's inode information so that use that flex * group for future allocations. */ if (flex_size > 1) { int retry = 0; try_again: parent_group &= ~(flex_size-1); last = parent_group + flex_size; if (last > ngroups) last = ngroups; for (i = parent_group; i < last; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { *group = i; return 0; } } if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { retry = 1; parent_group = EXT4_I(parent)->i_last_alloc_group; goto try_again; } /* * If this didn't work, use the Orlov search algorithm * to find a new flex group; we pass in the mode to * avoid the topdir algorithms. */ *group = parent_group + flex_size; if (*group > ngroups) *group = 0; return find_group_orlov(sb, parent, group, mode, NULL); } /* * Try to place the inode in its parent directory */ *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && ext4_free_group_clusters(sb, desc)) return 0; /* * We're going to place this inode in a different blockgroup from its * parent. We want to cause files in a common directory to all land in * the same blockgroup. But we want files which are in a different * directory which shares a blockgroup with our parent to land in a * different blockgroup. * * So add our directory's i_ino into the starting point for the hash. */ *group = (*group + parent->i_ino) % ngroups; /* * Use a quadratic hash to find a group with a free inode and some free * blocks. */ for (i = 1; i < ngroups; i <<= 1) { *group += i; if (*group >= ngroups) *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && ext4_free_group_clusters(sb, desc)) return 0; } /* * That failed: try linear search for a free inode, even if that group * has no free blocks. */ *group = parent_group; for (i = 0; i < ngroups; i++) { if (++*group >= ngroups) *group = 0; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc)) return 0; } return -1; } /* * In no journal mode, if an inode has recently been deleted, we want * to avoid reusing it until we're reasonably sure the inode table * block has been written back to disk. (Yes, these values are * somewhat arbitrary...) */ #define RECENTCY_MIN 60 #define RECENTCY_DIRTY 300 static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) { struct ext4_group_desc *gdp; struct ext4_inode *raw_inode; struct buffer_head *bh; int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int offset, ret = 0; int recentcy = RECENTCY_MIN; u32 dtime, now; gdp = ext4_get_group_desc(sb, group, NULL); if (unlikely(!gdp)) return 0; bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) + (ino / inodes_per_block)); if (!bh || !buffer_uptodate(bh)) /* * If the block is not in the buffer cache, then it * must have been written out. */ goto out; offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); raw_inode = (struct ext4_inode *) (bh->b_data + offset); /* i_dtime is only 32 bits on disk, but we only care about relative * times in the range of a few minutes (i.e. long enough to sync a * recently-deleted inode to disk), so using the low 32 bits of the * clock (a 68 year range) is enough, see time_before32() */ dtime = le32_to_cpu(raw_inode->i_dtime); now = ktime_get_real_seconds(); if (buffer_dirty(bh)) recentcy += RECENTCY_DIRTY; if (dtime && time_before32(dtime, now) && time_before32(now, dtime + recentcy)) ret = 1; out: brelse(bh); return ret; } static int find_inode_bit(struct super_block *sb, ext4_group_t group, struct buffer_head *bitmap, unsigned long *ino) { bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL; unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb); next: *ino = ext4_find_next_zero_bit((unsigned long *) bitmap->b_data, EXT4_INODES_PER_GROUP(sb), *ino); if (*ino >= EXT4_INODES_PER_GROUP(sb)) goto not_found; if (check_recently_deleted && recently_deleted(sb, group, *ino)) { recently_deleted_ino = *ino; *ino = *ino + 1; if (*ino < EXT4_INODES_PER_GROUP(sb)) goto next; goto not_found; } return 1; not_found: if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb)) return 0; /* * Not reusing recently deleted inodes is mostly a preference. We don't * want to report ENOSPC or skew allocation patterns because of that. * So return even recently deleted inode if we could find better in the * given range. */ *ino = recently_deleted_ino; return 1; } int ext4_mark_inode_used(struct super_block *sb, int ino) { unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL; struct ext4_group_desc *gdp; ext4_group_t group; int bit; int err = -EFSCORRUPTED; if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) goto out; group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); if (IS_ERR(inode_bitmap_bh)) return PTR_ERR(inode_bitmap_bh); if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) { err = 0; goto out; } gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !group_desc_bh) { err = -EINVAL; goto out; } ext4_set_bit(bit, inode_bitmap_bh->b_data); BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } err = sync_dirty_buffer(inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; block_bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(block_bitmap_bh)) { err = PTR_ERR(block_bitmap_bh); goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh); sync_dirty_buffer(block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); brelse(block_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; ext4_lock_group(sb, group); /* while we modify the bg desc */ free = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); free = 0; } /* * Check the relative inode number against the last used * relative inode number in this group. if it is greater * we need to update the bg_itable_unused count */ if (bit >= free) ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - bit - 1)); } else { ext4_lock_group(sb, group); } ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (ext4_has_group_desc_csum(sb)) { ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh); sync_dirty_buffer(group_desc_bh); out: return err; } static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, bool encrypt) { struct super_block *sb = dir->i_sb; int nblocks = 0; #ifdef CONFIG_EXT4_FS_POSIX_ACL struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(p)) return PTR_ERR(p); if (p) { int acl_size = p->a_count * sizeof(ext4_acl_entry); nblocks += (S_ISDIR(mode) ? 2 : 1) * __ext4_xattr_set_credits(sb, NULL /* inode */, NULL /* block_bh */, acl_size, true /* is_create */); posix_acl_release(p); } #endif #ifdef CONFIG_SECURITY { int num_security_xattrs = 1; #ifdef CONFIG_INTEGRITY num_security_xattrs++; #endif /* * We assume that security xattrs are never more than 1k. * In practice they are under 128 bytes. */ nblocks += num_security_xattrs * __ext4_xattr_set_credits(sb, NULL /* inode */, NULL /* block_bh */, 1024, true /* is_create */); } #endif if (encrypt) nblocks += __ext4_xattr_set_credits(sb, NULL /* inode */, NULL /* block_bh */, FSCRYPT_SET_CONTEXT_MAX_SIZE, true /* is_create */); return nblocks; } /* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of * the groups with above-average free space, that group with the fewest * directories already is chosen. * * For other inodes, search forward from the parent directory's block * group to find a free inode. */ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; struct buffer_head *group_desc_bh; ext4_group_t ngroups, group = 0; unsigned long ino = 0; struct inode *inode; struct ext4_group_desc *gdp = NULL; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; int ret2, err; struct inode *ret; ext4_group_t i; ext4_group_t flex_group; struct ext4_group_info *grp = NULL; bool encrypt = false; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); sb = dir->i_sb; sbi = EXT4_SB(sb); if (unlikely(ext4_forced_shutdown(sb))) return ERR_PTR(-EIO); ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); /* * Initialize owners and quota early so that we don't have to account * for quota initialization worst case in standard inode creating * transaction */ if (owner) { inode->i_mode = mode; i_uid_write(inode, owner[0]); i_gid_write(inode, owner[1]); } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; inode_fsuid_set(inode, idmap); inode->i_gid = dir->i_gid; } else inode_init_owner(idmap, inode, dir, mode); if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) ei->i_projid = EXT4_I(dir)->i_projid; else ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); if (!(i_flags & EXT4_EA_INODE_FL)) { err = fscrypt_prepare_new_inode(dir, inode, &encrypt); if (err) goto out; } err = dquot_initialize(inode); if (err) goto out; if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt); if (ret2 < 0) { err = ret2; goto out; } nblocks += ret2; } if (!goal) goal = sbi->s_inode_goal; if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); ret2 = 0; goto got_group; } if (S_ISDIR(mode)) ret2 = find_group_orlov(sb, dir, &group, mode, qstr); else ret2 = find_group_other(sb, dir, &group, mode); got_group: EXT4_I(dir)->i_last_alloc_group = group; err = -ENOSPC; if (ret2 == -1) goto out; /* * Normally we will only go through one pass of this loop, * unless we get unlucky and it turns out the group we selected * had its last inode grabbed by someone else. */ for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) goto out; /* * Check free inodes count before loading bitmap. */ if (ext4_free_inodes_count(sb, gdp) == 0) goto next_group; if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, group); /* * Skip groups with already-known suspicious inode * tables */ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) goto next_group; } brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); /* Skip groups with suspicious inode tables */ if (((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || IS_ERR(inode_bitmap_bh)) { inode_bitmap_bh = NULL; goto next_group; } repeat_in_this_group: ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (!ret2) goto next_group; if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto next_group; } if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(NULL, dir->i_sb, line_no, handle_type, nblocks, 0, ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); goto out; } } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh, EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); if (ret2) { /* Someone already took the bit. Repeat the search * with lock held. */ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (ret2) { ext4_set_bit(ino, inode_bitmap_bh->b_data); ret2 = 0; } else { ret2 = 1; /* we didn't grab the inode */ } } ext4_unlock_group(sb, group); ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! */ if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; next_group: if (++group == ngroups) group = 0; } err = -ENOSPC; goto out; got: BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } BUFFER_TRACE(group_desc_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, group_desc_bh, EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; block_bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(block_bitmap_bh)) { err = PTR_ERR(block_bitmap_bh); goto out; } BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh, EXT4_JTR_NONE); if (err) { brelse(block_bitmap_bh); ext4_std_error(sb, err); goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); brelse(block_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; struct ext4_group_info *grp = NULL; if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, group); if (!grp) { err = -EFSCORRUPTED; goto out; } down_read(&grp->alloc_sem); /* * protect vs itable * lazyinit */ } ext4_lock_group(sb, group); /* while we modify the bg desc */ free = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); free = 0; } /* * Check the relative inode number against the last used * relative inode number in this group. if it is greater * we need to update the bg_itable_unused count */ if (ino > free) ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - ino)); if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) up_read(&grp->alloc_sem); } else { ext4_lock_group(sb, group); } ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (S_ISDIR(mode)) { ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); if (sbi->s_log_groups_per_flex) { ext4_group_t f = ext4_flex_group(sbi, group); atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups, f)->used_dirs); } } if (ext4_has_group_desc_csum(sb)) { ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); if (err) { ext4_std_error(sb, err); goto out; } percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups, flex_group)->free_inodes); } inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; simple_inode_init_ts(inode); ei->i_crtime = inode_get_mtime(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; ei->i_disksize = 0; /* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_flags |= i_flags; ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; ei->i_last_alloc_group = ~0; ext4_set_inode_flags(inode, true); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { /* * Likely a bitmap corruption causing inode to be allocated * twice. */ err = -EIO; ext4_error(sb, "failed to insert inode %lu: doubly allocated?", inode->i_ino); ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); } ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; if (ext4_has_feature_inline_data(sb) && (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); if (err) goto fail_drop; /* * Since the encryption xattr will always be unique, create it first so * that it's less likely to end up in an external xattr block and * prevent its deduplication. */ if (encrypt) { err = fscrypt_set_context(inode, handle); if (err) goto fail_free_drop; } if (!(ei->i_flags & EXT4_EA_INODE_FL)) { err = ext4_init_acl(handle, inode, dir); if (err) goto fail_free_drop; err = ext4_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; } if (ext4_has_feature_extents(sb)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } if (ext4_handle_valid(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; ei->i_datasync_tid = handle->h_transaction->t_tid; } err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); goto fail_free_drop; } ext4_debug("allocating inode %lu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); brelse(inode_bitmap_bh); return ret; fail_free_drop: dquot_free_inode(inode); fail_drop: clear_nlink(inode); unlock_new_inode(inode); out: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; iput(inode); brelse(inode_bitmap_bh); return ERR_PTR(err); } /* Verify that we are loading a valid orphan from disk */ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) { unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); ext4_group_t block_group; int bit; struct buffer_head *bitmap_bh = NULL; struct inode *inode = NULL; int err = -EFSCORRUPTED; if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) goto bad_orphan; block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) return ERR_CAST(bitmap_bh); /* Having the inode bit set should be a 100% indicator that this * is a valid orphan (no e2fsck run on fs). Orphans also include * inodes that were being truncated, so we can't check i_nlink==0. */ if (!ext4_test_bit(bit, bitmap_bh->b_data)) goto bad_orphan; inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error_err(sb, -err, "couldn't read orphan inode %lu (err %d)", ino, err); brelse(bitmap_bh); return inode; } /* * If the orphans has i_nlinks > 0 then it should be able to * be truncated, otherwise it won't be removed from the orphan * list during processing and an infinite loop will result. * Similarly, it must not be a bad inode. */ if ((inode->i_nlink && !ext4_can_truncate(inode)) || is_bad_inode(inode)) goto bad_orphan; if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); return inode; bad_orphan: ext4_error(sb, "bad orphan inode %lu", ino); if (bitmap_bh) printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); if (inode) { printk(KERN_ERR "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); printk(KERN_ERR "max_ino=%lu\n", max_ino); printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; iput(inode); } brelse(bitmap_bh); return ERR_PTR(err); } unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; ext4_group_t i, ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; unsigned long bitmap_count, x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; gdp = NULL; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += ext4_free_inodes_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_inode_bitmap(sb, i); if (IS_ERR(bitmap_bh)) { bitmap_bh = NULL; continue; } x = ext4_count_free(bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_inodes: " "stored = %u, computed = %lu, %lu\n", le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); return desc_count; #else desc_count = 0; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += ext4_free_inodes_count(sb, gdp); cond_resched(); } return desc_count; #endif } /* Called at mount-time, super-block is locked */ unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; ext4_group_t i, ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; count += ext4_used_dirs_count(sb, gdp); } return count; } /* * Zeroes not yet zeroed inode table - just write zeroes through the whole * inode table. Must be called without any spinlock held. The only place * where it is called from on active part of filesystem is ext4lazyinit * thread, so we do not need any special locks, however we have to prevent * inode allocation from the current group, so we take alloc_sem lock, to * block ext4_new_inode() until we are finished. */ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; struct buffer_head *group_desc_bh; handle_t *handle; ext4_fsblk_t blk; int num, ret = 0, used_blks = 0; unsigned long used_inos = 0; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !grp) goto out; /* * We do not need to lock this, because we are the only one * handling this flag. */ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) goto out; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } down_write(&grp->alloc_sem); /* * If inode bitmap was already initialized there may be some * used inodes so we need to skip blocks with used inodes in * inode table. */ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { used_inos = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block); /* Bogus inode unused count? */ if (used_blks < 0 || used_blks > sbi->s_itb_per_group) { ext4_error(sb, "Something is wrong with group %u: " "used itable blocks: %d; " "itable unused count: %u", group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; goto err_out; } used_inos += group * EXT4_INODES_PER_GROUP(sb); /* * Are there some uninitialized inodes in the inode table * before the first normal inode? */ if ((used_blks != sbi->s_itb_per_group) && (used_inos < EXT4_FIRST_INO(sb))) { ext4_error(sb, "Something is wrong with group %u: " "itable unused count: %u; " "itables initialized count: %ld", group, ext4_itable_unused_count(sb, gdp), used_inos); ret = 1; goto err_out; } } blk = ext4_inode_table(sb, gdp) + used_blks; num = sbi->s_itb_per_group - used_blks; BUFFER_TRACE(group_desc_bh, "get_write_access"); ret = ext4_journal_get_write_access(handle, sb, group_desc_bh, EXT4_JTR_NONE); if (ret) goto err_out; /* * Skip zeroout if the inode table is full. But we set the ZEROED * flag anyway, because obviously, when it is full it does not need * further zeroing. */ if (unlikely(num == 0)) goto skip_zeroout; ext4_debug("going to zero out inode table in group %d\n", group); ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); if (ret < 0) goto err_out; if (barrier) blkdev_issue_flush(sb->s_bdev); skip_zeroout: ext4_lock_group(sb, group); gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); ret = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); err_out: up_write(&grp->alloc_sem); ext4_journal_stop(handle); out: return ret; } |
4790 1292 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BH_H #define _LINUX_BH_H #include <linux/instruction_pointer.h> #include <linux/preempt.h> #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS) extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); #else static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { preempt_count_add(cnt); barrier(); } #endif static inline void local_bh_disable(void) { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } extern void _local_bh_enable(void); extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt); static inline void local_bh_enable_ip(unsigned long ip) { __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET); } static inline void local_bh_enable(void) { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } #ifdef CONFIG_PREEMPT_RT extern bool local_bh_blocked(void); #else static inline bool local_bh_blocked(void) { return false; } #endif #endif /* _LINUX_BH_H */ |
91 377 377 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/truncate.h * * Common inline functions needed for truncate support */ /* * Truncate blocks that were not used by write. We have to truncate the * pagecache as well so that corresponding buffers get properly unmapped. */ static inline void ext4_truncate_failed_write(struct inode *inode) { struct address_space *mapping = inode->i_mapping; /* * We don't need to call ext4_break_layouts() because the blocks we * are truncating were never visible to userspace. */ filemap_invalidate_lock(mapping); truncate_inode_pages(mapping, inode->i_size); ext4_truncate(inode); filemap_invalidate_unlock(mapping); } /* * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. */ static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) { ext4_lblk_t needed; needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); /* Give ourselves just enough room to cope with inodes in which * i_blocks is corrupt: we've seen disk corruptions in the past * which resulted in random data in an inode which looked enough * like a regular file for ext4 to try to delete it. Things * will go a bit crazy if that happens, but at least we should * try not to panic the whole kernel. */ if (needed < 2) needed = 2; /* But we need to bound the transaction so we don't overflow the * journal. */ if (needed > EXT4_MAX_TRANS_DATA) needed = EXT4_MAX_TRANS_DATA; return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; } |
1410 1492 1492 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_IOMAP_H #define LINUX_IOMAP_H 1 #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/blk_types.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/mm_types.h> #include <linux/blkdev.h> struct address_space; struct fiemap_extent_info; struct inode; struct iomap_iter; struct iomap_dio; struct iomap_writepage_ctx; struct iov_iter; struct kiocb; struct page; struct vm_area_struct; struct vm_fault; /* * Types of block ranges for iomap mappings: */ #define IOMAP_HOLE 0 /* no blocks allocated, need allocation */ #define IOMAP_DELALLOC 1 /* delayed allocation blocks */ #define IOMAP_MAPPED 2 /* blocks allocated at @addr */ #define IOMAP_UNWRITTEN 3 /* blocks allocated at @addr in unwritten state */ #define IOMAP_INLINE 4 /* data inline in the inode */ /* * Flags reported by the file system from iomap_begin: * * IOMAP_F_NEW indicates that the blocks have been newly allocated and need * zeroing for areas that no data is copied to. * * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access * written data and requires fdatasync to commit them to persistent storage. * This needs to take into account metadata changes that *may* be made at IO * completion, such as file size updates from direct IO. * * IOMAP_F_SHARED indicates that the blocks are shared, and will need to be * unshared as part a write. * * IOMAP_F_MERGED indicates that the iomap contains the merge of multiple block * mappings. * * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of * buffer heads for this mapping. * * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent * rather than a file data extent. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) #define IOMAP_F_SHARED (1U << 2) #define IOMAP_F_MERGED (1U << 3) #ifdef CONFIG_BUFFER_HEAD #define IOMAP_F_BUFFER_HEAD (1U << 4) #else #define IOMAP_F_BUFFER_HEAD 0 #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) /* * Flags set by the core iomap code during operations: * * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size * has changed as the result of this write operation. * * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file * range it covers needs to be remapped by the high level before the operation * can proceed. */ #define IOMAP_F_SIZE_CHANGED (1U << 8) #define IOMAP_F_STALE (1U << 9) /* * Flags from 0x1000 up are for file system specific usage: */ #define IOMAP_F_PRIVATE (1U << 12) /* * Magic value for addr: */ #define IOMAP_NULL_ADDR -1ULL /* addr is not valid */ struct iomap_folio_ops; struct iomap { u64 addr; /* disk offset of mapping, bytes */ loff_t offset; /* file offset of mapping, bytes */ u64 length; /* length of mapping, bytes */ u16 type; /* type of mapping */ u16 flags; /* flags for mapping */ struct block_device *bdev; /* block device for I/O */ struct dax_device *dax_dev; /* dax_dev for dax operations */ void *inline_data; void *private; /* filesystem private */ const struct iomap_folio_ops *folio_ops; u64 validity_cookie; /* used with .iomap_valid() */ }; static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } /* * Returns the inline data pointer for logical offset @pos. */ static inline void *iomap_inline_data(const struct iomap *iomap, loff_t pos) { return iomap->inline_data + pos - iomap->offset; } /* * Check if the mapping's length is within the valid range for inline data. * This is used to guard against accessing data beyond the page inline_data * points at. */ static inline bool iomap_inline_data_valid(const struct iomap *iomap) { return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data); } /* * When a filesystem sets folio_ops in an iomap mapping it returns, get_folio * and put_folio will be called for each folio written to. This only applies * to buffered writes as unbuffered writes will not typically have folios * associated with them. * * When get_folio succeeds, put_folio will always be called to do any * cleanup work necessary. put_folio is responsible for unlocking and putting * @folio. */ struct iomap_folio_ops { struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos, unsigned len); void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied, struct folio *folio); /* * Check that the cached iomap still maps correctly to the filesystem's * internal extent map. FS internal extent maps can change while iomap * is iterating a cached iomap, so this hook allows iomap to detect that * the iomap needs to be refreshed during a long running write * operation. * * The filesystem can store internal state (e.g. a sequence number) in * iomap->validity_cookie when the iomap is first mapped to be able to * detect changes between mapping time and whenever .iomap_valid() is * called. * * This is called with the folio over the specified file position held * locked by the iomap code. */ bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap); }; /* * Flags for iomap_begin / iomap_end. No flag implies a read. */ #define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */ #define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ #define IOMAP_FAULT (1 << 3) /* mapping for page fault */ #define IOMAP_DIRECT (1 << 4) /* direct I/O */ #define IOMAP_NOWAIT (1 << 5) /* do not block */ #define IOMAP_OVERWRITE_ONLY (1 << 6) /* only pure overwrites allowed */ #define IOMAP_UNSHARE (1 << 7) /* unshare_file_range */ #ifdef CONFIG_FS_DAX #define IOMAP_DAX (1 << 8) /* DAX mapping */ #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ struct iomap_ops { /* * Return the existing mapping at pos, or reserve space starting at * pos for up to length, as long as we can do it as a single mapping. * The actual length is returned in iomap->length. */ int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap); /* * Commit and/or unreserve space previous allocated using iomap_begin. * Written indicates the length of the successful write operation which * needs to be commited, while the rest needs to be unreserved. * Written might be zero if no data was written. */ int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap); }; /** * struct iomap_iter - Iterate through a range of a file * @inode: Set at the start of the iteration and should not change. * @pos: The current file position we are operating on. It is updated by * calls to iomap_iter(). Treat as read-only in the body. * @len: The remaining length of the file segment we're operating on. * It is updated at the same time as @pos. * @processed: The number of bytes processed by the body in the most recent * iteration, or a negative errno. 0 causes the iteration to stop. * @flags: Zero or more of the iomap_begin flags above. * @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations */ struct iomap_iter { struct inode *inode; loff_t pos; u64 len; s64 processed; unsigned flags; struct iomap iomap; struct iomap srcmap; void *private; }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); /** * iomap_length - length of the current iomap iteration * @iter: iteration structure * * Returns the length that the operation applies to for the current iteration. */ static inline u64 iomap_length(const struct iomap_iter *iter) { u64 end = iter->iomap.offset + iter->iomap.length; if (iter->srcmap.type != IOMAP_HOLE) end = min(end, iter->srcmap.offset + iter->srcmap.length); return min(iter->len, end - iter->pos); } /** * iomap_iter_srcmap - return the source map for the current iomap iteration * @i: iteration structure * * Write operations on file systems with reflink support might require a * source and a destination map. This function retourns the source map * for a given operation, which may or may no be identical to the destination * map in &i->iomap. */ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) { if (i->srcmap.type != IOMAP_HOLE) return &i->srcmap; return &i->iomap; } ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_file_buffered_write_punch_delalloc(struct inode *inode, struct iomap *iomap, loff_t pos, loff_t length, ssize_t written, int (*punch)(struct inode *inode, loff_t pos, loff_t length)); int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops); loff_t iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops); sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); /* * Structure for writeback I/O completions. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_type; u16 io_flags; /* IOMAP_F_* */ u32 io_folios; /* folios added to ioend */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio *io_bio; /* bio being built */ struct bio io_inline_bio; /* MUST BE LAST! */ }; struct iomap_writeback_ops { /* * Required, maps the blocks so that writeback can be performed on * the range starting at offset. */ int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset); /* * Optional, allows the file systems to perform actions just before * submitting the bio and/or override the bio end_io handler for complex * operations like copy on write extent manipulation or unwritten extent * conversions. */ int (*prepare_ioend)(struct iomap_ioend *ioend, int status); /* * Optional, allows the file system to discard state on a page where * we failed to submit any I/O. */ void (*discard_folio)(struct folio *folio, loff_t pos); }; struct iomap_writepage_ctx { struct iomap iomap; struct iomap_ioend *ioend; const struct iomap_writeback_ops *ops; }; void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends); void iomap_sort_ioends(struct list_head *ioend_list); int iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, struct iomap_writepage_ctx *wpc, const struct iomap_writeback_ops *ops); /* * Flags for direct I/O ->end_io: */ #define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */ #define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */ struct iomap_dio_ops { int (*end_io)(struct kiocb *iocb, ssize_t size, int error, unsigned flags); void (*submit_io)(const struct iomap_iter *iter, struct bio *bio, loff_t file_offset); /* * Filesystems wishing to attach private information to a direct io bio * must provide a ->submit_io method that attaches the additional * information to the bio and changes the ->bi_end_io callback to a * custom function. This function should, at a minimum, perform any * relevant post-processing of the bio and end with a call to * iomap_dio_bio_end_io. */ struct bio_set *bio_set; }; /* * Wait for the I/O to complete in iomap_dio_rw even if the kiocb is not * synchronous. */ #define IOMAP_DIO_FORCE_WAIT (1 << 0) /* * Do not allocate blocks or zero partial blocks, but instead fall back to * the caller by returning -EAGAIN. Used to optimize direct I/O writes that * are not aligned to the file system block size. */ #define IOMAP_DIO_OVERWRITE_ONLY (1 << 1) /* * When a page fault occurs, return a partial synchronous result and allow * the caller to retry the rest of the operation after dealing with the page * fault. */ #define IOMAP_DIO_PARTIAL (1 << 2) ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); ssize_t iomap_dio_complete(struct iomap_dio *dio); void iomap_dio_bio_end_io(struct bio *bio); #ifdef CONFIG_SWAP struct file; struct swap_info_struct; int iomap_swapfile_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *pagespan, const struct iomap_ops *ops); #else # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) #endif /* CONFIG_SWAP */ #endif /* LINUX_IOMAP_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/posix_acl.h (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org> */ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H #include <linux/bug.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <uapi/linux/posix_acl.h> struct user_namespace; struct posix_acl_entry { short e_tag; unsigned short e_perm; union { kuid_t e_uid; kgid_t e_gid; }; }; struct posix_acl { refcount_t a_refcount; struct rcu_head a_rcu; unsigned int a_count; struct posix_acl_entry a_entries[]; }; #define FOREACH_ACL_ENTRY(pa, acl, pe) \ for(pa=(acl)->a_entries, pe=pa+(acl)->a_count; pa<pe; pa++) /* * Duplicate an ACL handle. */ static inline struct posix_acl * posix_acl_dup(struct posix_acl *acl) { if (acl) refcount_inc(&acl->a_refcount); return acl; } /* * Free an ACL handle. */ static inline void posix_acl_release(struct posix_acl *acl) { if (acl && refcount_dec_and_test(&acl->a_refcount)) kfree_rcu(acl, a_rcu); } /* posix_acl.c */ extern void posix_acl_init(struct posix_acl *, int); extern struct posix_acl *posix_acl_alloc(int, gfp_t); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); int set_posix_acl(struct mnt_idmap *, struct dentry *, int, struct posix_acl *); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags); #ifdef CONFIG_FS_POSIX_ACL int posix_acl_chmod(struct mnt_idmap *, struct dentry *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); int posix_acl_update_mode(struct mnt_idmap *, struct inode *, umode_t *, struct posix_acl **); int simple_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); struct posix_acl *get_cached_acl(struct inode *inode, int type); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); void forget_cached_acl(struct inode *inode, int type); void forget_all_cached_acls(struct inode *inode); int posix_acl_valid(struct user_namespace *, const struct posix_acl *); int posix_acl_permission(struct mnt_idmap *, struct inode *, const struct posix_acl *, int); static inline void cache_no_acl(struct inode *inode) { inode->i_acl = NULL; inode->i_default_acl = NULL; } int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size); #else static inline int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { return 0; } #define simple_set_acl NULL static inline int simple_acl_create(struct inode *dir, struct inode *inode) { return 0; } static inline void cache_no_acl(struct inode *inode) { } static inline int posix_acl_create(struct inode *inode, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { *default_acl = *acl = NULL; return 0; } static inline void forget_all_cached_acls(struct inode *inode) { } static inline int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *acl) { return -EOPNOTSUPP; } static inline struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ERR_PTR(-EOPNOTSUPP); } static inline int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return -EOPNOTSUPP; } static inline int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size) { return 0; } #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_inode_acl(struct inode *inode, int type); #endif /* __LINUX_POSIX_ACL_H */ |
213 213 210 210 210 213 213 213 210 207 207 213 213 213 213 207 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 | // SPDX-License-Identifier: GPL-2.0-or-later /* * fs/eventpoll.c (Efficient event retrieval implementation) * Copyright (C) 2001,...,2009 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #include <linux/init.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/signal.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/string.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/rbtree.h> #include <linux/wait.h> #include <linux/eventpoll.h> #include <linux/mount.h> #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> #include <linux/uaccess.h> #include <asm/io.h> #include <asm/mman.h> #include <linux/atomic.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/compat.h> #include <linux/rculist.h> #include <net/busy_poll.h> /* * LOCKING: * There are three level of locking required by epoll : * * 1) epnested_mutex (mutex) * 2) ep->mtx (mutex) * 3) ep->lock (rwlock) * * The acquire order is the one listed above, from 1 to 3. * We need a rwlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need * a spinlock. During the event transfer loop (from kernel to * user space) we could end up sleeping due a copy_to_user(), so * we need a lock that will allow us to sleep. This lock is a * mutex (ep->mtx). It is acquired during the event transfer loop, * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). * The epnested_mutex is acquired when inserting an epoll fd onto another * epoll fd. We do this so that we walk the epoll tree and ensure that this * insertion does not create a cycle of epoll file descriptors, which * could lead to deadlock. We need a global mutex to prevent two * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. * It is necessary to acquire multiple "ep->mtx"es at once in the * case when one epoll fd is added to another. In this case, we * always acquire the locks in the order of nesting (i.e. after * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired * before e2->mtx). Since we disallow cycles of epoll file * descriptors, this ensures that the mutexes are well-ordered. In * order to communicate this nesting to lockdep, when walking a tree * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epnested_mutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epnested_mutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee * a better scalability. */ /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT) #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \ EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) #define EP_UNACTIVE_PTR ((void *) -1L) #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) struct epoll_filefd { struct file *file; int fd; } __packed; /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ struct eppoll_entry *next; /* The "base" pointer is set to the container "struct epitem" */ struct epitem *base; /* * Wait queue item that will be linked to the target file wait * queue head. */ wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; }; /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. * Avoid increasing the size of this struct, there can be many thousands * of these on a server and we do not want this to take another cache line. */ struct epitem { union { /* RB tree node links this structure to the eventpoll RB tree */ struct rb_node rbn; /* Used to free the struct epitem */ struct rcu_head rcu; }; /* List header used to link this structure to the eventpoll ready list */ struct list_head rdllink; /* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. */ struct epitem *next; /* The file descriptor information this item refers to */ struct epoll_filefd ffd; /* * Protected by file->f_lock, true for to-be-released epitem already * removed from the "struct file" items list; together with * eventpoll->refcount orchestrates "struct eventpoll" disposal */ bool dying; /* List containing poll wait queues */ struct eppoll_entry *pwqlist; /* The "container" of this item */ struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ struct hlist_node fllink; /* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws; /* The structure that describe the interested events and the source fd */ struct epoll_event event; }; /* * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. */ struct eventpoll { /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations. */ struct mutex mtx; /* Wait queue used by sys_epoll_wait() */ wait_queue_head_t wq; /* Wait queue used by file->poll() */ wait_queue_head_t poll_wait; /* List of ready file descriptors */ struct list_head rdllist; /* Lock which protects rdllist and ovflist */ rwlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out * holding ->lock. */ struct epitem *ovflist; /* wakeup_source used when ep_scan_ready_list is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ struct user_struct *user; struct file *file; /* used to optimize loop detection check */ u64 gen; struct hlist_head refs; /* * usage count, used together with epitem->dying to * orchestrate the disposal of this struct */ refcount_t refcount; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC /* tracks wakeup nests for lockdep validation */ u8 nests; #endif }; /* Wrapper struct used by poll queueing */ struct ep_pqueue { poll_table pt; struct epitem *epi; }; /* * Configuration options available inside /proc/sys/fs/epoll/ */ /* Maximum number of epoll watched descriptors, per user */ static long max_user_watches __read_mostly; /* Used for cycles detection */ static DEFINE_MUTEX(epnested_mutex); static u64 loop_check_gen = 0; /* Used to check for epoll file descriptor inclusion loops */ static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __ro_after_init; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __ro_after_init; /* * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epnested_mutex. */ struct epitems_head { struct hlist_head epitems; struct epitems_head *next; }; static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; static struct kmem_cache *ephead_cache __ro_after_init; static inline void free_ephead(struct epitems_head *head) { if (head) kmem_cache_free(ephead_cache, head); } static void list_file(struct file *file) { struct epitems_head *head; head = container_of(file->f_ep, struct epitems_head, epitems); if (!head->next) { head->next = tfile_check_list; tfile_check_list = head; } } static void unlist_file(struct epitems_head *head) { struct epitems_head *to_free = head; struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems)); if (p) { struct epitem *epi= container_of(p, struct epitem, fllink); spin_lock(&epi->ffd.file->f_lock); if (!hlist_empty(&head->epitems)) to_free = NULL; head->next = NULL; spin_unlock(&epi->ffd.file->f_lock); } free_ephead(to_free); } #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static long long_zero; static long long_max = LONG_MAX; static struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, .maxlen = sizeof(max_user_watches), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &long_zero, .extra2 = &long_max, }, }; static void __init epoll_sysctls_init(void) { register_sysctl("fs/epoll", epoll_table); } #else #define epoll_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static const struct file_operations eventpoll_fops; static inline int is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; } /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, struct file *file, int fd) { ffd->file = file; ffd->fd = fd; } /* Compare RB tree keys */ static inline int ep_cmp_ffd(struct epoll_filefd *p1, struct epoll_filefd *p2) { return (p1->file > p2->file ? +1: (p1->file < p2->file ? -1 : p1->fd - p2->fd)); } /* Tells us if the item is currently linked */ static inline int ep_is_linked(struct epitem *epi) { return !list_empty(&epi->rdllink); } static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } /** * ep_events_available - Checks if ready events might be available. * * @ep: Pointer to the eventpoll context. * * Return: a value different than %zero if ready events are available, * or %zero otherwise. */ static inline int ep_events_available(struct eventpoll *ep) { return !list_empty_careful(&ep->rdllist) || READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; } #ifdef CONFIG_NET_RX_BUSY_POLL static bool ep_busy_loop_end(void *p, unsigned long start_time) { struct eventpoll *ep = p; return ep_events_available(ep) || busy_loop_timeout(start_time); } /* * Busy poll if globally on and supporting sockets found && no events, * busy loop will return if need_resched or ep_events_available. * * we must do our busy polling with irqs enabled */ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) { unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) { napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, BUSY_POLL_BUDGET); if (ep_events_available(ep)) return true; /* * Busy poll timed out. Drop NAPI ID for now, we can add * it back in when we have moved a socket with a valid NAPI * ID onto the ready list. */ ep->napi_id = 0; return false; } return false; } /* * Set epoll busy poll NAPI ID from sk. */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { struct eventpoll *ep; unsigned int napi_id; struct socket *sock; struct sock *sk; if (!net_busy_loop_on()) return; sock = sock_from_file(epi->ffd.file); if (!sock) return; sk = sock->sk; if (!sk) return; napi_id = READ_ONCE(sk->sk_napi_id); ep = epi->ep; /* Non-NAPI IDs can be rejected * or * Nothing to do if we already have this ID */ if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) return; /* record NAPI ID for use in next busy poll */ ep->napi_id = napi_id; } #else static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) { return false; } static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { } #endif /* CONFIG_NET_RX_BUSY_POLL */ /* * As described in commit 0ccf831cb lockdep: annotate epoll * the use of wait queues used by epoll is done in a very controlled * manner. Wake ups can nest inside each other, but are never done * with the same locking. For example: * * dfd = socket(...); * efd1 = epoll_create(); * efd2 = epoll_create(); * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); * * When a packet arrives to the device underneath "dfd", the net code will * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a * callback wakeup entry on that queue, and the wake_up() performed by the * "dfd" net code will end up in ep_poll_callback(). At this point epoll * (efd1) notices that it may have some event ready, so it needs to wake up * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() * that ends up in another wake_up(), after having checked about the * recursion constraints. That are, no more than EP_MAX_NESTS, to avoid * stack blasting. * * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, unsigned pollflags) { struct eventpoll *ep_src; unsigned long flags; u8 nests = 0; /* * To set the subclass or nesting level for spin_lock_irqsave_nested() * it might be natural to create a per-cpu nest count. However, since * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can * schedule() in the -rt kernel, the per-cpu variable are no longer * protected. Thus, we are introducing a per eventpoll nest field. * If we are not being call from ep_poll_callback(), epi is NULL and * we are at the first level of nesting, 0. Otherwise, we are being * called from ep_poll_callback() and if a previous wakeup source is * not an epoll file itself, we are at depth 1 since the wakeup source * is depth 0. If the wakeup source is a previous epoll file in the * wakeup chain then we use its nests value and record ours as * nests + 1. The previous epoll file nests value is stable since its * already holding its own poll_wait.lock. */ if (epi) { if ((is_file_epoll(epi->ffd.file))) { ep_src = epi->ffd.file->private_data; nests = ep_src->nests; } else { nests = 1; } } spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); ep->nests = nests + 1; wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); ep->nests = 0; spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, __poll_t pollflags) { wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } #endif static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; rcu_read_lock(); /* * If it is cleared by POLLFREE, it should be rcu-safe. * If we read NULL we need a barrier paired with * smp_store_release() in ep_poll_callback(), otherwise * we rely on whead->lock. */ whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); } /* * This function unregisters poll callbacks from the associated file * descriptor. Must be called with "mtx" held. */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { struct eppoll_entry **p = &epi->pwqlist; struct eppoll_entry *pwq; while ((pwq = *p) != NULL) { *p = pwq->next; ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } } /* call only when ep->mtx is held */ static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) { return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); } /* call only when ep->mtx is held */ static inline void ep_pm_stay_awake(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); if (ws) __pm_stay_awake(ws); } static inline bool ep_has_wakeup_source(struct epitem *epi) { return rcu_access_pointer(epi->ws) ? true : false; } /* call when ep->mtx cannot be held (ep_poll_callback) */ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) { struct wakeup_source *ws; rcu_read_lock(); ws = rcu_dereference(epi->ws); if (ws) __pm_stay_awake(ws); rcu_read_unlock(); } /* * ep->mutex needs to be held because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) { /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events * happening while looping w/out locks, are not lost. We cannot * have the poll callback to queue directly on ep->rdllist, * because we want the "sproc" callback to be able to do it * in a lockless way. */ lockdep_assert_irqs_enabled(); write_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); write_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, struct list_head *txlist) { struct epitem *epi, *nepi; write_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. * During the "sproc" callback execution time, items are * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ if (!ep_is_linked(epi)) { /* * ->ovflist is LIFO, so we have to reverse it in order * to keep in FIFO. */ list_add(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); /* * Quickly re-inject items left on "txlist". */ list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); } write_unlock_irq(&ep->lock); } static void epi_rcu_free(struct rcu_head *head) { struct epitem *epi = container_of(head, struct epitem, rcu); kmem_cache_free(epi_cache, epi); } static void ep_get(struct eventpoll *ep) { refcount_inc(&ep->refcount); } /* * Returns true if the event poll can be disposed */ static bool ep_refcount_dec_and_test(struct eventpoll *ep) { if (!refcount_dec_and_test(&ep->refcount)) return false; WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root)); return true; } static void ep_free(struct eventpoll *ep) { mutex_destroy(&ep->mtx); free_uid(ep->user); wakeup_source_unregister(ep->ws); kfree(ep); } /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. * If the dying flag is set, do the removal only if force is true. * This prevents ep_clear_and_put() from dropping all the ep references * while running concurrently with eventpoll_release_file(). * Returns true if the eventpoll can be disposed. */ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) { struct file *file = epi->ffd.file; struct epitems_head *to_free; struct hlist_head *head; lockdep_assert_irqs_enabled(); /* * Removes poll wait queue hooks. */ ep_unregister_pollwait(ep, epi); /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); if (epi->dying && !force) { spin_unlock(&file->f_lock); return false; } to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { file->f_ep = NULL; if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); if (!smp_load_acquire(&v->next)) to_free = v; } } hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); free_ephead(to_free); rb_erase_cached(&epi->rbn, &ep->rbr); write_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); write_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* * At this point it is safe to free the eventpoll item. Use the union * field epi->rcu, since we are trying to minimize the size of * 'struct epitem'. The 'rbn' field is no longer in use. Protected by * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. */ call_rcu(&epi->rcu, epi_rcu_free); percpu_counter_dec(&ep->user->epoll_watches); return ep_refcount_dec_and_test(ep); } /* * ep_remove variant for callers owing an additional reference to the ep */ static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) { WARN_ON_ONCE(__ep_remove(ep, epi, false)); } static void ep_clear_and_put(struct eventpoll *ep) { struct rb_node *rbp, *next; struct epitem *epi; bool dispose; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) ep_poll_safewake(ep, NULL, 0); mutex_lock(&ep->mtx); /* * Walks through the whole tree by unregistering poll callbacks. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); cond_resched(); } /* * Walks through the whole tree and try to free each "struct epitem". * Note that ep_remove_safe() will not remove the epitem in case of a * racing eventpoll_release_file(); the latter will do the removal. * At this point we are sure no poll callbacks will be lingering around. * Since we still own a reference to the eventpoll struct, the loop can't * dispose it. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) { next = rb_next(rbp); epi = rb_entry(rbp, struct epitem, rbn); ep_remove_safe(ep, epi); cond_resched(); } dispose = ep_refcount_dec_and_test(ep); mutex_unlock(&ep->mtx); if (dispose) ep_free(ep); } static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; if (ep) ep_clear_and_put(ep); return 0; } static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth); static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) { struct eventpoll *ep = file->private_data; LIST_HEAD(txlist); struct epitem *epi, *tmp; poll_table pt; __poll_t res = 0; init_poll_funcptr(&pt, NULL); /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside * the ready list. */ mutex_lock_nested(&ep->mtx, depth); ep_start_scan(ep, &txlist); list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { if (ep_item_poll(epi, &pt, depth + 1)) { res = EPOLLIN | EPOLLRDNORM; break; } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ __pm_relax(ep_wakeup_source(epi)); list_del_init(&epi->rdllink); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } /* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() * is correctly annotated. */ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth) { struct file *file = epi->ffd.file; __poll_t res; pt->_key = epi->event.events; if (!is_file_epoll(file)) res = vfs_poll(file, pt); else res = __ep_eventpoll_poll(file, pt, depth); return res & epi->event.events; } static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { return __ep_eventpoll_poll(file, wait, 0); } #ifdef CONFIG_PROC_FS static void ep_show_fdinfo(struct seq_file *m, struct file *f) { struct eventpoll *ep = f->private_data; struct rb_node *rbp; mutex_lock(&ep->mtx); for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); struct inode *inode = file_inode(epi->ffd.file); seq_printf(m, "tfd: %8d events: %8x data: %16llx " " pos:%lli ino:%lx sdev:%x\n", epi->ffd.fd, epi->event.events, (long long)epi->event.data, (long long)epi->ffd.file->f_pos, inode->i_ino, inode->i_sb->s_dev); if (seq_has_overflowed(m)) break; } mutex_unlock(&ep->mtx); } #endif /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = ep_show_fdinfo, #endif .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, }; /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are * closed without being removed from the eventpoll interface. */ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; bool dispose; /* * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from * touching the epitems list before eventpoll_release_file() can access * the ep->mtx. */ again: spin_lock(&file->f_lock); if (file->f_ep && file->f_ep->first) { epi = hlist_entry(file->f_ep->first, struct epitem, fllink); epi->dying = true; spin_unlock(&file->f_lock); /* * ep access is safe as we still own a reference to the ep * struct */ ep = epi->ep; mutex_lock(&ep->mtx); dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); if (dispose) ep_free(ep); goto again; } spin_unlock(&file->f_lock); } static int ep_alloc(struct eventpoll **pep) { struct eventpoll *ep; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) return -ENOMEM; mutex_init(&ep->mtx); rwlock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = get_current_user(); refcount_set(&ep->refcount, 1); *pep = ep; return 0; } /* * Search the file inside the eventpoll tree. The RB tree operations * are protected by the "mtx" mutex, and ep_find() must be called with * "mtx" held. */ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; struct rb_node *rbp; struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; ep_set_ffd(&ffd, file, fd); for (rbp = ep->rbr.rb_root.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) rbp = rbp->rb_left; else { epir = epi; break; } } return epir; } #ifdef CONFIG_KCMP static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) { struct rb_node *rbp; struct epitem *epi; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (epi->ffd.fd == tfd) { if (toff == 0) return epi; else toff--; } cond_resched(); } return NULL; } struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff) { struct file *file_raw; struct eventpoll *ep; struct epitem *epi; if (!is_file_epoll(file)) return ERR_PTR(-EINVAL); ep = file->private_data; mutex_lock(&ep->mtx); epi = ep_find_tfd(ep, tfd, toff); if (epi) file_raw = epi->ffd.file; else file_raw = ERR_PTR(-ENOENT); mutex_unlock(&ep->mtx); return file_raw; } #endif /* CONFIG_KCMP */ /* * Adds a new entry to the tail of the list in a lockless way, i.e. * multiple CPUs are allowed to call this function concurrently. * * Beware: it is necessary to prevent any other modifications of the * existing list until all changes are completed, in other words * concurrent list_add_tail_lockless() calls should be protected * with a read lock, where write lock acts as a barrier which * makes sure all list_add_tail_lockless() calls are fully * completed. * * Also an element can be locklessly added to the list only in one * direction i.e. either to the tail or to the head, otherwise * concurrent access will corrupt the list. * * Return: %false if element has been already added to the list, %true * otherwise. */ static inline bool list_add_tail_lockless(struct list_head *new, struct list_head *head) { struct list_head *prev; /* * This is simple 'new->next = head' operation, but cmpxchg() * is used in order to detect that same element has been just * added to the list from another CPU: the winner observes * new->next == new. */ if (!try_cmpxchg(&new->next, &new, head)) return false; /* * Initially ->next of a new element must be updated with the head * (we are inserting to the tail) and only then pointers are atomically * exchanged. XCHG guarantees memory ordering, thus ->next should be * updated before pointers are actually swapped and pointers are * swapped before prev->next is updated. */ prev = xchg(&head->prev, new); /* * It is safe to modify prev->next and new->prev, because a new element * is added only to the tail and new->next is updated before XCHG. */ prev->next = new; new->prev = prev; return true; } /* * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, * i.e. multiple CPUs are allowed to call this function concurrently. * * Return: %false if epi element has been already chained, %true otherwise. */ static inline bool chain_epi_lockless(struct epitem *epi) { struct eventpoll *ep = epi->ep; /* Fast preliminary check */ if (epi->next != EP_UNACTIVE_PTR) return false; /* Check that the same epi has not been just chained from another CPU */ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) return false; /* Atomically exchange tail */ epi->next = xchg(&ep->ovflist, epi); return true; } /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. * * This callback takes a read lock in order not to contend with concurrent * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from * ep_scan_ready_list(), which stops all list modifications and guarantees * that lists state is seen correctly. * * Another thing worth to mention is that ep_poll_callback() can be called * concurrently for the same @epi from different CPUs if poll table was inited * with several wait queues entries. Plural wakeup from different CPUs of a * single wait queue is serialized by wq.lock, but the case when multiple wait * queues are used should be detected accordingly. This is detected using * cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; __poll_t pollflags = key_to_poll(key); unsigned long flags; int ewake = 0; read_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the * EPOLLONESHOT bit that disables the descriptor when an event is received, * until the next EPOLL_CTL_MOD will be issued. */ if (!(epi->event.events & ~EP_PRIVATE_BITS)) goto out_unlock; /* * Check the events coming with the callback. At this stage, not * every device reports the events in the "key" parameter of the * callback. We need to be able to handle both cases here, hence the * test for "key" != NULL before the event match test. */ if (pollflags && !(pollflags & epi->event.events)) goto out_unlock; /* * If we are transferring events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { if (chain_epi_lockless(epi)) ep_pm_stay_awake_rcu(epi); } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. */ if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) ep_pm_stay_awake_rcu(epi); } /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ if (waitqueue_active(&ep->wq)) { if ((epi->event.events & EPOLLEXCLUSIVE) && !(pollflags & POLLFREE)) { switch (pollflags & EPOLLINOUT_BITS) { case EPOLLIN: if (epi->event.events & EPOLLIN) ewake = 1; break; case EPOLLOUT: if (epi->event.events & EPOLLOUT) ewake = 1; break; case 0: ewake = 1; break; } } wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; out_unlock: read_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; if (pollflags & POLLFREE) { /* * If we race with ep_remove_wait_queue() it can miss * ->whead = NULL and do another remove_wait_queue() after * us, so we can't use __remove_wait_queue(). */ list_del_init(&wait->entry); /* * ->whead != NULL protects us from the race with * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue() * takes whead->lock held by the caller. Once we nullify it, * nothing protects ep/epi or even wait. */ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); } return ewake; } /* * This is the callback that is used to add our wait queue to the * target file wakeup lists. */ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt); struct epitem *epi = epq->epi; struct eppoll_entry *pwq; if (unlikely(!epi)) // an earlier allocation has failed return; pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL); if (unlikely(!pwq)) { epq->epi = NULL; return; } init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; if (epi->event.events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(whead, &pwq->wait); else add_wait_queue(whead, &pwq->wait); pwq->next = epi->pwqlist; epi->pwqlist = pwq; } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) { int kcmp; struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL; struct epitem *epic; bool leftmost = true; while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); if (kcmp > 0) { p = &parent->rb_right; leftmost = false; } else p = &parent->rb_left; } rb_link_node(&epi->rbn, parent, p); rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost); } #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate * from a single file of interest. For example, we allow 1000 paths of length * 1, to emanate from each file of interest. This essentially represents the * potential wakeup paths, which need to be limited in order to avoid massive * uncontrolled wakeup storms. The common use case should be a single ep which * is connected to n file sources. In this case each file source has 1 path * of length 1. Thus, the numbers below should be more than sufficient. These * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify * and delete can't add additional paths. Protected by the epnested_mutex. */ static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; static int path_count[PATH_ARR_SIZE]; static int path_count_inc(int nests) { /* Allow an arbitrary number of depth 1 paths */ if (nests == 0) return 0; if (++path_count[nests] > path_limits[nests]) return -1; return 0; } static void path_count_init(void) { int i; for (i = 0; i < PATH_ARR_SIZE; i++) path_count[i] = 0; } static int reverse_path_check_proc(struct hlist_head *refs, int depth) { int error = 0; struct epitem *epi; if (depth > EP_MAX_NESTS) /* too deep nesting */ return -1; /* CTL_DEL can remove links here, but that can't increase our count */ hlist_for_each_entry_rcu(epi, refs, fllink) { struct hlist_head *refs = &epi->ep->refs; if (hlist_empty(refs)) error = path_count_inc(depth); else error = reverse_path_check_proc(refs, depth + 1); if (error != 0) break; } return error; } /** * reverse_path_check - The tfile_check_list is list of epitem_head, which have * links that are proposed to be newly added. We need to * make sure that those added links don't add too many * paths such that we will spend all our time waking up * eventpoll objects. * * Return: %zero if the proposed links don't create too many paths, * %-1 otherwise. */ static int reverse_path_check(void) { struct epitems_head *p; for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) { int error; path_count_init(); rcu_read_lock(); error = reverse_path_check_proc(&p->epitems, 0); rcu_read_unlock(); if (error) return error; } return 0; } static int ep_create_wakeup_source(struct epitem *epi) { struct name_snapshot n; struct wakeup_source *ws; if (!epi->ep->ws) { epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); if (!epi->ep->ws) return -ENOMEM; } take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); ws = wakeup_source_register(NULL, n.name.name); release_dentry_name_snapshot(&n); if (!ws) return -ENOMEM; rcu_assign_pointer(epi->ws, ws); return 0; } /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ static noinline void ep_destroy_wakeup_source(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); RCU_INIT_POINTER(epi->ws, NULL); /* * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is * used internally by wakeup_source_remove, too (called by * wakeup_source_unregister), so we cannot use call_rcu */ synchronize_rcu(); wakeup_source_unregister(ws); } static int attach_epitem(struct file *file, struct epitem *epi) { struct epitems_head *to_free = NULL; struct hlist_head *head = NULL; struct eventpoll *ep = NULL; if (is_file_epoll(file)) ep = file->private_data; if (ep) { head = &ep->refs; } else if (!READ_ONCE(file->f_ep)) { allocate: to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL); if (!to_free) return -ENOMEM; head = &to_free->epitems; } spin_lock(&file->f_lock); if (!file->f_ep) { if (unlikely(!head)) { spin_unlock(&file->f_lock); goto allocate; } file->f_ep = head; to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); spin_unlock(&file->f_lock); free_ephead(to_free); return 0; } /* * Must be called with "mtx" held. */ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, struct file *tfile, int fd, int full_check) { int error, pwake = 0; __poll_t revents; struct epitem *epi; struct ep_pqueue epq; struct eventpoll *tep = NULL; if (is_file_epoll(tfile)) tep = tfile->private_data; lockdep_assert_irqs_enabled(); if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, max_user_watches) >= 0)) return -ENOSPC; percpu_counter_inc(&ep->user->epoll_watches); if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->next = EP_UNACTIVE_PTR; if (tep) mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ if (unlikely(attach_epitem(tfile, epi) < 0)) { if (tep) mutex_unlock(&tep->mtx); kmem_cache_free(epi_cache, epi); percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } if (full_check && !tep) list_file(tfile); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. */ ep_rbtree_insert(ep, epi); if (tep) mutex_unlock(&tep->mtx); /* * ep_remove_safe() calls in the later error paths can't lead to * ep_free() as the ep file itself still holds an ep reference. */ ep_get(ep); /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { ep_remove_safe(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { ep_remove_safe(ep, epi); return error; } } /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has * been increased by the caller of this function. Note that after * this operation completes, the poll callback can start hitting * the new item. */ revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ if (unlikely(!epq.epi)) { ep_remove_safe(ep, epi); return -ENOMEM; } /* We have to drop the new item inside our item list to keep track of it */ write_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); /* If the file is already "ready" we drop it inside the ready list */ if (revents && !ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } write_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } /* * Modify the interest event mask by dropping an event if the new mask * has a match in the current file status. Must be called with "mtx" held. */ static int ep_modify(struct eventpoll *ep, struct epitem *epi, const struct epoll_event *event) { int pwake = 0; poll_table pt; lockdep_assert_irqs_enabled(); init_poll_funcptr(&pt, NULL); /* * Set the new event interest mask before calling f_op->poll(); * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; /* need barrier below */ epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } /* * The following barrier has two effects: * * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). * We need this because we did not take ep->lock while * changing epi above (but ep_poll_callback does take * ep->lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also * pairs with the barrier in wq_has_sleeper (see * comments for wq_has_sleeper). * * This barrier will now guarantee ep_poll_callback or f_op->poll * (or both) will notice the readiness of an item. */ smp_mb(); /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { write_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } write_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { struct epitem *epi, *tmp; LIST_HEAD(txlist); poll_table pt; int res = 0; /* * Always short-circuit for fatal signals to allow threads to make a * timely exit without the chance of finding more events available and * fetching repeatedly. */ if (fatal_signal_pending(current)) return -EINTR; init_poll_funcptr(&pt, NULL); mutex_lock(&ep->mtx); ep_start_scan(ep, &txlist); /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop we are holding ep->mtx. */ list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { struct wakeup_source *ws; __poll_t revents; if (res >= maxevents) break; /* * Activate ep->ws before deactivating epi->ws to prevent * triggering auto-suspend here (in case we reactive epi->ws * below). * * This could be rearranged to delay the deactivation of epi->ws * instead, but then epi->ws would temporarily be out of sync * with ep_is_linked(). */ ws = ep_wakeup_source(epi); if (ws) { if (ws->active) __pm_stay_awake(ep->ws); __pm_relax(ws); } list_del_init(&epi->rdllink); /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, we are holding ep->mtx, * so no operations coming from userspace can change the item. */ revents = ep_item_poll(epi, &pt, 1); if (!revents) continue; events = epoll_put_uevent(revents, epi->event.data, events); if (!events) { list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); if (!res) res = -EFAULT; break; } res++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { /* * If this file has been added with Level * Trigger mode, we need to insert back inside * the ready list, so that the next call to * epoll_wait() will check again the events * availability. At this point, no one can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by * ep_scan_ready_list() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) { struct timespec64 now; if (ms < 0) return NULL; if (!ms) { to->tv_sec = 0; to->tv_nsec = 0; return to; } to->tv_sec = ms / MSEC_PER_SEC; to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC); ktime_get_ts64(&now); *to = timespec64_add_safe(now, *to); return to; } /* * autoremove_wake_function, but remove even on failure to wake up, because we * know that default_wake_function/ttwu will only fail if the thread is already * woken, and in that case the ep_poll loop will remove the entry anyways, not * try to reuse it. */ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wq_entry, mode, sync, key); /* * Pairs with list_empty_careful in ep_poll, and ensures future loop * iterations see the cause of this wakeup. */ list_del_init_careful(&wq_entry->entry); return ret; } /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. * * @ep: Pointer to the eventpoll context. * @events: Pointer to the userspace buffer where the ready events should be * stored. * @maxevents: Size (in terms of number of events) of the caller event buffer. * @timeout: Maximum timeout for the ready events fetch operation, in * timespec. If the timeout is zero, the function will not block, * while if the @timeout ptr is NULL, the function will block * until at least one event has been retrieved (or an error * occurred). * * Return: the number of ready events which have been fetched, or an * error code, in case of error. */ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout) { int res, eavail, timed_out = 0; u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; lockdep_assert_irqs_enabled(); if (timeout && (timeout->tv_sec | timeout->tv_nsec)) { slack = select_estimate_accuracy(timeout); to = &expires; *to = timespec64_to_ktime(*timeout); } else if (timeout) { /* * Avoid the unnecessary trip to the wait queue loop, if the * caller specified a non blocking operation. */ timed_out = 1; } /* * This call is racy: We may or may not see events that are being added * to the ready list under the lock (e.g., in IRQ callbacks). For cases * with a non-zero timeout, this thread will check the ready list under * lock and will add to the wait queue. For cases with a zero * timeout, the user by definition should not care and will have to * recheck again. */ eavail = ep_events_available(ep); while (1) { if (eavail) { /* * Try to transfer events to user space. In case we get * 0 events and there's still timeout left over, we go * trying again in search of more luck. */ res = ep_send_events(ep, events, maxevents); if (res) return res; } if (timed_out) return 0; eavail = ep_busy_loop(ep, timed_out); if (eavail) continue; if (signal_pending(current)) return -EINTR; /* * Internally init_wait() uses autoremove_wake_function(), * thus wait entry is removed from the wait queue on each * wakeup. Why it is important? In case of several waiters * each new wakeup will hit the next waiter, giving it the * chance to harvest new event. Otherwise wakeup can be * lost. This is also good performance-wise, because on * normal wakeup path no need to call __remove_wait_queue() * explicitly, thus ep->lock is not taken, which halts the * event delivery. * * In fact, we now use an even more aggressive function that * unconditionally removes, because we don't reuse the wait * entry between loop iterations. This lets us also avoid the * performance issue if a process is killed, causing all of its * threads to wake up without being removed normally. */ init_wait(&wait); wait.func = ep_autoremove_wake_function; write_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it * is safe to avoid an explicit barrier. */ __set_current_state(TASK_INTERRUPTIBLE); /* * Do the final check under the lock. ep_scan_ready_list() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is * important. */ eavail = ep_events_available(ep); if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); write_unlock_irq(&ep->lock); if (!eavail) timed_out = !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * We were woken up, thus go and try to harvest some events. * If timed out and still on the wait queue, recheck eavail * carefully under lock, below. */ eavail = 1; if (!list_empty_careful(&wait.entry)) { write_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its * timeout expired before it could reacquire the lock. * Thus, when wait.entry is empty, it needs to harvest * events. */ if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); write_unlock_irq(&ep->lock); } } } /** * ep_loop_check_proc - verify that adding an epoll file inside another * epoll structure does not violate the constraints, in * terms of closed loops, or too deep chains (which can * result in excessive stack usage). * * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * * Return: %zero if adding the epoll @file inside current epoll * structure @ep does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { int error = 0; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) error = -1; else error = ep_loop_check_proc(ep_tovisit, depth + 1); if (error != 0) break; } else { /* * If we've reached a file that is not associated with * an ep, then we need to check if the newly added * links are going to add too many wakeup paths. We do * this by adding it to the tfile_check_list, if it's * not already there, and calling reverse_path_check() * during ep_insert(). */ list_file(epi->ffd.file); } } mutex_unlock(&ep->mtx); return error; } /** * ep_loop_check - Performs a check to verify that adding an epoll file (@to) * into another epoll file (represented by @ep) does not create * closed loops or too deep chains. * * @ep: Pointer to the epoll we are inserting into. * @to: Pointer to the epoll to be inserted. * * Return: %zero if adding the epoll @to inside the epoll @from * does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { inserting_into = ep; return ep_loop_check_proc(to, 0); } static void clear_tfile_check_list(void) { rcu_read_lock(); while (tfile_check_list != EP_UNACTIVE_PTR) { struct epitems_head *head = tfile_check_list; tfile_check_list = head->next; unlist_file(head); } rcu_read_unlock(); } /* * Open an eventpoll file descriptor. */ static int do_epoll_create(int flags) { int error, fd; struct eventpoll *ep = NULL; struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); if (flags & ~EPOLL_CLOEXEC) return -EINVAL; /* * Create the internal data structure ("struct eventpoll"). */ error = ep_alloc(&ep); if (error < 0) return error; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); if (fd < 0) { error = fd; goto out_free_ep; } file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); if (IS_ERR(file)) { error = PTR_ERR(file); goto out_free_fd; } ep->file = file; fd_install(fd, file); return fd; out_free_fd: put_unused_fd(fd); out_free_ep: ep_clear_and_put(ep); return error; } SYSCALL_DEFINE1(epoll_create1, int, flags) { return do_epoll_create(flags); } SYSCALL_DEFINE1(epoll_create, int, size) { if (size <= 0) return -EINVAL; return do_epoll_create(0); } #ifdef CONFIG_PM_SLEEP static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) epev->events &= ~EPOLLWAKEUP; } #else static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { epev->events &= ~EPOLLWAKEUP; } #endif static inline int epoll_mutex_lock(struct mutex *mutex, int depth, bool nonblock) { if (!nonblock) { mutex_lock_nested(mutex, depth); return 0; } if (mutex_trylock(mutex)) return 0; return -EAGAIN; } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock) { int error; int full_check = 0; struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct eventpoll *tep = NULL; error = -EBADF; f = fdget(epfd); if (!f.file) goto error_return; /* Get the "struct file *" for the target file */ tf = fdget(fd); if (!tf.file) goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; if (!file_can_poll(tf.file)) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit * adding an epoll file descriptor inside itself. */ error = -EINVAL; if (f.file == tf.file || !is_file_epoll(f.file)) goto error_tgt_fput; /* * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = f.file->private_data; /* * When we insert an epoll file descriptor inside another epoll file * descriptor, there is the chance of creating closed loops, which are * better be handled here, than in more critical paths. While we are * checking for loops we also determine the list of files reachable * and hang them on the tfile_check_list, so we can check that we * haven't created too many possible wakeup paths. * * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when * the epoll file descriptor is attaching directly to a wakeup source, * unless the epoll file descriptor is nested. The purpose of taking the * 'epnested_mutex' on add is to prevent complex toplogies such as loops and * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen || is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epnested_mutex, 0, nonblock); if (error) goto error_tgt_fput; loop_check_gen++; full_check = 1; if (is_file_epoll(tf.file)) { tep = tf.file->private_data; error = -ELOOP; if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; } } /* * Try to lookup the file inside our RB tree. Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ epi = ep_find(ep, tf.file, fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_insert(ep, epds, tf.file, fd, full_check); } else error = -EEXIST; break; case EPOLL_CTL_DEL: if (epi) { /* * The eventpoll itself is still alive: the refcount * can't go to zero here. */ ep_remove_safe(ep, epi); error = 0; } else { error = -ENOENT; } break; case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_modify(ep, epi, epds); } } else error = -ENOENT; break; } mutex_unlock(&ep->mtx); error_tgt_fput: if (full_check) { clear_tfile_check_list(); loop_check_gen++; mutex_unlock(&epnested_mutex); } fdput(tf); error_fput: fdput(f); error_return: return error; } /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { struct epoll_event epds; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) return -EFAULT; return do_epoll_ctl(epfd, op, fd, &epds, false); } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { int error; struct fd f; struct eventpoll *ep; /* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL; /* Verify that the area passed by the user is writeable */ if (!access_ok(events, maxevents * sizeof(struct epoll_event))) return -EFAULT; /* Get the "struct file *" for the eventpoll file */ f = fdget(epfd); if (!f.file) return -EBADF; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ error = -EINVAL; if (!is_file_epoll(f.file)) goto error_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = f.file->private_data; /* Time to fish for events ... */ error = ep_poll(ep, events, maxevents, to); error_fput: fdput(f); return error; } SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout) { struct timespec64 to; return do_epoll_wait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout)); } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_pwait(2). */ static int do_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to, const sigset_t __user *sigmask, size_t sigsetsize) { int error; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ error = set_user_sigmask(sigmask, sigsetsize); if (error) return error; error = do_epoll_wait(epfd, events, maxevents, to); restore_saved_sigmask_unless(error == -EINTR); return error; } SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 to; return do_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #ifdef CONFIG_COMPAT static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { long err; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ err = set_compat_user_sigmask(sigmask, sigsetsize); if (err) return err; err = do_epoll_wait(epfd, events, maxevents, timeout); restore_saved_sigmask_unless(err == -EINTR); return err; } COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 to; return do_compat_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_compat_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #endif static int __init eventpoll_init(void) { struct sysinfo si; si_meminfo(&si); /* * Allows top 4% of lomem to be allocated for epoll watches (per user). */ max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; BUG_ON(max_user_watches < 0); /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs */ BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); epoll_sysctls_init(); ephead_cache = kmem_cache_create("ep_head", sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } fs_initcall(eventpoll_init); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * acpi_bus.h - ACPI Bus Driver ($Revision: 22 $) * * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #ifndef __ACPI_BUS_H__ #define __ACPI_BUS_H__ #include <linux/device.h> #include <linux/property.h> struct acpi_handle_list { u32 count; acpi_handle *handles; }; /* acpi_utils.h */ acpi_status acpi_extract_package(union acpi_object *package, struct acpi_buffer *format, struct acpi_buffer *buffer); acpi_status acpi_evaluate_integer(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, unsigned long long *data); bool acpi_evaluate_reference(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, struct acpi_handle_list *list); bool acpi_handle_list_equal(struct acpi_handle_list *list1, struct acpi_handle_list *list2); void acpi_handle_list_replace(struct acpi_handle_list *dst, struct acpi_handle_list *src); void acpi_handle_list_free(struct acpi_handle_list *list); bool acpi_device_dep(acpi_handle target, acpi_handle match); acpi_status acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code, struct acpi_buffer *status_buf); acpi_status acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld); bool acpi_has_method(acpi_handle handle, char *name); acpi_status acpi_execute_simple_method(acpi_handle handle, char *method, u64 arg); acpi_status acpi_evaluate_ej0(acpi_handle handle); acpi_status acpi_evaluate_lck(acpi_handle handle, int lock); acpi_status acpi_evaluate_reg(acpi_handle handle, u8 space_id, u32 function); bool acpi_ata_match(acpi_handle handle); bool acpi_bay_match(acpi_handle handle); bool acpi_dock_match(acpi_handle handle); bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs); union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4); #ifdef CONFIG_ACPI static inline union acpi_object * acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4, acpi_object_type type) { union acpi_object *obj; obj = acpi_evaluate_dsm(handle, guid, rev, func, argv4); if (obj && obj->type != type) { ACPI_FREE(obj); obj = NULL; } return obj; } #endif #define ACPI_INIT_DSM_ARGV4(cnt, eles) \ { \ .package.type = ACPI_TYPE_PACKAGE, \ .package.count = (cnt), \ .package.elements = (eles) \ } bool acpi_dev_found(const char *hid); bool acpi_dev_present(const char *hid, const char *uid, s64 hrv); bool acpi_reduced_hardware(void); #ifdef CONFIG_ACPI struct proc_dir_entry; #define ACPI_BUS_FILE_ROOT "acpi" extern struct proc_dir_entry *acpi_root_dir; enum acpi_bus_device_type { ACPI_BUS_TYPE_DEVICE = 0, ACPI_BUS_TYPE_POWER, ACPI_BUS_TYPE_PROCESSOR, ACPI_BUS_TYPE_THERMAL, ACPI_BUS_TYPE_POWER_BUTTON, ACPI_BUS_TYPE_SLEEP_BUTTON, ACPI_BUS_TYPE_ECDT_EC, ACPI_BUS_DEVICE_TYPE_COUNT }; struct acpi_driver; struct acpi_device; /* * ACPI Scan Handler * ----------------- */ struct acpi_hotplug_profile { struct kobject kobj; int (*scan_dependent)(struct acpi_device *adev); void (*notify_online)(struct acpi_device *adev); bool enabled:1; bool demand_offline:1; }; static inline struct acpi_hotplug_profile *to_acpi_hotplug_profile( struct kobject *kobj) { return container_of(kobj, struct acpi_hotplug_profile, kobj); } struct acpi_scan_handler { const struct acpi_device_id *ids; struct list_head list_node; bool (*match)(const char *idstr, const struct acpi_device_id **matchid); int (*attach)(struct acpi_device *dev, const struct acpi_device_id *id); void (*detach)(struct acpi_device *dev); void (*bind)(struct device *phys_dev); void (*unbind)(struct device *phys_dev); struct acpi_hotplug_profile hotplug; }; /* * ACPI Hotplug Context * -------------------- */ struct acpi_hotplug_context { struct acpi_device *self; int (*notify)(struct acpi_device *, u32); void (*uevent)(struct acpi_device *, u32); void (*fixup)(struct acpi_device *); }; /* * ACPI Driver * ----------- */ typedef int (*acpi_op_add) (struct acpi_device * device); typedef void (*acpi_op_remove) (struct acpi_device *device); typedef void (*acpi_op_notify) (struct acpi_device * device, u32 event); struct acpi_device_ops { acpi_op_add add; acpi_op_remove remove; acpi_op_notify notify; }; #define ACPI_DRIVER_ALL_NOTIFY_EVENTS 0x1 /* system AND device events */ struct acpi_driver { char name[80]; char class[80]; const struct acpi_device_id *ids; /* Supported Hardware IDs */ unsigned int flags; struct acpi_device_ops ops; struct device_driver drv; struct module *owner; }; /* * ACPI Device * ----------- */ /* Status (_STA) */ struct acpi_device_status { u32 present:1; u32 enabled:1; u32 show_in_ui:1; u32 functional:1; u32 battery_present:1; u32 reserved:27; }; /* Flags */ struct acpi_device_flags { u32 dynamic_status:1; u32 removable:1; u32 ejectable:1; u32 power_manageable:1; u32 match_driver:1; u32 initialized:1; u32 visited:1; u32 hotplug_notify:1; u32 is_dock_station:1; u32 of_compatible_ok:1; u32 coherent_dma:1; u32 cca_seen:1; u32 enumeration_by_parent:1; u32 honor_deps:1; u32 reserved:18; }; /* File System */ struct acpi_device_dir { struct proc_dir_entry *entry; }; #define acpi_device_dir(d) ((d)->dir.entry) /* Plug and Play */ typedef char acpi_bus_id[8]; typedef u64 acpi_bus_address; typedef char acpi_device_name[40]; typedef char acpi_device_class[20]; struct acpi_hardware_id { struct list_head list; const char *id; }; struct acpi_pnp_type { u32 hardware_id:1; u32 bus_address:1; u32 platform_id:1; u32 backlight:1; u32 reserved:28; }; struct acpi_device_pnp { acpi_bus_id bus_id; /* Object name */ int instance_no; /* Instance number of this object */ struct acpi_pnp_type type; /* ID type */ acpi_bus_address bus_address; /* _ADR */ char *unique_id; /* _UID */ struct list_head ids; /* _HID and _CIDs */ acpi_device_name device_name; /* Driver-determined */ acpi_device_class device_class; /* " */ union acpi_object *str_obj; /* unicode string for _STR method */ }; #define acpi_device_bid(d) ((d)->pnp.bus_id) #define acpi_device_adr(d) ((d)->pnp.bus_address) const char *acpi_device_hid(struct acpi_device *device); #define acpi_device_uid(d) ((d)->pnp.unique_id) #define acpi_device_name(d) ((d)->pnp.device_name) #define acpi_device_class(d) ((d)->pnp.device_class) /* Power Management */ struct acpi_device_power_flags { u32 explicit_get:1; /* _PSC present? */ u32 power_resources:1; /* Power resources */ u32 inrush_current:1; /* Serialize Dx->D0 */ u32 power_removed:1; /* Optimize Dx->D0 */ u32 ignore_parent:1; /* Power is independent of parent power state */ u32 dsw_present:1; /* _DSW present? */ u32 reserved:26; }; struct acpi_device_power_state { struct { u8 valid:1; u8 explicit_set:1; /* _PSx present? */ u8 reserved:6; } flags; int power; /* % Power (compared to D0) */ int latency; /* Dx->D0 time (microseconds) */ struct list_head resources; /* Power resources referenced */ }; struct acpi_device_power { int state; /* Current state */ struct acpi_device_power_flags flags; struct acpi_device_power_state states[ACPI_D_STATE_COUNT]; /* Power states (D0-D3Cold) */ u8 state_for_enumeration; /* Deepest power state for enumeration */ }; struct acpi_dep_data { struct list_head node; acpi_handle supplier; acpi_handle consumer; bool honor_dep; bool met; bool free_when_met; }; /* Performance Management */ struct acpi_device_perf_flags { u8 reserved:8; }; struct acpi_device_perf_state { struct { u8 valid:1; u8 reserved:7; } flags; u8 power; /* % Power (compared to P0) */ u8 performance; /* % Performance ( " ) */ int latency; /* Px->P0 time (microseconds) */ }; struct acpi_device_perf { int state; struct acpi_device_perf_flags flags; int state_count; struct acpi_device_perf_state *states; }; /* Wakeup Management */ struct acpi_device_wakeup_flags { u8 valid:1; /* Can successfully enable wakeup? */ u8 notifier_present:1; /* Wake-up notify handler has been installed */ }; struct acpi_device_wakeup_context { void (*func)(struct acpi_device_wakeup_context *context); struct device *dev; }; struct acpi_device_wakeup { acpi_handle gpe_device; u64 gpe_number; u64 sleep_state; struct list_head resources; struct acpi_device_wakeup_flags flags; struct acpi_device_wakeup_context context; struct wakeup_source *ws; int prepare_count; int enable_count; }; struct acpi_device_physical_node { unsigned int node_id; struct list_head node; struct device *dev; bool put_online:1; }; struct acpi_device_properties { const guid_t *guid; union acpi_object *properties; struct list_head list; void **bufs; }; /* ACPI Device Specific Data (_DSD) */ struct acpi_device_data { const union acpi_object *pointer; struct list_head properties; const union acpi_object *of_compatible; struct list_head subnodes; }; struct acpi_gpio_mapping; #define ACPI_DEVICE_SWNODE_ROOT 0 /* * The maximum expected number of CSI-2 data lanes. * * This number is not expected to ever have to be equal to or greater than the * number of bits in an unsigned long variable, but if it needs to be increased * above that limit, code will need to be adjusted accordingly. */ #define ACPI_DEVICE_CSI2_DATA_LANES 8 #define ACPI_DEVICE_SWNODE_PORT_NAME_LENGTH 8 enum acpi_device_swnode_dev_props { ACPI_DEVICE_SWNODE_DEV_ROTATION, ACPI_DEVICE_SWNODE_DEV_CLOCK_FREQUENCY, ACPI_DEVICE_SWNODE_DEV_LED_MAX_MICROAMP, ACPI_DEVICE_SWNODE_DEV_FLASH_MAX_MICROAMP, ACPI_DEVICE_SWNODE_DEV_FLASH_MAX_TIMEOUT_US, ACPI_DEVICE_SWNODE_DEV_NUM_OF, ACPI_DEVICE_SWNODE_DEV_NUM_ENTRIES }; enum acpi_device_swnode_port_props { ACPI_DEVICE_SWNODE_PORT_REG, ACPI_DEVICE_SWNODE_PORT_NUM_OF, ACPI_DEVICE_SWNODE_PORT_NUM_ENTRIES }; enum acpi_device_swnode_ep_props { ACPI_DEVICE_SWNODE_EP_REMOTE_EP, ACPI_DEVICE_SWNODE_EP_BUS_TYPE, ACPI_DEVICE_SWNODE_EP_REG, ACPI_DEVICE_SWNODE_EP_CLOCK_LANES, ACPI_DEVICE_SWNODE_EP_DATA_LANES, ACPI_DEVICE_SWNODE_EP_LANE_POLARITIES, /* TX only */ ACPI_DEVICE_SWNODE_EP_LINK_FREQUENCIES, ACPI_DEVICE_SWNODE_EP_NUM_OF, ACPI_DEVICE_SWNODE_EP_NUM_ENTRIES }; /* * Each device has a root software node plus two times as many nodes as the * number of CSI-2 ports. */ #define ACPI_DEVICE_SWNODE_PORT(port) (2 * (port) + 1) #define ACPI_DEVICE_SWNODE_EP(endpoint) \ (ACPI_DEVICE_SWNODE_PORT(endpoint) + 1) /** * struct acpi_device_software_node_port - MIPI DisCo for Imaging CSI-2 port * @port_name: Port name. * @data_lanes: "data-lanes" property values. * @lane_polarities: "lane-polarities" property values. * @link_frequencies: "link_frequencies" property values. * @port_nr: Port number. * @crs_crs2_local: _CRS CSI2 record present (i.e. this is a transmitter one). * @port_props: Port properties. * @ep_props: Endpoint properties. * @remote_ep: Reference to the remote endpoint. */ struct acpi_device_software_node_port { char port_name[ACPI_DEVICE_SWNODE_PORT_NAME_LENGTH + 1]; u32 data_lanes[ACPI_DEVICE_CSI2_DATA_LANES]; u32 lane_polarities[ACPI_DEVICE_CSI2_DATA_LANES + 1 /* clock lane */]; u64 link_frequencies[ACPI_DEVICE_CSI2_DATA_LANES]; unsigned int port_nr; bool crs_csi2_local; struct property_entry port_props[ACPI_DEVICE_SWNODE_PORT_NUM_ENTRIES]; struct property_entry ep_props[ACPI_DEVICE_SWNODE_EP_NUM_ENTRIES]; struct software_node_ref_args remote_ep[1]; }; /** * struct acpi_device_software_nodes - Software nodes for an ACPI device * @dev_props: Device properties. * @nodes: Software nodes for root as well as ports and endpoints. * @nodeprts: Array of software node pointers, for (un)registering them. * @ports: Information related to each port and endpoint within a port. * @num_ports: The number of ports. */ struct acpi_device_software_nodes { struct property_entry dev_props[ACPI_DEVICE_SWNODE_DEV_NUM_ENTRIES]; struct software_node *nodes; const struct software_node **nodeptrs; struct acpi_device_software_node_port *ports; unsigned int num_ports; }; /* Device */ struct acpi_device { u32 pld_crc; int device_type; acpi_handle handle; /* no handle for fixed hardware */ struct fwnode_handle fwnode; struct list_head wakeup_list; struct list_head del_list; struct acpi_device_status status; struct acpi_device_flags flags; struct acpi_device_pnp pnp; struct acpi_device_power power; struct acpi_device_wakeup wakeup; struct acpi_device_perf performance; struct acpi_device_dir dir; struct acpi_device_data data; struct acpi_scan_handler *handler; struct acpi_hotplug_context *hp; struct acpi_device_software_nodes *swnodes; const struct acpi_gpio_mapping *driver_gpios; void *driver_data; struct device dev; unsigned int physical_node_count; unsigned int dep_unmet; struct list_head physical_node_list; struct mutex physical_node_lock; void (*remove)(struct acpi_device *); }; /* Non-device subnode */ struct acpi_data_node { const char *name; acpi_handle handle; struct fwnode_handle fwnode; struct fwnode_handle *parent; struct acpi_device_data data; struct list_head sibling; struct kobject kobj; struct completion kobj_done; }; extern const struct fwnode_operations acpi_device_fwnode_ops; extern const struct fwnode_operations acpi_data_fwnode_ops; extern const struct fwnode_operations acpi_static_fwnode_ops; bool is_acpi_device_node(const struct fwnode_handle *fwnode); bool is_acpi_data_node(const struct fwnode_handle *fwnode); static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { return (is_acpi_device_node(fwnode) || is_acpi_data_node(fwnode)); } #define to_acpi_device_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_device_node_fwnode = __fwnode; \ \ is_acpi_device_node(__to_acpi_device_node_fwnode) ? \ container_of(__to_acpi_device_node_fwnode, \ struct acpi_device, fwnode) : \ NULL; \ }) #define to_acpi_data_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_data_node_fwnode = __fwnode; \ \ is_acpi_data_node(__to_acpi_data_node_fwnode) ? \ container_of(__to_acpi_data_node_fwnode, \ struct acpi_data_node, fwnode) : \ NULL; \ }) static inline bool is_acpi_static_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_static_fwnode_ops; } static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode, const char *name) { return is_acpi_data_node(fwnode) ? (!strcmp(to_acpi_data_node(fwnode)->name, name)) : false; } static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev) { return &adev->fwnode; } static inline void *acpi_driver_data(struct acpi_device *d) { return d->driver_data; } #define to_acpi_device(d) container_of(d, struct acpi_device, dev) #define to_acpi_driver(d) container_of(d, struct acpi_driver, drv) static inline struct acpi_device *acpi_dev_parent(struct acpi_device *adev) { if (adev->dev.parent) return to_acpi_device(adev->dev.parent); return NULL; } static inline void acpi_set_device_status(struct acpi_device *adev, u32 sta) { *((u32 *)&adev->status) = sta; } static inline void acpi_set_hp_context(struct acpi_device *adev, struct acpi_hotplug_context *hp) { hp->self = adev; adev->hp = hp; } void acpi_initialize_hp_context(struct acpi_device *adev, struct acpi_hotplug_context *hp, int (*notify)(struct acpi_device *, u32), void (*uevent)(struct acpi_device *, u32)); /* acpi_device.dev.bus == &acpi_bus_type */ extern struct bus_type acpi_bus_type; int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data); int acpi_dev_for_each_child(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data); int acpi_dev_for_each_child_reverse(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data); /* * Events * ------ */ struct acpi_bus_event { struct list_head node; acpi_device_class device_class; acpi_bus_id bus_id; u32 type; u32 data; }; extern struct kobject *acpi_kobj; extern int acpi_bus_generate_netlink_event(const char*, const char*, u8, int); void acpi_bus_private_data_handler(acpi_handle, void *); int acpi_bus_get_private_data(acpi_handle, void **); int acpi_bus_attach_private_data(acpi_handle, void *); void acpi_bus_detach_private_data(acpi_handle); int acpi_dev_install_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler, void *context); void acpi_dev_remove_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler); extern int acpi_notifier_call_chain(struct acpi_device *, u32, u32); extern int register_acpi_notifier(struct notifier_block *); extern int unregister_acpi_notifier(struct notifier_block *); /* * External Functions */ acpi_status acpi_bus_get_status_handle(acpi_handle handle, unsigned long long *sta); int acpi_bus_get_status(struct acpi_device *device); int acpi_bus_set_power(acpi_handle handle, int state); const char *acpi_power_state_string(int state); int acpi_device_set_power(struct acpi_device *device, int state); int acpi_bus_init_power(struct acpi_device *device); int acpi_device_fix_up_power(struct acpi_device *device); void acpi_device_fix_up_power_extended(struct acpi_device *adev); void acpi_device_fix_up_power_children(struct acpi_device *adev); int acpi_bus_update_power(acpi_handle handle, int *state_p); int acpi_device_update_power(struct acpi_device *device, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); void acpi_dev_power_up_children_with_adr(struct acpi_device *adev); u8 acpi_dev_power_state_for_wake(struct acpi_device *adev); int acpi_device_power_add_dependent(struct acpi_device *adev, struct device *dev); void acpi_device_power_remove_dependent(struct acpi_device *adev, struct device *dev); #ifdef CONFIG_PM bool acpi_bus_can_wakeup(acpi_handle handle); #else static inline bool acpi_bus_can_wakeup(acpi_handle handle) { return false; } #endif void acpi_scan_lock_acquire(void); void acpi_scan_lock_release(void); void acpi_lock_hp_context(void); void acpi_unlock_hp_context(void); int acpi_scan_add_handler(struct acpi_scan_handler *handler); int acpi_bus_register_driver(struct acpi_driver *driver); void acpi_bus_unregister_driver(struct acpi_driver *driver); int acpi_bus_scan(acpi_handle handle); void acpi_bus_trim(struct acpi_device *start); acpi_status acpi_bus_get_ejd(acpi_handle handle, acpi_handle * ejd); int acpi_match_device_ids(struct acpi_device *device, const struct acpi_device_id *ids); void acpi_set_modalias(struct acpi_device *adev, const char *default_id, char *modalias, size_t len); static inline bool acpi_device_enumerated(struct acpi_device *adev) { return adev && adev->flags.initialized && adev->flags.visited; } /** * module_acpi_driver(acpi_driver) - Helper macro for registering an ACPI driver * @__acpi_driver: acpi_driver struct * * Helper macro for ACPI drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_acpi_driver(__acpi_driver) \ module_driver(__acpi_driver, acpi_bus_register_driver, \ acpi_bus_unregister_driver) /* * Bind physical devices with ACPI devices */ struct acpi_bus_type { struct list_head list; const char *name; bool (*match)(struct device *dev); struct acpi_device * (*find_companion)(struct device *); void (*setup)(struct device *); }; int register_acpi_bus_type(struct acpi_bus_type *); int unregister_acpi_bus_type(struct acpi_bus_type *); int acpi_bind_one(struct device *dev, struct acpi_device *adev); int acpi_unbind_one(struct device *dev); enum acpi_bridge_type { ACPI_BRIDGE_TYPE_PCIE = 1, ACPI_BRIDGE_TYPE_CXL, }; struct acpi_pci_root { struct acpi_device * device; struct pci_bus *bus; u16 segment; int bridge_type; struct resource secondary; /* downstream bus range */ u32 osc_support_set; /* _OSC state of support bits */ u32 osc_control_set; /* _OSC state of control bits */ u32 osc_ext_support_set; /* _OSC state of extended support bits */ u32 osc_ext_control_set; /* _OSC state of extended control bits */ phys_addr_t mcfg_addr; }; /* helper */ struct iommu_ops; bool acpi_dma_supported(const struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); int acpi_iommu_fwspec_init(struct device *dev, u32 id, struct fwnode_handle *fwnode, const struct iommu_ops *ops); int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map); int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, const u32 *input_id); static inline int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { return acpi_dma_configure_id(dev, attr, NULL); } struct acpi_device *acpi_find_child_device(struct acpi_device *parent, u64 address, bool check_children); struct acpi_device *acpi_find_child_by_adr(struct acpi_device *adev, acpi_bus_address adr); int acpi_is_root_bridge(acpi_handle); struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle); int acpi_enable_wakeup_device_power(struct acpi_device *dev, int state); int acpi_disable_wakeup_device_power(struct acpi_device *dev); #ifdef CONFIG_X86 bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status); bool acpi_quirk_skip_acpi_ac_and_battery(void); int acpi_install_cmos_rtc_space_handler(acpi_handle handle); void acpi_remove_cmos_rtc_space_handler(acpi_handle handle); #else static inline bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status) { return false; } static inline bool acpi_quirk_skip_acpi_ac_and_battery(void) { return false; } static inline int acpi_install_cmos_rtc_space_handler(acpi_handle handle) { return 1; } static inline void acpi_remove_cmos_rtc_space_handler(acpi_handle handle) { } #endif #if IS_ENABLED(CONFIG_X86_ANDROID_TABLETS) bool acpi_quirk_skip_i2c_client_enumeration(struct acpi_device *adev); int acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *skip); bool acpi_quirk_skip_gpio_event_handlers(void); #else static inline bool acpi_quirk_skip_i2c_client_enumeration(struct acpi_device *adev) { return false; } static inline int acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *skip) { *skip = false; return 0; } static inline bool acpi_quirk_skip_gpio_event_handlers(void) { return false; } #endif #ifdef CONFIG_PM void acpi_pm_wakeup_event(struct device *dev); acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*func)(struct acpi_device_wakeup_context *context)); acpi_status acpi_remove_pm_notifier(struct acpi_device *adev); bool acpi_pm_device_can_wakeup(struct device *dev); int acpi_pm_device_sleep_state(struct device *, int *, int); int acpi_pm_set_device_wakeup(struct device *dev, bool enable); #else static inline void acpi_pm_wakeup_event(struct device *dev) { } static inline acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*func)(struct acpi_device_wakeup_context *context)) { return AE_SUPPORT; } static inline acpi_status acpi_remove_pm_notifier(struct acpi_device *adev) { return AE_SUPPORT; } static inline bool acpi_pm_device_can_wakeup(struct device *dev) { return false; } static inline int acpi_pm_device_sleep_state(struct device *d, int *p, int m) { if (p) *p = ACPI_STATE_D0; return (m >= ACPI_STATE_D0 && m <= ACPI_STATE_D3_COLD) ? m : ACPI_STATE_D0; } static inline int acpi_pm_set_device_wakeup(struct device *dev, bool enable) { return -ENODEV; } #endif #ifdef CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT bool acpi_sleep_state_supported(u8 sleep_state); #else static inline bool acpi_sleep_state_supported(u8 sleep_state) { return false; } #endif #ifdef CONFIG_ACPI_SLEEP u32 acpi_target_system_state(void); #else static inline u32 acpi_target_system_state(void) { return ACPI_STATE_S0; } #endif static inline bool acpi_device_power_manageable(struct acpi_device *adev) { return adev->flags.power_manageable; } static inline bool acpi_device_can_wakeup(struct acpi_device *adev) { return adev->wakeup.flags.valid; } static inline bool acpi_device_can_poweroff(struct acpi_device *adev) { return adev->power.states[ACPI_STATE_D3_COLD].flags.valid || ((acpi_gbl_FADT.header.revision < 6) && adev->power.states[ACPI_STATE_D3_HOT].flags.explicit_set); } int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer); static inline bool acpi_dev_hid_match(struct acpi_device *adev, const char *hid2) { const char *hid1 = acpi_device_hid(adev); return hid1 && hid2 && !strcmp(hid1, hid2); } static inline bool acpi_str_uid_match(struct acpi_device *adev, const char *uid2) { const char *uid1 = acpi_device_uid(adev); return uid1 && uid2 && !strcmp(uid1, uid2); } static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2) { u64 uid1; return !acpi_dev_uid_to_integer(adev, &uid1) && uid1 == uid2; } #define TYPE_ENTRY(type, x) \ const type: x, \ type: x #define ACPI_STR_TYPES(match) \ TYPE_ENTRY(unsigned char *, match), \ TYPE_ENTRY(signed char *, match), \ TYPE_ENTRY(char *, match), \ TYPE_ENTRY(void *, match) /** * acpi_dev_uid_match - Match device by supplied UID * @adev: ACPI device to match. * @uid2: Unique ID of the device. * * Matches UID in @adev with given @uid2. * * Returns: %true if matches, %false otherwise. */ #define acpi_dev_uid_match(adev, uid2) \ _Generic(uid2, \ /* Treat @uid2 as a string for acpi string types */ \ ACPI_STR_TYPES(acpi_str_uid_match), \ /* Treat as an integer otherwise */ \ default: acpi_int_uid_match)(adev, uid2) /** * acpi_dev_hid_uid_match - Match device by supplied HID and UID * @adev: ACPI device to match. * @hid2: Hardware ID of the device. * @uid2: Unique ID of the device, pass 0 or NULL to not check _UID. * * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2 * will be treated as a match. If user wants to validate @uid2, it should be * done before calling this function. * * Returns: %true if matches or @uid2 is 0 or NULL, %false otherwise. */ #define acpi_dev_hid_uid_match(adev, hid2, uid2) \ (acpi_dev_hid_match(adev, hid2) && \ (!(uid2) || acpi_dev_uid_match(adev, uid2))) void acpi_dev_clear_dependencies(struct acpi_device *supplier); bool acpi_dev_ready_for_enumeration(const struct acpi_device *device); struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier, struct acpi_device *start); /** * for_each_acpi_consumer_dev - iterate over the consumer ACPI devices for a * given supplier * @supplier: Pointer to the supplier's ACPI device * @consumer: Pointer to &struct acpi_device to hold the consumer, initially NULL */ #define for_each_acpi_consumer_dev(supplier, consumer) \ for (consumer = acpi_dev_get_next_consumer_dev(supplier, NULL); \ consumer; \ consumer = acpi_dev_get_next_consumer_dev(supplier, consumer)) struct acpi_device * acpi_dev_get_next_match_dev(struct acpi_device *adev, const char *hid, const char *uid, s64 hrv); struct acpi_device * acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv); /** * for_each_acpi_dev_match - iterate over ACPI devices that matching the criteria * @adev: pointer to the matching ACPI device, NULL at the end of the loop * @hid: Hardware ID of the device. * @uid: Unique ID of the device, pass NULL to not check _UID * @hrv: Hardware Revision of the device, pass -1 to not check _HRV * * The caller is responsible for invoking acpi_dev_put() on the returned device. */ #define for_each_acpi_dev_match(adev, hid, uid, hrv) \ for (adev = acpi_dev_get_first_match_dev(hid, uid, hrv); \ adev; \ adev = acpi_dev_get_next_match_dev(adev, hid, uid, hrv)) static inline struct acpi_device *acpi_dev_get(struct acpi_device *adev) { return adev ? to_acpi_device(get_device(&adev->dev)) : NULL; } static inline void acpi_dev_put(struct acpi_device *adev) { if (adev) put_device(&adev->dev); } struct acpi_device *acpi_fetch_acpi_dev(acpi_handle handle); struct acpi_device *acpi_get_acpi_dev(acpi_handle handle); static inline void acpi_put_acpi_dev(struct acpi_device *adev) { acpi_dev_put(adev); } #else /* CONFIG_ACPI */ static inline int register_acpi_bus_type(void *bus) { return 0; } static inline int unregister_acpi_bus_type(void *bus) { return 0; } #endif /* CONFIG_ACPI */ #endif /*__ACPI_BUS_H__*/ |
8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 3 3 3 3 1 1 1 1 1 1 1 1 1 3 3 1 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 | // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * SMC statistics netlink routines * * Copyright IBM Corp. 2021 * * Author(s): Guvenc Gulce */ #include <linux/init.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/ctype.h> #include <linux/smc.h> #include <net/genetlink.h> #include <net/sock.h> #include "smc_netlink.h" #include "smc_stats.h" int smc_stats_init(struct net *net) { net->smc.fback_rsn = kzalloc(sizeof(*net->smc.fback_rsn), GFP_KERNEL); if (!net->smc.fback_rsn) goto err_fback; net->smc.smc_stats = alloc_percpu(struct smc_stats); if (!net->smc.smc_stats) goto err_stats; mutex_init(&net->smc.mutex_fback_rsn); return 0; err_stats: kfree(net->smc.fback_rsn); err_fback: return -ENOMEM; } void smc_stats_exit(struct net *net) { kfree(net->smc.fback_rsn); if (net->smc.smc_stats) free_percpu(net->smc.smc_stats); } static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb, struct smc_stats *stats, int tech, int type) { struct smc_stats_rmbcnt *stats_rmb_cnt; struct nlattr *attrs; if (type == SMC_NLA_STATS_T_TX_RMB_STATS) stats_rmb_cnt = &stats->smc[tech].rmb_tx; else stats_rmb_cnt = &stats->smc[tech].rmb_rx; attrs = nla_nest_start(skb, type); if (!attrs) goto errout; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_REUSE_CNT, stats_rmb_cnt->reuse_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_PEER_CNT, stats_rmb_cnt->buf_size_small_peer_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_CNT, stats_rmb_cnt->buf_size_small_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_PEER_CNT, stats_rmb_cnt->buf_full_peer_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_CNT, stats_rmb_cnt->buf_full_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_ALLOC_CNT, stats_rmb_cnt->alloc_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_DGRADE_CNT, stats_rmb_cnt->dgrade_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } static int smc_nl_fill_stats_bufsize_data(struct sk_buff *skb, struct smc_stats *stats, int tech, int type) { struct smc_stats_memsize *stats_pload; struct nlattr *attrs; if (type == SMC_NLA_STATS_T_TXPLOAD_SIZE) stats_pload = &stats->smc[tech].tx_pd; else if (type == SMC_NLA_STATS_T_RXPLOAD_SIZE) stats_pload = &stats->smc[tech].rx_pd; else if (type == SMC_NLA_STATS_T_TX_RMB_SIZE) stats_pload = &stats->smc[tech].tx_rmbsize; else if (type == SMC_NLA_STATS_T_RX_RMB_SIZE) stats_pload = &stats->smc[tech].rx_rmbsize; else goto errout; attrs = nla_nest_start(skb, type); if (!attrs) goto errout; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_8K, stats_pload->buf[SMC_BUF_8K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_16K, stats_pload->buf[SMC_BUF_16K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_32K, stats_pload->buf[SMC_BUF_32K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_64K, stats_pload->buf[SMC_BUF_64K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_128K, stats_pload->buf[SMC_BUF_128K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_256K, stats_pload->buf[SMC_BUF_256K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_512K, stats_pload->buf[SMC_BUF_512K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_1024K, stats_pload->buf[SMC_BUF_1024K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_G_1024K, stats_pload->buf[SMC_BUF_G_1024K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } static int smc_nl_fill_stats_tech_data(struct sk_buff *skb, struct smc_stats *stats, int tech) { struct smc_stats_tech *smc_tech; struct nlattr *attrs; smc_tech = &stats->smc[tech]; if (tech == SMC_TYPE_D) attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCD_TECH); else attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCR_TECH); if (!attrs) goto errout; if (smc_nl_fill_stats_rmb_data(skb, stats, tech, SMC_NLA_STATS_T_TX_RMB_STATS)) goto errattr; if (smc_nl_fill_stats_rmb_data(skb, stats, tech, SMC_NLA_STATS_T_RX_RMB_STATS)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_TXPLOAD_SIZE)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_RXPLOAD_SIZE)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_TX_RMB_SIZE)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_RX_RMB_SIZE)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V1_SUCC, smc_tech->clnt_v1_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V2_SUCC, smc_tech->clnt_v2_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V1_SUCC, smc_tech->srv_v1_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V2_SUCC, smc_tech->srv_v2_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_BYTES, smc_tech->rx_bytes, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_BYTES, smc_tech->tx_bytes, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT, smc_tech->rx_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_CNT, smc_tech->tx_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SENDPAGE_CNT, 0, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CORK_CNT, smc_tech->cork_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_NDLY_CNT, smc_tech->ndly_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SPLICE_CNT, smc_tech->splice_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_URG_DATA_CNT, smc_tech->urg_data_cnt, SMC_NLA_STATS_PAD)) goto errattr; nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct net *net = sock_net(skb->sk); struct smc_stats *stats; struct nlattr *attrs; int cpu, i, size; void *nlh; u64 *src; u64 *sum; if (cb_ctx->pos[0]) goto errmsg; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_STATS); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_STATS); if (!attrs) goto errnest; stats = kzalloc(sizeof(*stats), GFP_KERNEL); if (!stats) goto erralloc; size = sizeof(*stats) / sizeof(u64); for_each_possible_cpu(cpu) { src = (u64 *)per_cpu_ptr(net->smc.smc_stats, cpu); sum = (u64 *)stats; for (i = 0; i < size; i++) *(sum++) += *(src++); } if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_D)) goto errattr; if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_R)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_CLNT_HS_ERR_CNT, stats->clnt_hshake_err_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_SRV_HS_ERR_CNT, stats->srv_hshake_err_cnt, SMC_NLA_STATS_PAD)) goto errattr; nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); cb_ctx->pos[0] = 1; kfree(stats); return skb->len; errattr: kfree(stats); erralloc: nla_nest_cancel(skb, attrs); errnest: genlmsg_cancel(skb, nlh); errmsg: return skb->len; } static int smc_nl_get_fback_details(struct sk_buff *skb, struct netlink_callback *cb, int pos, bool is_srv) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct net *net = sock_net(skb->sk); int cnt_reported = cb_ctx->pos[2]; struct smc_stats_fback *trgt_arr; struct nlattr *attrs; int rc = 0; void *nlh; if (is_srv) trgt_arr = &net->smc.fback_rsn->srv[0]; else trgt_arr = &net->smc.fback_rsn->clnt[0]; if (!trgt_arr[pos].fback_code) return -ENODATA; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_FBACK_STATS); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_FBACK_STATS); if (!attrs) goto errout; if (nla_put_u8(skb, SMC_NLA_FBACK_STATS_TYPE, is_srv)) goto errattr; if (!cnt_reported) { if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_SRV_CNT, net->smc.fback_rsn->srv_fback_cnt, SMC_NLA_FBACK_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_CLNT_CNT, net->smc.fback_rsn->clnt_fback_cnt, SMC_NLA_FBACK_STATS_PAD)) goto errattr; cnt_reported = 1; } if (nla_put_u32(skb, SMC_NLA_FBACK_STATS_RSN_CODE, trgt_arr[pos].fback_code)) goto errattr; if (nla_put_u16(skb, SMC_NLA_FBACK_STATS_RSN_CNT, trgt_arr[pos].count)) goto errattr; cb_ctx->pos[2] = cnt_reported; nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return rc; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct net *net = sock_net(skb->sk); int rc_srv = 0, rc_clnt = 0, k; int skip_serv = cb_ctx->pos[1]; int snum = cb_ctx->pos[0]; bool is_srv = true; mutex_lock(&net->smc.mutex_fback_rsn); for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) { if (k < snum) continue; if (!skip_serv) { rc_srv = smc_nl_get_fback_details(skb, cb, k, is_srv); if (rc_srv && rc_srv != -ENODATA) break; } else { skip_serv = 0; } rc_clnt = smc_nl_get_fback_details(skb, cb, k, !is_srv); if (rc_clnt && rc_clnt != -ENODATA) { skip_serv = 1; break; } if (rc_clnt == -ENODATA && rc_srv == -ENODATA) break; } mutex_unlock(&net->smc.mutex_fback_rsn); cb_ctx->pos[1] = skip_serv; cb_ctx->pos[0] = k; return skb->len; } |
1004 1190 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MATH64_H #define _LINUX_MATH64_H #include <linux/types.h> #include <linux/math.h> #include <vdso/math64.h> #include <asm/div64.h> #if BITS_PER_LONG == 64 #define div64_long(x, y) div64_s64((x), (y)) #define div64_ul(x, y) div64_u64((x), (y)) /** * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * @remainder: pointer to unsigned 32bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor * * This is commonly provided by 32bit archs to provide an optimized 64bit * divide. */ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /** * div_s64_rem - signed 64bit divide with 32bit divisor with remainder * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * @remainder: pointer to signed 32bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor */ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /** * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * @remainder: pointer to unsigned 64bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor */ static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /** * div64_u64 - unsigned 64bit divide with 64bit divisor * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * * Return: dividend / divisor */ static inline u64 div64_u64(u64 dividend, u64 divisor) { return dividend / divisor; } /** * div64_s64 - signed 64bit divide with 64bit divisor * @dividend: signed 64bit dividend * @divisor: signed 64bit divisor * * Return: dividend / divisor */ static inline s64 div64_s64(s64 dividend, s64 divisor) { return dividend / divisor; } #elif BITS_PER_LONG == 32 #define div64_long(x, y) div_s64((x), (y)) #define div64_ul(x, y) div_u64((x), (y)) #ifndef div_u64_rem static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) { *remainder = do_div(dividend, divisor); return dividend; } #endif #ifndef div_s64_rem extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); #endif #ifndef div64_u64_rem extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder); #endif #ifndef div64_u64 extern u64 div64_u64(u64 dividend, u64 divisor); #endif #ifndef div64_s64 extern s64 div64_s64(s64 dividend, s64 divisor); #endif #endif /* BITS_PER_LONG */ /** * div_u64 - unsigned 64bit divide with 32bit divisor * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * * This is the most common 64bit divide and should be used if possible, * as many 32bit archs can optimize this variant better than a full 64bit * divide. * * Return: dividend / divisor */ #ifndef div_u64 static inline u64 div_u64(u64 dividend, u32 divisor) { u32 remainder; return div_u64_rem(dividend, divisor, &remainder); } #endif /** * div_s64 - signed 64bit divide with 32bit divisor * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * * Return: dividend / divisor */ #ifndef div_s64 static inline s64 div_s64(s64 dividend, s32 divisor) { s32 remainder; return div_s64_rem(dividend, divisor, &remainder); } #endif u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder); #ifndef mul_u32_u32 /* * Many a GCC version messes this up and generates a 64x64 mult :-( */ static inline u64 mul_u32_u32(u32 a, u32 b) { return (u64)a * b; } #endif #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_shr static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } #endif /* mul_u64_u32_shr */ #ifndef mul_u64_u64_shr static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } #endif /* mul_u64_u64_shr */ #else #ifndef mul_u64_u32_shr static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { u32 ah, al; u64 ret; al = a; ah = a >> 32; ret = mul_u32_u32(al, mul) >> shift; if (ah) ret += mul_u32_u32(ah, mul) << (32 - shift); return ret; } #endif /* mul_u64_u32_shr */ #ifndef mul_u64_u64_shr static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift) { union { u64 ll; struct { #ifdef __BIG_ENDIAN u32 high, low; #else u32 low, high; #endif } l; } rl, rm, rn, rh, a0, b0; u64 c; a0.ll = a; b0.ll = b; rl.ll = mul_u32_u32(a0.l.low, b0.l.low); rm.ll = mul_u32_u32(a0.l.low, b0.l.high); rn.ll = mul_u32_u32(a0.l.high, b0.l.low); rh.ll = mul_u32_u32(a0.l.high, b0.l.high); /* * Each of these lines computes a 64-bit intermediate result into "c", * starting at bits 32-95. The low 32-bits go into the result of the * multiplication, the high 32-bits are carried into the next step. */ rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low; rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low; rh.l.high = (c >> 32) + rh.l.high; /* * The 128-bit result of the multiplication is in rl.ll and rh.ll, * shift it right and throw away the high part of the result. */ if (shift == 0) return rl.ll; if (shift < 64) return (rl.ll >> shift) | (rh.ll << (64 - shift)); return rh.ll >> (shift & 63); } #endif /* mul_u64_u64_shr */ #endif #ifndef mul_s64_u64_shr static inline u64 mul_s64_u64_shr(s64 a, u64 b, unsigned int shift) { u64 ret; /* * Extract the sign before the multiplication and put it back * afterwards if needed. */ ret = mul_u64_u64_shr(abs(a), b, shift); if (a < 0) ret = -((s64) ret); return ret; } #endif /* mul_s64_u64_shr */ #ifndef mul_u64_u32_div static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) { union { u64 ll; struct { #ifdef __BIG_ENDIAN u32 high, low; #else u32 low, high; #endif } l; } u, rl, rh; u.ll = a; rl.ll = mul_u32_u32(u.l.low, mul); rh.ll = mul_u32_u32(u.l.high, mul) + rl.l.high; /* Bits 32-63 of the result will be in rh.l.low. */ rl.l.high = do_div(rh.ll, divisor); /* Bits 0-31 of the result will be in rl.l.low. */ do_div(rl.ll, divisor); rl.l.high = rh.l.low; return rl.ll; } #endif /* mul_u64_u32_div */ u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); /** * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up * @ll: unsigned 64bit dividend * @d: unsigned 64bit divisor * * Divide unsigned 64bit dividend by unsigned 64bit divisor * and round up. * * Return: dividend / divisor rounded up */ #define DIV64_U64_ROUND_UP(ll, d) \ ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) /** * DIV64_U64_ROUND_CLOSEST - unsigned 64bit divide with 64bit divisor rounded to nearest integer * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * * Divide unsigned 64bit dividend by unsigned 64bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV64_U64_ROUND_CLOSEST(dividend, divisor) \ ({ u64 _tmp = (divisor); div64_u64((dividend) + _tmp / 2, _tmp); }) /** * DIV_U64_ROUND_CLOSEST - unsigned 64bit divide with 32bit divisor rounded to nearest integer * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * * Divide unsigned 64bit dividend by unsigned 32bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV_U64_ROUND_CLOSEST(dividend, divisor) \ ({ u32 _tmp = (divisor); div_u64((u64)(dividend) + _tmp / 2, _tmp); }) /** * DIV_S64_ROUND_CLOSEST - signed 64bit divide with 32bit divisor rounded to nearest integer * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * * Divide signed 64bit dividend by signed 32bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV_S64_ROUND_CLOSEST(dividend, divisor)( \ { \ s64 __x = (dividend); \ s32 __d = (divisor); \ ((__x > 0) == (__d > 0)) ? \ div_s64((__x + (__d / 2)), __d) : \ div_s64((__x - (__d / 2)), __d); \ } \ ) #endif /* _LINUX_MATH64_H */ |
1190 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SMP_H #define __LINUX_SMP_H /* * Generic SMP support * Alan Cox. <alan@redhat.com> */ #include <linux/errno.h> #include <linux/types.h> #include <linux/list.h> #include <linux/cpumask.h> #include <linux/init.h> #include <linux/smp_types.h> typedef void (*smp_call_func_t)(void *info); typedef bool (*smp_cond_func_t)(int cpu, void *info); /* * structure shares (partial) layout with struct irq_work */ struct __call_single_data { struct __call_single_node node; smp_call_func_t func; void *info; }; #define CSD_INIT(_func, _info) \ (struct __call_single_data){ .func = (_func), .info = (_info), } /* Use __aligned() to avoid to use 2 cache lines for 1 csd */ typedef struct __call_single_data call_single_data_t __aligned(sizeof(struct __call_single_data)); #define INIT_CSD(_csd, _func, _info) \ do { \ *(_csd) = CSD_INIT((_func), (_info)); \ } while (0) /* * Enqueue a llist_node on the call_single_queue; be very careful, read * flush_smp_call_function_queue() in detail. */ extern void __smp_call_single_queue(int cpu, struct llist_node *node); /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask); int smp_call_function_single_async(int cpu, call_single_data_t *csd); /* * Cpus stopping functions in panic. All have default weak definitions. * Architecture-dependent code may override them. */ void __noreturn panic_smp_self_stop(void); void __noreturn nmi_panic_self_stop(struct pt_regs *regs); void crash_smp_send_stop(void); /* * Call a function on all processors */ static inline void on_each_cpu(smp_call_func_t func, void *info, int wait) { on_each_cpu_cond_mask(NULL, func, info, wait, cpu_online_mask); } /** * on_each_cpu_mask(): Run a function on processors specified by * cpumask, which may include the local processor. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. The * exception is that it may be used during early boot while * early_boot_irqs_disabled is set. */ static inline void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { on_each_cpu_cond_mask(NULL, func, info, wait, mask); } /* * Call a function on each processor for which the supplied function * cond_func returns a positive value. This may include the local * processor. May be used during early boot while early_boot_irqs_disabled is * set. Use local_irq_save/restore() instead of local_irq_disable/enable(). */ static inline void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait) { on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); } #ifdef CONFIG_SMP #include <linux/preempt.h> #include <linux/compiler.h> #include <linux/thread_info.h> #include <asm/smp.h> /* * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. * (defined in asm header): */ /* * stops all CPUs but the current one: */ extern void smp_send_stop(void); /* * sends a 'reschedule' event to another CPU: */ extern void arch_smp_send_reschedule(int cpu); /* * scheduler_ipi() is inline so can't be passed as callback reason, but the * callsite IP should be sufficient for root-causing IPIs sent from here. */ #define smp_send_reschedule(cpu) ({ \ trace_ipi_send_cpu(cpu, _RET_IP_, NULL); \ arch_smp_send_reschedule(cpu); \ }) /* * Prepare machine for booting other CPUs. */ extern void smp_prepare_cpus(unsigned int max_cpus); /* * Bring a CPU up */ extern int __cpu_up(unsigned int cpunum, struct task_struct *tidle); /* * Final polishing of CPUs */ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ void smp_call_function(smp_call_func_t func, void *info, int wait); void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait); int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait); void kick_all_cpus_sync(void); void wake_up_all_idle_cpus(void); /* * Generic and arch helpers */ void __init call_function_init(void); void generic_smp_call_function_single_interrupt(void); #define generic_smp_call_function_interrupt \ generic_smp_call_function_single_interrupt /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. */ void smp_prepare_boot_cpu(void); extern unsigned int setup_max_cpus; extern void __init setup_nr_cpu_ids(void); extern void __init smp_init(void); extern int __boot_cpu_id; static inline int get_boot_cpu_id(void) { return __boot_cpu_id; } #else /* !SMP */ static inline void smp_send_stop(void) { } /* * These macros fold the SMP functionality into a single CPU system */ #define raw_smp_processor_id() 0 static inline void up_smp_call_function(smp_call_func_t func, void *info) { } #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) static inline void smp_send_reschedule(int cpu) { } #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ (up_smp_call_function(func, info)) static inline void call_function_init(void) { } static inline int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { return smp_call_function_single(0, func, info, wait); } static inline void kick_all_cpus_sync(void) { } static inline void wake_up_all_idle_cpus(void) { } #ifdef CONFIG_UP_LATE_INIT extern void __init up_late_init(void); static inline void smp_init(void) { up_late_init(); } #else static inline void smp_init(void) { } #endif static inline int get_boot_cpu_id(void) { return 0; } #endif /* !SMP */ /** * raw_processor_id() - get the current (unstable) CPU id * * For then you know what you are doing and need an unstable * CPU id. */ /** * smp_processor_id() - get the current (stable) CPU id * * This is the normal accessor to the CPU id and should be used * whenever possible. * * The CPU id is stable when: * * - IRQs are disabled; * - preemption is disabled; * - the task is CPU affine. * * When CONFIG_DEBUG_PREEMPT; we verify these assumption and WARN * when smp_processor_id() is used when the CPU id is not stable. */ /* * Allow the architecture to differentiate between a stable and unstable read. * For example, x86 uses an IRQ-safe asm-volatile read for the unstable but a * regular asm read for the stable. */ #ifndef __smp_processor_id #define __smp_processor_id(x) raw_smp_processor_id(x) #endif #ifdef CONFIG_DEBUG_PREEMPT extern unsigned int debug_smp_processor_id(void); # define smp_processor_id() debug_smp_processor_id() #else # define smp_processor_id() __smp_processor_id() #endif #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: */ extern void arch_disable_smp_support(void); extern void arch_thaw_secondary_cpus_begin(void); extern void arch_thaw_secondary_cpus_end(void); void smp_setup_processor_id(void); int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys); /* SMP core functions */ int smpcfd_prepare_cpu(unsigned int cpu); int smpcfd_dead_cpu(unsigned int cpu); int smpcfd_dying_cpu(unsigned int cpu); #endif /* __LINUX_SMP_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 | /* SPDX-License-Identifier: GPL-2.0 */ /* Interface for implementing AF_XDP zero-copy support in drivers. * Copyright(c) 2020 Intel Corporation. */ #ifndef _LINUX_XDP_SOCK_DRV_H #define _LINUX_XDP_SOCK_DRV_H #include <net/xdp_sock.h> #include <net/xsk_buff_pool.h> #define XDP_UMEM_MIN_CHUNK_SHIFT 11 #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT) struct xsk_cb_desc { void *src; u8 off; u8 bytes; }; #ifdef CONFIG_XDP_SOCKETS void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool); void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool); bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool); static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { return XDP_PACKET_HEADROOM + pool->headroom; } static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return pool->chunk_size; } static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { xp_set_rxq_info(pool, rxq); } static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc) { xp_fill_cb(pool, desc); } static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) { #ifdef CONFIG_NET_RX_BUSY_POLL return pool->heads[0].xdp.rxq->napi_id; #else return 0; #endif } static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { xp_dma_unmap(pool, attrs); } static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs) { struct xdp_umem *umem = pool->umem; return xp_dma_map(pool, dev, attrs, umem->pgs, umem->npgs); } static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); return xp_get_dma(xskb); } static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); return xp_get_frame_dma(xskb); } static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { return xp_alloc(pool); } static inline bool xsk_is_eop_desc(struct xdp_desc *desc) { return !xp_mb_desc(desc); } /* Returns as many entries as possible up to max. 0 <= N <= max. */ static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { return xp_alloc_batch(pool, xdp, max); } static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return xp_can_alloc(pool, count); } static inline void xsk_buff_free(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); struct list_head *xskb_list = &xskb->pool->xskb_list; struct xdp_buff_xsk *pos, *tmp; if (likely(!xdp_buff_has_frags(xdp))) goto out; list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { list_del(&pos->xskb_list_node); xp_free(pos); } xdp_get_shared_info_from_buff(xdp)->nr_frags = 0; out: xp_free(xskb); } static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list); } static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff *ret = NULL; struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, struct xdp_buff_xsk, xskb_list_node); if (frag) { list_del(&frag->xskb_list_node); ret = &frag->xdp; } return ret; } static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; xdp->data_meta = xdp->data; xdp->data_end = xdp->data + size; } static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) { return xp_raw_get_dma(pool, addr); } static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { return xp_raw_get_data(pool, addr); } #define XDP_TXMD_FLAGS_VALID ( \ XDP_TXMD_FLAGS_TIMESTAMP | \ XDP_TXMD_FLAGS_CHECKSUM | \ 0) static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) { return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); } static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { struct xsk_tx_metadata *meta; if (!pool->tx_metadata_len) return NULL; meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) return NULL; /* no way to signal the error to the user */ return meta; } static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); if (!pool->dma_need_sync) return; xp_dma_sync_for_cpu(xskb); } static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { xp_dma_sync_for_device(pool, dma, size); } #else static inline void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { } static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { return false; } static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max) { return 0; } static inline void xsk_tx_release(struct xsk_buff_pool *pool) { } static inline struct xsk_buff_pool * xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { return NULL; } static inline void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { } static inline void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) { } static inline void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) { } static inline void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) { } static inline bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) { return false; } static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { return 0; } static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return 0; } static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return 0; } static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { } static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc) { } static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) { return 0; } static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { } static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs) { return 0; } static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) { return 0; } static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) { return 0; } static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { return NULL; } static inline bool xsk_is_eop_desc(struct xdp_desc *desc) { return false; } static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { return 0; } static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return false; } static inline void xsk_buff_free(struct xdp_buff *xdp) { } static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { } static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) { return NULL; } static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { } static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) { return 0; } static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { return NULL; } static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) { return false; } static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { return NULL; } static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { } static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { } #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_DRV_H */ |
72 292 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | /* SPDX-License-Identifier: GPL-2.0 */ /** * lib/minmax.c: windowed min/max tracker by Kathleen Nichols. * */ #ifndef MINMAX_H #define MINMAX_H #include <linux/types.h> /* A single data point for our parameterized min-max tracker */ struct minmax_sample { u32 t; /* time measurement was taken */ u32 v; /* value measured */ }; /* State for the parameterized min-max tracker */ struct minmax { struct minmax_sample s[3]; }; static inline u32 minmax_get(const struct minmax *m) { return m->s[0].v; } static inline u32 minmax_reset(struct minmax *m, u32 t, u32 meas) { struct minmax_sample val = { .t = t, .v = meas }; m->s[2] = m->s[1] = m->s[0] = val; return m->s[0].v; } u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas); u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas); #endif |
62 62 62 62 62 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | // SPDX-License-Identifier: GPL-2.0-or-later /* Key permission checking * * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/security.h> #include "internal.h" /** * key_task_permission - Check a key can be used * @key_ref: The key to check. * @cred: The credentials to use. * @need_perm: The permission required. * * Check to see whether permission is granted to use a key in the desired way, * but permit the security modules to override. * * The caller must hold either a ref on cred or must hold the RCU readlock. * * Returns 0 if successful, -EACCES if access is denied based on the * permissions bits or the LSM check. */ int key_task_permission(const key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { struct key *key; key_perm_t kperm, mask; int ret; switch (need_perm) { default: WARN_ON(1); return -EACCES; case KEY_NEED_UNLINK: case KEY_SYSADMIN_OVERRIDE: case KEY_AUTHTOKEN_OVERRIDE: case KEY_DEFER_PERM_CHECK: goto lsm; case KEY_NEED_VIEW: mask = KEY_OTH_VIEW; break; case KEY_NEED_READ: mask = KEY_OTH_READ; break; case KEY_NEED_WRITE: mask = KEY_OTH_WRITE; break; case KEY_NEED_SEARCH: mask = KEY_OTH_SEARCH; break; case KEY_NEED_LINK: mask = KEY_OTH_LINK; break; case KEY_NEED_SETATTR: mask = KEY_OTH_SETATTR; break; } key = key_ref_to_ptr(key_ref); /* use the second 8-bits of permissions for keys the caller owns */ if (uid_eq(key->uid, cred->fsuid)) { kperm = key->perm >> 16; goto use_these_perms; } /* use the third 8-bits of permissions for keys the caller has a group * membership in common with */ if (gid_valid(key->gid) && key->perm & KEY_GRP_ALL) { if (gid_eq(key->gid, cred->fsgid)) { kperm = key->perm >> 8; goto use_these_perms; } ret = groups_search(cred->group_info, key->gid); if (ret) { kperm = key->perm >> 8; goto use_these_perms; } } /* otherwise use the least-significant 8-bits */ kperm = key->perm; use_these_perms: /* use the top 8-bits of permissions for keys the caller possesses * - possessor permissions are additive with other permissions */ if (is_key_possessed(key_ref)) kperm |= key->perm >> 24; if ((kperm & mask) != mask) return -EACCES; /* let LSM be the final arbiter */ lsm: return security_key_permission(key_ref, cred, need_perm); } EXPORT_SYMBOL(key_task_permission); /** * key_validate - Validate a key. * @key: The key to be validated. * * Check that a key is valid, returning 0 if the key is okay, -ENOKEY if the * key is invalidated, -EKEYREVOKED if the key's type has been removed or if * the key has been revoked or -EKEYEXPIRED if the key has expired. */ int key_validate(const struct key *key) { unsigned long flags = READ_ONCE(key->flags); time64_t expiry = READ_ONCE(key->expiry); if (flags & (1 << KEY_FLAG_INVALIDATED)) return -ENOKEY; /* check it's still accessible */ if (flags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD))) return -EKEYREVOKED; /* check it hasn't expired */ if (expiry) { if (ktime_get_real_seconds() >= expiry) return -EKEYEXPIRED; } return 0; } EXPORT_SYMBOL(key_validate); |
1718 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 | /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/module.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/hashtable.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); static struct workqueue_struct *ib_unreg_wq; /* * Each of the three rwsem locks (devices, clients, client_data) protects the * xarray of the same name. Specifically it allows the caller to assert that * the MARK will/will not be changing under the lock, and for devices and * clients, that the value in the xarray is still a valid pointer. Change of * the MARK is linked to the object state, so holding the lock and testing the * MARK also asserts that the contained object is in a certain state. * * This is used to build a two stage register/unregister flow where objects * can continue to be in the xarray even though they are still in progress to * register/unregister. * * The xarray itself provides additional locking, and restartable iteration, * which is also relied on. * * Locks should not be nested, with the exception of client_data, which is * allowed to nest under the read side of the other two locks. * * The devices_rwsem also protects the device name list, any change or * assignment of device name must also hold the write side to guarantee unique * names. */ /* * devices contains devices that have had their names assigned. The * devices may not be registered. Users that care about the registration * status need to call ib_device_try_get() on the device to ensure it is * registered, and keep it registered, for the required duration. * */ static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); static DECLARE_RWSEM(devices_rwsem); #define DEVICE_REGISTERED XA_MARK_1 static u32 highest_client_id; #define CLIENT_REGISTERED XA_MARK_1 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); static DECLARE_RWSEM(clients_rwsem); static void ib_client_put(struct ib_client *client) { if (refcount_dec_and_test(&client->uses)) complete(&client->uses_zero); } /* * If client_data is registered then the corresponding client must also still * be registered. */ #define CLIENT_DATA_REGISTERED XA_MARK_1 unsigned int rdma_dev_net_id; /* * A list of net namespaces is maintained in an xarray. This is necessary * because we can't get the locking right using the existing net ns list. We * would require a init_net callback after the list is updated. */ static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); /* * rwsem to protect accessing the rdma_nets xarray entries. */ static DECLARE_RWSEM(rdma_nets_rwsem); bool ib_devices_shared_netns = true; module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); MODULE_PARM_DESC(netns_mode, "Share device among net namespaces; default=1 (shared)"); /** * rdma_dev_access_netns() - Return whether an rdma device can be accessed * from a specified net namespace or not. * @dev: Pointer to rdma device which needs to be checked * @net: Pointer to net namesapce for which access to be checked * * When the rdma device is in shared mode, it ignores the net namespace. * When the rdma device is exclusive to a net namespace, rdma device net * namespace is checked against the specified one. */ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) { return (ib_devices_shared_netns || net_eq(read_pnet(&dev->coredev.rdma_net), net)); } EXPORT_SYMBOL(rdma_dev_access_netns); /* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. So we need our own iterator to see all values stored in * the array. This does the same thing as xa_for_each except that it also * returns NULL valued entries if the array is allocating. Simplified to only * work on simple xarrays. */ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, xa_mark_t filter) { XA_STATE(xas, xa, *indexp); void *entry; rcu_read_lock(); do { entry = xas_find_marked(&xas, ULONG_MAX, filter); if (xa_is_zero(entry)) break; } while (xas_retry(&xas, entry)); rcu_read_unlock(); if (entry) { *indexp = xas.xa_index; if (xa_is_zero(entry)) return NULL; return entry; } return XA_ERROR(-ENOENT); } #define xan_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ !xa_is_err(entry); \ (index)++, entry = xan_find_marked(xa, &(index), filter)) /* RCU hash table mapping netdevice pointers to struct ib_port_data */ static DEFINE_SPINLOCK(ndev_hash_lock); static DECLARE_HASHTABLE(ndev_hash, 5); static void free_netdevs(struct ib_device *ib_dev); static void ib_unregister_work(struct work_struct *work); static void __ib_unregister_device(struct ib_device *device); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); static void __ibdev_printk(const char *level, const struct ib_device *ibdev, struct va_format *vaf) { if (ibdev && ibdev->dev.parent) dev_printk_emit(level[1] - '0', ibdev->dev.parent, "%s %s %s: %pV", dev_driver_string(ibdev->dev.parent), dev_name(ibdev->dev.parent), dev_name(&ibdev->dev), vaf); else if (ibdev) printk("%s%s: %pV", level, dev_name(&ibdev->dev), vaf); else printk("%s(NULL ib_device): %pV", level, vaf); } void ibdev_printk(const char *level, const struct ib_device *ibdev, const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; __ibdev_printk(level, ibdev, &vaf); va_end(args); } EXPORT_SYMBOL(ibdev_printk); #define define_ibdev_printk_level(func, level) \ void func(const struct ib_device *ibdev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __ibdev_printk(level, ibdev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); define_ibdev_printk_level(ibdev_alert, KERN_ALERT); define_ibdev_printk_level(ibdev_crit, KERN_CRIT); define_ibdev_printk_level(ibdev_err, KERN_ERR); define_ibdev_printk_level(ibdev_warn, KERN_WARNING); define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); define_ibdev_printk_level(ibdev_info, KERN_INFO); static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net); /* Pointer to the RCU head at the start of the ib_port_data array */ struct ib_port_data_rcu { struct rcu_head rcu_head; struct ib_port_data pdata[]; }; static void ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), IB_MANDATORY_FUNC(reg_user_mr), IB_MANDATORY_FUNC(dereg_mr), IB_MANDATORY_FUNC(get_port_immutable) }; int i; device->kverbs_provider = true; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) &device->ops + mandatory_table[i].offset)) { device->kverbs_provider = false; break; } } } /* * Caller must perform ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. */ struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) { struct ib_device *device; down_read(&devices_rwsem); device = xa_load(&devices, index); if (device) { if (!rdma_dev_access_netns(device, net)) { device = NULL; goto out; } if (!ib_device_try_get(device)) device = NULL; } out: up_read(&devices_rwsem); return device; } /** * ib_device_put - Release IB device reference * @device: device whose reference to be released * * ib_device_put() releases reference to the IB device to allow it to be * unregistered and eventually free. */ void ib_device_put(struct ib_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->unreg_completion); } EXPORT_SYMBOL(ib_device_put); static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; unsigned long index; xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; } /** * ib_device_get_by_name - Find an IB device by name * @name: The name to look for * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device by its name. The caller must call * ib_device_put() on the returned pointer. */ struct ib_device *ib_device_get_by_name(const char *name, enum rdma_driver_id driver_id) { struct ib_device *device; down_read(&devices_rwsem); device = __ib_device_get_by_name(name); if (device && driver_id != RDMA_DRIVER_UNKNOWN && device->ops.driver_id != driver_id) device = NULL; if (device) { if (!ib_device_try_get(device)) device = NULL; } up_read(&devices_rwsem); return device; } EXPORT_SYMBOL(ib_device_get_by_name); static int rename_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; int ret = 0; mutex_lock(&device->compat_devs_mutex); xa_for_each (&device->compat_devs, index, cdev) { ret = device_rename(&cdev->dev, dev_name(&device->dev)); if (ret) { dev_warn(&cdev->dev, "Fail to rename compatdev to new name %s\n", dev_name(&device->dev)); break; } } mutex_unlock(&device->compat_devs_mutex); return ret; } int ib_device_rename(struct ib_device *ibdev, const char *name) { unsigned long index; void *client_data; int ret; down_write(&devices_rwsem); if (!strcmp(name, dev_name(&ibdev->dev))) { up_write(&devices_rwsem); return 0; } if (__ib_device_get_by_name(name)) { up_write(&devices_rwsem); return -EEXIST; } ret = device_rename(&ibdev->dev, name); if (ret) { up_write(&devices_rwsem); return ret; } strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); ret = rename_compat_devs(ibdev); downgrade_write(&devices_rwsem); down_read(&ibdev->client_data_rwsem); xan_for_each_marked(&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->rename) continue; client->rename(ibdev, client_data); } up_read(&ibdev->client_data_rwsem); up_read(&devices_rwsem); return 0; } int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) { if (use_dim > 1) return -EINVAL; ibdev->use_cq_dim = use_dim; return 0; } static int alloc_name(struct ib_device *ibdev, const char *name) { struct ib_device *device; unsigned long index; struct ida inuse; int rc; int i; lockdep_assert_held_write(&devices_rwsem); ida_init(&inuse); xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; if (i < 0 || i >= INT_MAX) continue; snprintf(buf, sizeof buf, name, i); if (strcmp(buf, dev_name(&device->dev)) != 0) continue; rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); if (rc < 0) goto out; } rc = ida_alloc(&inuse, GFP_KERNEL); if (rc < 0) goto out; rc = dev_set_name(&ibdev->dev, name, rc); out: ida_destroy(&inuse); return rc; } static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); free_netdevs(dev); WARN_ON(refcount_read(&dev->refcount)); if (dev->hw_stats_data) ib_device_release_hw_stats(dev->hw_stats_data); if (dev->port_data) { ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); rdma_counter_release(dev); kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, pdata[0]), rcu_head); } mutex_destroy(&dev->unregistration_lock); mutex_destroy(&dev->compat_devs_mutex); xa_destroy(&dev->compat_devs); xa_destroy(&dev->client_data); kfree_rcu(dev, rcu_head); } static int ib_device_uevent(const struct device *device, struct kobj_uevent_env *env) { if (add_uevent_var(env, "NAME=%s", dev_name(device))) return -ENOMEM; /* * It would be nice to pass the node GUID with the event... */ return 0; } static const void *net_namespace(const struct device *d) { const struct ib_core_device *coredev = container_of(d, struct ib_core_device, dev); return read_pnet(&coredev->rdma_net); } static struct class ib_class = { .name = "infiniband", .dev_release = ib_device_release, .dev_uevent = ib_device_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, }; static void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev, struct net *net) { /* This BUILD_BUG_ON is intended to catch layout change * of union of ib_core_device and device. * dev must be the first element as ib_core and providers * driver uses it. Adding anything in ib_core_device before * device will break this assumption. */ BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != offsetof(struct ib_device, dev)); coredev->dev.class = &ib_class; coredev->dev.groups = dev->groups; device_initialize(&coredev->dev); coredev->owner = dev; INIT_LIST_HEAD(&coredev->port_list); write_pnet(&coredev->rdma_net, net); } /** * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ struct ib_device *_ib_alloc_device(size_t size) { struct ib_device *device; unsigned int i; if (WARN_ON(size < sizeof(struct ib_device))) return NULL; device = kzalloc(size, GFP_KERNEL); if (!device) return NULL; if (rdma_restrack_init(device)) { kfree(device); return NULL; } rdma_init_coredev(&device->coredev, device, &init_net); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->qp_open_list_lock); init_rwsem(&device->event_handler_rwsem); mutex_init(&device->unregistration_lock); /* * client_data needs to be alloc because we don't want our mark to be * destroyed if the user stores NULL in the client data. */ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); init_rwsem(&device->client_data_rwsem); xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); mutex_init(&device->compat_devs_mutex); init_completion(&device->unreg_completion); INIT_WORK(&device->unregistration_work, ib_unregister_work); spin_lock_init(&device->cq_pools_lock); for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) INIT_LIST_HEAD(&device->cq_pools[i]); rwlock_init(&device->cache_lock); device->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); return device; } EXPORT_SYMBOL(_ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). */ void ib_dealloc_device(struct ib_device *device) { if (device->ops.dealloc_driver) device->ops.dealloc_driver(device); /* * ib_unregister_driver() requires all devices to remain in the xarray * while their ops are callable. The last op we call is dealloc_driver * above. This is needed to create a fence on op callbacks prior to * allowing the driver module to unload. */ down_write(&devices_rwsem); if (xa_load(&devices, device->index) == device) xa_erase(&devices, device->index); up_write(&devices_rwsem); /* Expedite releasing netdev references */ free_netdevs(device); WARN_ON(!xa_empty(&device->compat_devs)); WARN_ON(!xa_empty(&device->client_data)); WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); /* Balances with device_initialize */ put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); /* * add_client_context() and remove_client_context() must be safe against * parallel calls on the same device - registration/unregistration of both the * device and client can be occurring in parallel. * * The routines need to be a fence, any caller must not return until the add * or remove is fully completed. */ static int add_client_context(struct ib_device *device, struct ib_client *client) { int ret = 0; if (!device->kverbs_provider && !client->no_kverbs_req) return 0; down_write(&device->client_data_rwsem); /* * So long as the client is registered hold both the client and device * unregistration locks. */ if (!refcount_inc_not_zero(&client->uses)) goto out_unlock; refcount_inc(&device->refcount); /* * Another caller to add_client_context got here first and has already * completely initialized context. */ if (xa_get_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED)) goto out; ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, GFP_KERNEL)); if (ret) goto out; downgrade_write(&device->client_data_rwsem); if (client->add) { if (client->add(device)) { /* * If a client fails to add then the error code is * ignored, but we won't call any more ops on this * client. */ xa_erase(&device->client_data, client->client_id); up_read(&device->client_data_rwsem); ib_device_put(device); ib_client_put(client); return 0; } } /* Readers shall not see a client until add has been completed */ xa_set_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED); up_read(&device->client_data_rwsem); return 0; out: ib_device_put(device); ib_client_put(client); out_unlock: up_write(&device->client_data_rwsem); return ret; } static void remove_client_context(struct ib_device *device, unsigned int client_id) { struct ib_client *client; void *client_data; down_write(&device->client_data_rwsem); if (!xa_get_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED)) { up_write(&device->client_data_rwsem); return; } client_data = xa_load(&device->client_data, client_id); xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); client = xa_load(&clients, client_id); up_write(&device->client_data_rwsem); /* * Notice we cannot be holding any exclusive locks when calling the * remove callback as the remove callback can recurse back into any * public functions in this module and thus try for any locks those * functions take. * * For this reason clients and drivers should not call the * unregistration functions will holdling any locks. */ if (client->remove) client->remove(device, client_data); xa_erase(&device->client_data, client_id); ib_device_put(device); ib_client_put(client); } static int alloc_port_data(struct ib_device *device) { struct ib_port_data_rcu *pdata_rcu; u32 port; if (device->port_data) return 0; /* This can only be called once the physical port range is defined */ if (WARN_ON(!device->phys_port_cnt)) return -EINVAL; /* Reserve U32_MAX so the logic to go over all the ports is sane */ if (WARN_ON(device->phys_port_cnt == U32_MAX)) return -EINVAL; /* * device->port_data is indexed directly by the port number to make * access to this data as efficient as possible. * * Therefore port_data is declared as a 1 based array with potential * empty slots at the beginning. */ pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, size_add(rdma_end_port(device), 1)), GFP_KERNEL); if (!pdata_rcu) return -ENOMEM; /* * The rcu_head is put in front of the port data array and the stored * pointer is adjusted since we never need to see that member until * kfree_rcu. */ device->port_data = pdata_rcu->pdata; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; pdata->ib_dev = device; spin_lock_init(&pdata->pkey_list_lock); INIT_LIST_HEAD(&pdata->pkey_list); spin_lock_init(&pdata->netdev_lock); INIT_HLIST_NODE(&pdata->ndev_hash_link); } return 0; } static int verify_immutable(const struct ib_device *dev, u32 port) { return WARN_ON(!rdma_cap_ib_mad(dev, port) && rdma_max_mad_size(dev, port) != 0); } static int setup_port_data(struct ib_device *device) { u32 port; int ret; ret = alloc_port_data(device); if (ret) return ret; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; ret = device->ops.get_port_immutable(device, port, &pdata->immutable); if (ret) return ret; if (verify_immutable(device, port)) return -EINVAL; } return 0; } /** * ib_port_immutable_read() - Read rdma port's immutable data * @dev: IB device * @port: port number whose immutable data to read. It starts with index 1 and * valid upto including rdma_end_port(). */ const struct ib_port_immutable* ib_port_immutable_read(struct ib_device *dev, unsigned int port) { WARN_ON(!rdma_is_port_valid(dev, port)); return &dev->port_data[port].immutable; } EXPORT_SYMBOL(ib_port_immutable_read); void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->ops.get_dev_fw_str) dev->ops.get_dev_fw_str(dev, str); else str[0] = '\0'; } EXPORT_SYMBOL(ib_get_device_fw_str); static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned int i; rdma_for_each_port (dev, i) { u64 sp; ib_get_cached_subnet_prefix(dev, i, &sp); ib_security_cache_change(dev, i, sp); } } up_read(&devices_rwsem); } static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data) { if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; schedule_work(&ib_policy_change_work); ib_mad_agent_security_change(); return NOTIFY_OK; } static void compatdev_release(struct device *dev) { struct ib_core_device *cdev = container_of(dev, struct ib_core_device, dev); kfree(cdev); } static int add_one_compat_dev(struct ib_device *device, struct rdma_dev_net *rnet) { struct ib_core_device *cdev; int ret; lockdep_assert_held(&rdma_nets_rwsem); if (!ib_devices_shared_netns) return 0; /* * Create and add compat device in all namespaces other than where it * is currently bound to. */ if (net_eq(read_pnet(&rnet->net), read_pnet(&device->coredev.rdma_net))) return 0; /* * The first of init_net() or ib_register_device() to take the * compat_devs_mutex wins and gets to add the device. Others will wait * for completion here. */ mutex_lock(&device->compat_devs_mutex); cdev = xa_load(&device->compat_devs, rnet->id); if (cdev) { ret = 0; goto done; } ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); if (ret) goto done; cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) { ret = -ENOMEM; goto cdev_err; } cdev->dev.parent = device->dev.parent; rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); cdev->dev.release = compatdev_release; ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); if (ret) goto add_err; ret = device_add(&cdev->dev); if (ret) goto add_err; ret = ib_setup_port_attrs(cdev); if (ret) goto port_err; ret = xa_err(xa_store(&device->compat_devs, rnet->id, cdev, GFP_KERNEL)); if (ret) goto insert_err; mutex_unlock(&device->compat_devs_mutex); return 0; insert_err: ib_free_port_attrs(cdev); port_err: device_del(&cdev->dev); add_err: put_device(&cdev->dev); cdev_err: xa_release(&device->compat_devs, rnet->id); done: mutex_unlock(&device->compat_devs_mutex); return ret; } static void remove_one_compat_dev(struct ib_device *device, u32 id) { struct ib_core_device *cdev; mutex_lock(&device->compat_devs_mutex); cdev = xa_erase(&device->compat_devs, id); mutex_unlock(&device->compat_devs_mutex); if (cdev) { ib_free_port_attrs(cdev); device_del(&cdev->dev); put_device(&cdev->dev); } } static void remove_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; xa_for_each (&device->compat_devs, index, cdev) remove_one_compat_dev(device, index); } static int add_compat_devs(struct ib_device *device) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; lockdep_assert_held(&devices_rwsem); down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, index, rnet) { ret = add_one_compat_dev(device, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); return ret; } static void remove_all_compat_devs(void) { struct ib_compat_device *cdev; struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { unsigned long c_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&dev->compat_devs, c_index, cdev) remove_one_compat_dev(dev, c_index); up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); } static int add_all_compat_devs(void) { struct rdma_dev_net *rnet; struct ib_device *dev; unsigned long index; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned long net_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, net_index, rnet) { ret = add_one_compat_dev(dev, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); if (ret) remove_all_compat_devs(); return ret; } int rdma_compatdev_set(u8 enable) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; down_write(&rdma_nets_rwsem); if (ib_devices_shared_netns == enable) { up_write(&rdma_nets_rwsem); return 0; } /* enable/disable of compat devices is not supported * when more than default init_net exists. */ xa_for_each (&rdma_nets, index, rnet) { ret++; break; } if (!ret) ib_devices_shared_netns = enable; up_write(&rdma_nets_rwsem); if (ret) return -EBUSY; if (enable) ret = add_all_compat_devs(); else remove_all_compat_devs(); return ret; } static void rdma_dev_exit_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); struct ib_device *dev; unsigned long index; int ret; down_write(&rdma_nets_rwsem); /* * Prevent the ID from being re-used and hide the id from xa_for_each. */ ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); WARN_ON(ret); up_write(&rdma_nets_rwsem); down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { get_device(&dev->dev); /* * Release the devices_rwsem so that pontentially blocking * device_del, doesn't hold the devices_rwsem for too long. */ up_read(&devices_rwsem); remove_one_compat_dev(dev, rnet->id); /* * If the real device is in the NS then move it back to init. */ rdma_dev_change_netns(dev, net, &init_net); put_device(&dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); rdma_nl_net_exit(rnet); xa_erase(&rdma_nets, rnet->id); } static __net_init int rdma_dev_init_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); unsigned long index; struct ib_device *dev; int ret; write_pnet(&rnet->net, net); ret = rdma_nl_net_init(rnet); if (ret) return ret; /* No need to create any compat devices in default init_net. */ if (net_eq(net, &init_net)) return 0; ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); if (ret) { rdma_nl_net_exit(rnet); return ret; } down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { /* Hold nets_rwsem so that netlink command cannot change * system configuration for device sharing mode. */ down_read(&rdma_nets_rwsem); ret = add_one_compat_dev(dev, rnet); up_read(&rdma_nets_rwsem); if (ret) break; } up_read(&devices_rwsem); if (ret) rdma_dev_exit_net(net); return ret; } /* * Assign the unique string device name and the unique device index. This is * undone by ib_dealloc_device. */ static int assign_name(struct ib_device *device, const char *name) { static u32 last_id; int ret; down_write(&devices_rwsem); /* Assign a unique name to the device */ if (strchr(name, '%')) ret = alloc_name(device, name); else ret = dev_set_name(&device->dev, name); if (ret) goto out; if (__ib_device_get_by_name(dev_name(&device->dev))) { ret = -ENFILE; goto out; } strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, &last_id, GFP_KERNEL); if (ret > 0) ret = 0; out: up_write(&devices_rwsem); return ret; } /* * setup_device() allocates memory and sets up data that requires calling the * device ops, this is the only reason these actions are not done during * ib_alloc_device. It is undone by ib_dealloc_device(). */ static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; ib_device_check_mandatory(device); ret = setup_port_data(device); if (ret) { dev_warn(&device->dev, "Couldn't create per-port data\n"); return ret; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->ops.query_device(device, &device->attrs, &uhw); if (ret) { dev_warn(&device->dev, "Couldn't query the device attributes\n"); return ret; } return 0; } static void disable_device(struct ib_device *device) { u32 cid; WARN_ON(!refcount_read(&device->refcount)); down_write(&devices_rwsem); xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); up_write(&devices_rwsem); /* * Remove clients in LIFO order, see assign_client_id. This could be * more efficient if xarray learns to reverse iterate. Since no new * clients can be added to this ib_device past this point we only need * the maximum possible client_id value here. */ down_read(&clients_rwsem); cid = highest_client_id; up_read(&clients_rwsem); while (cid) { cid--; remove_client_context(device, cid); } ib_cq_pool_cleanup(device); /* Pairs with refcount_set in enable_device */ ib_device_put(device); wait_for_completion(&device->unreg_completion); /* * compat devices must be removed after device refcount drops to zero. * Otherwise init_net() may add more compatdevs after removing compat * devices and before device is disabled. */ remove_compat_devs(device); } /* * An enabled device is visible to all clients and to all the public facing * APIs that return a device pointer. This always returns with a new get, even * if it fails. */ static int enable_device_and_get(struct ib_device *device) { struct ib_client *client; unsigned long index; int ret = 0; /* * One ref belongs to the xa and the other belongs to this * thread. This is needed to guard against parallel unregistration. */ refcount_set(&device->refcount, 2); down_write(&devices_rwsem); xa_set_mark(&devices, device->index, DEVICE_REGISTERED); /* * By using downgrade_write() we ensure that no other thread can clear * DEVICE_REGISTERED while we are completing the client setup. */ downgrade_write(&devices_rwsem); if (device->ops.enable_driver) { ret = device->ops.enable_driver(device); if (ret) goto out; } down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { ret = add_client_context(device, client); if (ret) break; } up_read(&clients_rwsem); if (!ret) ret = add_compat_devs(device); out: up_read(&devices_rwsem); return ret; } static void prevent_dealloc_device(struct ib_device *ib_dev) { } /** * ib_register_device - Register an IB device with IB core * @device: Device to register * @name: unique string device name. This may include a '%' which will * cause a unique index to be added to the passed device name. * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB * device will be used. In this case the caller should fully * setup the ibdev for DMA. This usually means using dma_virt_ops. * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). * * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() * asynchronously then the device pointer may become freed as soon as this * function returns. */ int ib_register_device(struct ib_device *device, const char *name, struct device *dma_device) { int ret; ret = assign_name(device, name); if (ret) return ret; /* * If the caller does not provide a DMA capable device then the IB core * will set up ib_sge and scatterlist structures that stash the kernel * virtual address into the address field. */ WARN_ON(dma_device && !dma_device->dma_parms); device->dma_device = dma_device; ret = setup_device(device); if (ret) return ret; ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); return ret; } device->groups[0] = &ib_dev_attr_group; device->groups[1] = device->ops.device_group; ret = ib_setup_device_attrs(device); if (ret) goto cache_cleanup; ib_device_register_rdmacg(device); rdma_counter_init(device); /* * Ensure that ADD uevent is not fired because it * is too early amd device is not initialized yet. */ dev_set_uevent_suppress(&device->dev, true); ret = device_add(&device->dev); if (ret) goto cg_cleanup; ret = ib_setup_port_attrs(&device->coredev); if (ret) { dev_warn(&device->dev, "Couldn't register device with driver model\n"); goto dev_cleanup; } ret = enable_device_and_get(device); if (ret) { void (*dealloc_fn)(struct ib_device *); /* * If we hit this error flow then we don't want to * automatically dealloc the device since the caller is * expected to call ib_dealloc_device() after * ib_register_device() fails. This is tricky due to the * possibility for a parallel unregistration along with this * error flow. Since we have a refcount here we know any * parallel flow is stopped in disable_device and will see the * special dealloc_driver pointer, causing the responsibility to * ib_dealloc_device() to revert back to this thread. */ dealloc_fn = device->ops.dealloc_driver; device->ops.dealloc_driver = prevent_dealloc_device; ib_device_put(device); __ib_unregister_device(device); device->ops.dealloc_driver = dealloc_fn; dev_set_uevent_suppress(&device->dev, false); return ret; } dev_set_uevent_suppress(&device->dev, false); /* Mark for userspace that device is ready */ kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_put(device); return 0; dev_cleanup: device_del(&device->dev); cg_cleanup: dev_set_uevent_suppress(&device->dev, false); ib_device_unregister_rdmacg(device); cache_cleanup: ib_cache_cleanup_one(device); return ret; } EXPORT_SYMBOL(ib_register_device); /* Callers must hold a get on the device. */ static void __ib_unregister_device(struct ib_device *ib_dev) { /* * We have a registration lock so that all the calls to unregister are * fully fenced, once any unregister returns the device is truely * unregistered even if multiple callers are unregistering it at the * same time. This also interacts with the registration flow and * provides sane semantics if register and unregister are racing. */ mutex_lock(&ib_dev->unregistration_lock); if (!refcount_read(&ib_dev->refcount)) goto out; disable_device(ib_dev); /* Expedite removing unregistered pointers from the hash table */ free_netdevs(ib_dev); ib_free_port_attrs(&ib_dev->coredev); device_del(&ib_dev->dev); ib_device_unregister_rdmacg(ib_dev); ib_cache_cleanup_one(ib_dev); /* * Drivers using the new flow may not call ib_dealloc_device except * in error unwind prior to registration success. */ if (ib_dev->ops.dealloc_driver && ib_dev->ops.dealloc_driver != prevent_dealloc_device) { WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); ib_dealloc_device(ib_dev); } out: mutex_unlock(&ib_dev->unregistration_lock); } /** * ib_unregister_device - Unregister an IB device * @ib_dev: The device to unregister * * Unregister an IB device. All clients will receive a remove callback. * * Callers should call this routine only once, and protect against races with * registration. Typically it should only be called as part of a remove * callback in an implementation of driver core's struct device_driver and * related. * * If ops.dealloc_driver is used then ib_dev will be freed upon return from * this function. */ void ib_unregister_device(struct ib_device *ib_dev) { get_device(&ib_dev->dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device); /** * ib_unregister_device_and_put - Unregister a device while holding a 'get' * @ib_dev: The device to unregister * * This is the same as ib_unregister_device(), except it includes an internal * ib_device_put() that should match a 'get' obtained by the caller. * * It is safe to call this routine concurrently from multiple threads while * holding the 'get'. When the function returns the device is fully * unregistered. * * Drivers using this flow MUST use the driver_unregister callback to clean up * their resources associated with the device and dealloc it. */ void ib_unregister_device_and_put(struct ib_device *ib_dev) { WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); ib_device_put(ib_dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_and_put); /** * ib_unregister_driver - Unregister all IB devices for a driver * @driver_id: The driver to unregister * * This implements a fence for device unregistration. It only returns once all * devices associated with the driver_id have fully completed their * unregistration and returned from ib_unregister_device*(). * * If device's are not yet unregistered it goes ahead and starts unregistering * them. * * This does not block creation of new devices with the given driver_id, that * is the responsibility of the caller. */ void ib_unregister_driver(enum rdma_driver_id driver_id) { struct ib_device *ib_dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, ib_dev) { if (ib_dev->ops.driver_id != driver_id) continue; get_device(&ib_dev->dev); up_read(&devices_rwsem); WARN_ON(!ib_dev->ops.dealloc_driver); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); } EXPORT_SYMBOL(ib_unregister_driver); static void ib_unregister_work(struct work_struct *work) { struct ib_device *ib_dev = container_of(work, struct ib_device, unregistration_work); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } /** * ib_unregister_device_queued - Unregister a device using a work queue * @ib_dev: The device to unregister * * This schedules an asynchronous unregistration using a WQ for the device. A * driver should use this to avoid holding locks while doing unregistration, * such as holding the RTNL lock. * * Drivers using this API must use ib_unregister_driver before module unload * to ensure that all scheduled unregistrations have completed. */ void ib_unregister_device_queued(struct ib_device *ib_dev) { WARN_ON(!refcount_read(&ib_dev->refcount)); WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_queued); /* * The caller must pass in a device that has the kref held and the refcount * released. If the device is in cur_net and still registered then it is moved * into net. */ static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net) { int ret2 = -EINVAL; int ret; mutex_lock(&device->unregistration_lock); /* * If a device not under ib_device_get() or if the unregistration_lock * is not held, the namespace can be changed, or it can be unregistered. * Check again under the lock. */ if (refcount_read(&device->refcount) == 0 || !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { ret = -ENODEV; goto out; } kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); disable_device(device); /* * At this point no one can be using the device, so it is safe to * change the namespace. */ write_pnet(&device->coredev.rdma_net, net); down_read(&devices_rwsem); /* * Currently rdma devices are system wide unique. So the device name * is guaranteed free in the new namespace. Publish the new namespace * at the sysfs level. */ ret = device_rename(&device->dev, dev_name(&device->dev)); up_read(&devices_rwsem); if (ret) { dev_warn(&device->dev, "%s: Couldn't rename device after namespace change\n", __func__); /* Try and put things back and re-enable the device */ write_pnet(&device->coredev.rdma_net, cur_net); } ret2 = enable_device_and_get(device); if (ret2) { /* * This shouldn't really happen, but if it does, let the user * retry at later point. So don't disable the device. */ dev_warn(&device->dev, "%s: Couldn't re-enable device after namespace change\n", __func__); } kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_put(device); out: mutex_unlock(&device->unregistration_lock); if (ret) return ret; return ret2; } int ib_device_set_netns_put(struct sk_buff *skb, struct ib_device *dev, u32 ns_fd) { struct net *net; int ret; net = get_net_ns_by_fd(ns_fd); if (IS_ERR(net)) { ret = PTR_ERR(net); goto net_err; } if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; goto ns_err; } /* * All the ib_clients, including uverbs, are reset when the namespace is * changed and this cannot be blocked waiting for userspace to do * something, so disassociation is mandatory. */ if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { ret = -EOPNOTSUPP; goto ns_err; } get_device(&dev->dev); ib_device_put(dev); ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); put_device(&dev->dev); put_net(net); return ret; ns_err: put_net(net); net_err: ib_device_put(dev); return ret; } static struct pernet_operations rdma_dev_net_ops = { .init = rdma_dev_init_net, .exit = rdma_dev_exit_net, .id = &rdma_dev_net_id, .size = sizeof(struct rdma_dev_net), }; static int assign_client_id(struct ib_client *client) { int ret; down_write(&clients_rwsem); /* * The add/remove callbacks must be called in FIFO/LIFO order. To * achieve this we assign client_ids so they are sorted in * registration order. */ client->client_id = highest_client_id; ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); if (ret) goto out; highest_client_id++; xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); out: up_write(&clients_rwsem); return ret; } static void remove_client_id(struct ib_client *client) { down_write(&clients_rwsem); xa_erase(&clients, client->client_id); for (; highest_client_id; highest_client_id--) if (xa_load(&clients, highest_client_id - 1)) break; up_write(&clients_rwsem); } /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; unsigned long index; int ret; refcount_set(&client->uses, 1); init_completion(&client->uses_zero); ret = assign_client_id(client); if (ret) return ret; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { ret = add_client_context(device, client); if (ret) { up_read(&devices_rwsem); ib_unregister_client(client); return ret; } } up_read(&devices_rwsem); return 0; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. * * This is a full fence, once it returns no client callbacks will be called, * or are running in another thread. */ void ib_unregister_client(struct ib_client *client) { struct ib_device *device; unsigned long index; down_write(&clients_rwsem); ib_client_put(client); xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&clients_rwsem); /* We do not want to have locks while calling client->remove() */ rcu_read_lock(); xa_for_each (&devices, index, device) { if (!ib_device_try_get(device)) continue; rcu_read_unlock(); remove_client_context(device, client->client_id); ib_device_put(device); rcu_read_lock(); } rcu_read_unlock(); /* * remove_client_context() is not a fence, it can return even though a * removal is ongoing. Wait until all removals are completed. */ wait_for_completion(&client->uses_zero); remove_client_id(client); } EXPORT_SYMBOL(ib_unregister_client); static int __ib_get_global_client_nl_info(const char *client_name, struct ib_client_nl_info *res) { struct ib_client *client; unsigned long index; int ret = -ENOENT; down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { if (strcmp(client->name, client_name) != 0) continue; if (!client->get_global_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_global_nl_info(res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&clients_rwsem); return ret; } static int __ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { unsigned long index; void *client_data; int ret = -ENOENT; down_read(&ibdev->client_data_rwsem); xan_for_each_marked (&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || strcmp(client->name, client_name) != 0) continue; if (!client->get_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_nl_info(ibdev, client_data, res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; /* * The cdev is guaranteed valid as long as we are inside the * client_data_rwsem as remove_one can't be called. Keep it * valid for the caller. */ if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&ibdev->client_data_rwsem); return ret; } /** * ib_get_client_nl_info - Fetch the nl_info from a client * @ibdev: IB device * @client_name: Name of the client * @res: Result of the query */ int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { int ret; if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); #ifdef CONFIG_MODULES if (ret == -ENOENT) { request_module("rdma-client-%s", client_name); if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); } #endif if (ret) { if (ret == -ENOENT) return -EOPNOTSUPP; return ret; } if (WARN_ON(!res->cdev)) return -EINVAL; return 0; } /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context data that can be retrieved with * ib_get_client_data(). This can only be called while the client is * registered to the device, once the ib_client remove() callback returns this * cannot be called. */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { void *rc; if (WARN_ON(IS_ERR(data))) data = NULL; rc = xa_store(&device->client_data, client->client_id, data, GFP_KERNEL); WARN_ON(xa_is_err(rc)); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback occurs in workqueue context. */ void ib_register_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). */ void ib_unregister_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_del(&event_handler->list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_unregister_event_handler); void ib_dispatch_event_clients(struct ib_event *event) { struct ib_event_handler *handler; down_read(&event->device->event_handler_rwsem); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); up_read(&event->device->event_handler_rwsem); } static int iw_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { struct in_device *inetdev; struct net_device *netdev; memset(port_attr, 0, sizeof(*port_attr)); netdev = ib_device_get_netdev(device, port_num); if (!netdev) return -ENODEV; port_attr->max_mtu = IB_MTU_4096; port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); if (!netif_carrier_ok(netdev)) { port_attr->state = IB_PORT_DOWN; port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; } else { rcu_read_lock(); inetdev = __in_dev_get_rcu(netdev); if (inetdev && inetdev->ifa_list) { port_attr->state = IB_PORT_ACTIVE; port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; } else { port_attr->state = IB_PORT_INIT; port_attr->phys_state = IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; } rcu_read_unlock(); } dev_put(netdev); return device->ops.query_port(device, port_num, port_attr); } static int __ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { int err; memset(port_attr, 0, sizeof(*port_attr)); err = device->ops.query_port(device, port_num, port_attr); if (err || port_attr->subnet_prefix) return err; if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) return 0; ib_get_cached_subnet_prefix(device, port_num, &port_attr->subnet_prefix); return 0; } /** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. */ int ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (rdma_protocol_iwarp(device, port_num)) return iw_query_port(device, port_num, port_attr); else return __ib_query_port(device, port_num, port_attr); } EXPORT_SYMBOL(ib_query_port); static void add_ndev_hash(struct ib_port_data *pdata) { unsigned long flags; might_sleep(); spin_lock_irqsave(&ndev_hash_lock, flags); if (hash_hashed(&pdata->ndev_hash_link)) { hash_del_rcu(&pdata->ndev_hash_link); spin_unlock_irqrestore(&ndev_hash_lock, flags); /* * We cannot do hash_add_rcu after a hash_del_rcu until the * grace period */ synchronize_rcu(); spin_lock_irqsave(&ndev_hash_lock, flags); } if (pdata->netdev) hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, (uintptr_t)pdata->netdev); spin_unlock_irqrestore(&ndev_hash_lock, flags); } /** * ib_device_set_netdev - Associate the ib_dev with an underlying net_device * @ib_dev: Device to modify * @ndev: net_device to affiliate, may be NULL * @port: IB port the net_device is connected to * * Drivers should use this to link the ib_device to a netdev so the netdev * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be * affiliated with any port. * * The caller must ensure that the given ndev is not unregistered or * unregistering, and that either the ib_device is unregistered or * ib_device_set_netdev() is called with NULL when the ndev sends a * NETDEV_UNREGISTER event. */ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, u32 port) { struct net_device *old_ndev; struct ib_port_data *pdata; unsigned long flags; int ret; /* * Drivers wish to call this before ib_register_driver, so we have to * setup the port data early. */ ret = alloc_port_data(ib_dev); if (ret) return ret; if (!rdma_is_port_valid(ib_dev, port)) return -EINVAL; pdata = &ib_dev->port_data[port]; spin_lock_irqsave(&pdata->netdev_lock, flags); old_ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (old_ndev == ndev) { spin_unlock_irqrestore(&pdata->netdev_lock, flags); return 0; } if (old_ndev) netdev_tracker_free(ndev, &pdata->netdev_tracker); if (ndev) netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); rcu_assign_pointer(pdata->netdev, ndev); spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); if (old_ndev) __dev_put(old_ndev); return 0; } EXPORT_SYMBOL(ib_device_set_netdev); static void free_netdevs(struct ib_device *ib_dev) { unsigned long flags; u32 port; if (!ib_dev->port_data) return; rdma_for_each_port (ib_dev, port) { struct ib_port_data *pdata = &ib_dev->port_data[port]; struct net_device *ndev; spin_lock_irqsave(&pdata->netdev_lock, flags); ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (ndev) { spin_lock(&ndev_hash_lock); hash_del_rcu(&pdata->ndev_hash_link); spin_unlock(&ndev_hash_lock); /* * If this is the last dev_put there is still a * synchronize_rcu before the netdev is kfreed, so we * can continue to rely on unlocked pointer * comparisons after the put */ rcu_assign_pointer(pdata->netdev, NULL); netdev_put(ndev, &pdata->netdev_tracker); } spin_unlock_irqrestore(&pdata->netdev_lock, flags); } } struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port) { struct ib_port_data *pdata; struct net_device *res; if (!rdma_is_port_valid(ib_dev, port)) return NULL; pdata = &ib_dev->port_data[port]; /* * New drivers should use ib_device_set_netdev() not the legacy * get_netdev(). */ if (ib_dev->ops.get_netdev) res = ib_dev->ops.get_netdev(ib_dev, port); else { spin_lock(&pdata->netdev_lock); res = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (res) dev_hold(res); spin_unlock(&pdata->netdev_lock); } /* * If we are starting to unregister expedite things by preventing * propagation of an unregistering netdev. */ if (res && res->reg_state != NETREG_REGISTERED) { dev_put(res); return NULL; } return res; } /** * ib_device_get_by_netdev - Find an IB device associated with a netdev * @ndev: netdev to locate * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device that is associated with a netdev via * ib_device_set_netdev(). The caller must call ib_device_put() on the * returned pointer. */ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id) { struct ib_device *res = NULL; struct ib_port_data *cur; rcu_read_lock(); hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, (uintptr_t)ndev) { if (rcu_access_pointer(cur->netdev) == ndev && (driver_id == RDMA_DRIVER_UNKNOWN || cur->ib_dev->ops.driver_id == driver_id) && ib_device_try_get(cur->ib_dev)) { res = cur->ib_dev; break; } } rcu_read_unlock(); return res; } EXPORT_SYMBOL(ib_device_get_by_netdev); /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all of the physical RoCE ports of ib_dev * which are related to netdevice and calls callback() on each * device for which filter() function returns non zero. */ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { u32 port; rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { struct net_device *idev = ib_device_get_netdev(ib_dev, port); if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); if (idev) dev_put(idev); } } /** * ib_enum_all_roce_netdevs - enumerate all RoCE devices * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all RoCE devices' physical ports which are related * to netdevices and calls callback() on each device for which * filter() function returns non zero. */ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); up_read(&devices_rwsem); } /* * ib_enum_all_devs - enumerate all ib_devices * @cb: Callback to call for each found ib_device * * Enumerates all ib_devices and calls callback() on each device. */ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb) { unsigned long index; struct ib_device *dev; unsigned int idx = 0; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) continue; ret = nldev_cb(dev, skb, cb, idx); if (ret) break; idx++; } up_read(&devices_rwsem); return ret; } /** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, u32 port_num, u16 index, u16 *pkey) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (!device->ops.query_pkey) return -EOPNOTSUPP; return device->ops.query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { if (!device->ops.modify_device) return -EOPNOTSUPP; return device->ops.modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. */ int ib_modify_port(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { int rc; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (device->ops.modify_port) rc = device->ops.modify_port(device, port_num, port_modify_mask, port_modify); else if (rdma_protocol_roce(device, port_num) && ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) rc = 0; else rc = -EOPNOTSUPP; return rc; } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. Its searches only for IB link layer. * @device: The device to query. * @gid: The GID value to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u32 *port_num, u16 *index) { union ib_gid tmp_gid; u32 port; int ret, i; rdma_for_each_port (device, port) { if (!rdma_protocol_ib(device, port)) continue; for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) continue; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; int partial_ix = -1; for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { /* if there is full-member pkey take it.*/ if (tmp_pkey & 0x8000) { *index = i; return 0; } if (partial_ix < 0) partial_ix = i; } } /*no full-member, if exists take the limited*/ if (partial_ix >= 0) { *index = partial_ix; return 0; } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); /** * ib_get_net_dev_by_params() - Return the appropriate net_dev * for a received CM request * @dev: An RDMA device on which the request has been received. * @port: Port number on the RDMA device. * @pkey: The Pkey the request came on. * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. * */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr) { struct net_device *net_dev = NULL; unsigned long index; void *client_data; if (!rdma_protocol_ib(dev, port)) return NULL; /* * Holding the read side guarantees that the client will not become * unregistered while we are calling get_net_dev_by_params() */ down_read(&dev->client_data_rwsem); xan_for_each_marked (&dev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->get_net_dev_by_params) continue; net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, addr, client_data); if (net_dev) break; } up_read(&dev->client_data_rwsem); return net_dev; } EXPORT_SYMBOL(ib_get_net_dev_by_params); void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) { struct ib_device_ops *dev_ops = &dev->ops; #define SET_DEVICE_OP(ptr, name) \ do { \ if (ops->name) \ if (!((ptr)->name)) \ (ptr)->name = ops->name; \ } while (0) #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && dev_ops->driver_id != ops->driver_id); dev_ops->driver_id = ops->driver_id; } if (ops->owner) { WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); dev_ops->owner = ops->owner; } if (ops->uverbs_abi_ver) dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; dev_ops->uverbs_no_driver_id_binding |= ops->uverbs_no_driver_id_binding; SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); SET_DEVICE_OP(dev_ops, alloc_mr); SET_DEVICE_OP(dev_ops, alloc_mr_integrity); SET_DEVICE_OP(dev_ops, alloc_mw); SET_DEVICE_OP(dev_ops, alloc_pd); SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); SET_DEVICE_OP(dev_ops, alloc_ucontext); SET_DEVICE_OP(dev_ops, alloc_xrcd); SET_DEVICE_OP(dev_ops, attach_mcast); SET_DEVICE_OP(dev_ops, check_mr_status); SET_DEVICE_OP(dev_ops, counter_alloc_stats); SET_DEVICE_OP(dev_ops, counter_bind_qp); SET_DEVICE_OP(dev_ops, counter_dealloc); SET_DEVICE_OP(dev_ops, counter_unbind_qp); SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); SET_DEVICE_OP(dev_ops, create_flow); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); SET_DEVICE_OP(dev_ops, create_srq); SET_DEVICE_OP(dev_ops, create_user_ah); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); SET_DEVICE_OP(dev_ops, dealloc_ucontext); SET_DEVICE_OP(dev_ops, dealloc_xrcd); SET_DEVICE_OP(dev_ops, del_gid); SET_DEVICE_OP(dev_ops, dereg_mr); SET_DEVICE_OP(dev_ops, destroy_ah); SET_DEVICE_OP(dev_ops, destroy_counters); SET_DEVICE_OP(dev_ops, destroy_cq); SET_DEVICE_OP(dev_ops, destroy_flow); SET_DEVICE_OP(dev_ops, destroy_flow_action); SET_DEVICE_OP(dev_ops, destroy_qp); SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); SET_DEVICE_OP(dev_ops, destroy_srq); SET_DEVICE_OP(dev_ops, destroy_wq); SET_DEVICE_OP(dev_ops, device_group); SET_DEVICE_OP(dev_ops, detach_mcast); SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); SET_DEVICE_OP(dev_ops, drain_sq); SET_DEVICE_OP(dev_ops, enable_driver); SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_mr_entry); SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_qp_entry); SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_srq_entry); SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); SET_DEVICE_OP(dev_ops, get_hw_stats); SET_DEVICE_OP(dev_ops, get_link_layer); SET_DEVICE_OP(dev_ops, get_netdev); SET_DEVICE_OP(dev_ops, get_numa_node); SET_DEVICE_OP(dev_ops, get_port_immutable); SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_guid); SET_DEVICE_OP(dev_ops, get_vf_stats); SET_DEVICE_OP(dev_ops, iw_accept); SET_DEVICE_OP(dev_ops, iw_add_ref); SET_DEVICE_OP(dev_ops, iw_connect); SET_DEVICE_OP(dev_ops, iw_create_listen); SET_DEVICE_OP(dev_ops, iw_destroy_listen); SET_DEVICE_OP(dev_ops, iw_get_qp); SET_DEVICE_OP(dev_ops, iw_reject); SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_mr_sg_pi); SET_DEVICE_OP(dev_ops, mmap); SET_DEVICE_OP(dev_ops, mmap_free); SET_DEVICE_OP(dev_ops, modify_ah); SET_DEVICE_OP(dev_ops, modify_cq); SET_DEVICE_OP(dev_ops, modify_device); SET_DEVICE_OP(dev_ops, modify_hw_stat); SET_DEVICE_OP(dev_ops, modify_port); SET_DEVICE_OP(dev_ops, modify_qp); SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); SET_DEVICE_OP(dev_ops, poll_cq); SET_DEVICE_OP(dev_ops, port_groups); SET_DEVICE_OP(dev_ops, post_recv); SET_DEVICE_OP(dev_ops, post_send); SET_DEVICE_OP(dev_ops, post_srq_recv); SET_DEVICE_OP(dev_ops, process_mad); SET_DEVICE_OP(dev_ops, query_ah); SET_DEVICE_OP(dev_ops, query_device); SET_DEVICE_OP(dev_ops, query_gid); SET_DEVICE_OP(dev_ops, query_pkey); SET_DEVICE_OP(dev_ops, query_port); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); SET_DEVICE_OP(dev_ops, query_ucontext); SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); SET_DEVICE_OP(dev_ops, read_counters); SET_DEVICE_OP(dev_ops, reg_dm_mr); SET_DEVICE_OP(dev_ops, reg_user_mr); SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); SET_DEVICE_OP(dev_ops, req_notify_cq); SET_DEVICE_OP(dev_ops, rereg_user_mr); SET_DEVICE_OP(dev_ops, resize_cq); SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); SET_OBJ_SIZE(dev_ops, ib_cq); SET_OBJ_SIZE(dev_ops, ib_mw); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_qp); SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); SET_OBJ_SIZE(dev_ops, ib_xrcd); } EXPORT_SYMBOL(ib_set_device_ops); #ifdef CONFIG_INFINIBAND_VIRT_DMA int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) { struct scatterlist *s; int i; for_each_sg(sg, s, nents, i) { sg_dma_address(s) = (uintptr_t)sg_virt(s); sg_dma_len(s) = s->length; } return nents; } EXPORT_SYMBOL(ib_dma_virt_map_sg); #endif /* CONFIG_INFINIBAND_VIRT_DMA */ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { [RDMA_NL_LS_OP_RESOLVE] = { .doit = ib_nl_handle_resolve_resp, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_SET_TIMEOUT] = { .doit = ib_nl_handle_set_timeout, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_IP_RESOLVE] = { .doit = ib_nl_handle_ip_res_resp, .flags = RDMA_NL_ADMIN_PERM, }, }; static int __init ib_core_init(void) { int ret = -ENOMEM; ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); if (!ib_unreg_wq) goto err; ib_comp_wq = alloc_workqueue("ib-comp-wq", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); if (!ib_comp_wq) goto err_unbound; ib_comp_unbound_wq = alloc_workqueue("ib-comp-unb-wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); if (!ib_comp_unbound_wq) goto err_comp; ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); goto err_comp_unbound; } rdma_nl_init(); ret = addr_init(); if (ret) { pr_warn("Couldn't init IB address resolution\n"); goto err_ibnl; } ret = ib_mad_init(); if (ret) { pr_warn("Couldn't init IB MAD\n"); goto err_addr; } ret = ib_sa_init(); if (ret) { pr_warn("Couldn't init SA\n"); goto err_mad; } ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. ret %d\n", ret); goto err_sa; } ret = register_pernet_device(&rdma_dev_net_ops); if (ret) { pr_warn("Couldn't init compat dev. ret %d\n", ret); goto err_compat; } nldev_init(); rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); ret = roce_gid_mgmt_init(); if (ret) { pr_warn("Couldn't init RoCE GID management\n"); goto err_parent; } return 0; err_parent: rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); err_compat: unregister_blocking_lsm_notifier(&ibdev_lsm_nb); err_sa: ib_sa_cleanup(); err_mad: ib_mad_cleanup(); err_addr: addr_cleanup(); err_ibnl: class_unregister(&ib_class); err_comp_unbound: destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); err_unbound: destroy_workqueue(ib_unreg_wq); err: destroy_workqueue(ib_wq); return ret; } static void __exit ib_core_cleanup(void) { roce_gid_mgmt_cleanup(); rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); unregister_blocking_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); rdma_nl_exit(); class_unregister(&ib_class); destroy_workqueue(ib_comp_unbound_wq); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); destroy_workqueue(ib_unreg_wq); WARN_ON(!xa_empty(&clients)); WARN_ON(!xa_empty(&devices)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); /* ib core relies on netdev stack to first register net_ns_type_operations * ns kobject type before ib_core initialization. */ fs_initcall(ib_core_init); module_exit(ib_core_cleanup); |
3 2194 2194 2194 2194 2194 2193 2194 2194 2194 2193 2192 2194 2193 2194 2193 1910 1908 1909 1891 1891 1891 1889 1867 1867 1867 446 446 1889 1890 1890 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 | /* * Ext4 orphan inode handling */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) { int i, j, start; struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; int ret = 0; bool found = false; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int looped = 0; /* * Find block with free orphan entry. Use CPU number for a naive hash * for a search start in the orphan file */ start = raw_smp_processor_id()*13 % oi->of_blocks; i = start; do { if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries) >= 0) { found = true; break; } if (++i >= oi->of_blocks) i = 0; } while (i != start); if (!found) { /* * For now we don't grow or shrink orphan file. We just use * whatever was allocated at mke2fs time. The additional * credits we would have to reserve for each orphan inode * operation just don't seem worth it. */ return -ENOSPC; } ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return ret; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); /* Find empty slot in a block */ j = 0; do { if (looped) { /* * Did we walk through the block several times without * finding free entry? It is theoretically possible * if entries get constantly allocated and freed or * if the block is corrupted. Avoid indefinite looping * and bail. We'll use orphan list instead. */ if (looped > 3) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return -ENOSPC; } cond_resched(); } while (bdata[j]) { if (++j >= inodes_per_ob) { j = 0; looped++; } } } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) != (__le32)0); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); } /* * ext4_orphan_add() links an unlinked or truncated inode into a list of * such inodes, starting at the superblock, in case we crash before the * file is closed/deleted, or in case the inode truncate spans multiple * transactions and the last transaction is not recovered after a crash. * * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). * * Orphan list manipulation functions must be called under i_rwsem unless * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_iloc iloc; int err = 0, rc; bool dirty = false; if (!sbi->s_journal || is_bad_inode(inode)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); /* * Inode orphaned in orphan file or in orphan list? */ if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || !list_empty(&EXT4_I(inode)->i_orphan)) return 0; /* * Orphan handling is only valid for files with data blocks * being truncated, or files being unlinked. Note that we either * hold i_rwsem, or the inode can not be referenced from outside, * so i_nlink should not be bumped due to race */ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); if (sbi->s_orphan_info.of_blocks) { err = ext4_orphan_file_add(handle, inode); /* * Fallback to normal orphan list of orphan file is * out of space */ if (err != -ENOSPC) return err; } BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; mutex_lock(&sbi->s_orphan_lock); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. */ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > (le32_to_cpu(sbi->s_es->s_inodes_count))) { /* Insert this inode at the head of the on-disk orphan list */ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); dirty = true; } list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); mutex_unlock(&sbi->s_orphan_lock); if (dirty) { err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; if (err) { /* * We have to remove inode from in-memory list if * addition to on disk orphan list failed. Stray orphan * list entries can cause panics at unmount time. */ mutex_lock(&sbi->s_orphan_lock); list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } } else brelse(iloc.bh); ext4_debug("superblock will point to %lu\n", inode->i_ino); ext4_debug("orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out: ext4_std_error(sb, err); return err; } static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) { struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; __le32 *bdata; int blk, off; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int ret = 0; if (!handle) goto out; blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob; off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob; if (WARN_ON_ONCE(blk >= oi->of_blocks)) goto out; ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) goto out; bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); bdata[off] = 0; atomic_inc(&oi->of_binfo[blk].ob_free_entries); ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); out: ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE); INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan); return ret; } /* * ext4_orphan_del() removes an unlinked or truncated inode from the list * of such inodes stored on disk, because it is finally being cleaned up. */ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 ino_next; struct ext4_iloc iloc; int err = 0; if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) return 0; if (handle) { /* Grab inode buffer early before taking global s_orphan_lock */ err = ext4_reserve_inode_write(handle, inode, &iloc); } mutex_lock(&sbi->s_orphan_lock); ext4_debug("remove inode %lu from orphan list\n", inode->i_ino); prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); /* If we're on an error path, we may not have a valid * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ if (!handle || err) { mutex_unlock(&sbi->s_orphan_lock); goto out_err; } ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { ext4_debug("superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); ext4_superblock_csum_set(inode->i_sb); unlock_buffer(sbi->s_sbh); mutex_unlock(&sbi->s_orphan_lock); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; ext4_debug("orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } NEXT_ORPHAN(i_prev) = ino_next; err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); mutex_unlock(&sbi->s_orphan_lock); } if (err) goto out_brelse; NEXT_ORPHAN(inode) = 0; err = ext4_mark_iloc_dirty(handle, inode, &iloc); out_err: ext4_std_error(inode->i_sb, err); return err; out_brelse: brelse(iloc.bh); goto out_err; } #ifdef CONFIG_QUOTA static int ext4_quota_on_mount(struct super_block *sb, int type) { return dquot_quota_on_mount(sb, rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type], lockdep_is_held(&sb->s_umount)), EXT4_SB(sb)->s_jquota_fmt, type); } #endif static void ext4_process_orphan(struct inode *inode, int *nr_truncates, int *nr_orphans) { struct super_block *sb = inode->i_sb; int ret; dquot_initialize(inode); if (inode->i_nlink) { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", __func__, inode->i_ino, inode->i_size); ext4_debug("truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ret = ext4_truncate(inode); if (ret) { /* * We need to clean up the in-core orphan list * manually if ext4_truncate() failed to get a * transaction handle. */ ext4_orphan_del(NULL, inode); ext4_std_error(inode->i_sb, ret); } inode_unlock(inode); (*nr_truncates)++; } else { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: deleting unreferenced inode %lu", __func__, inode->i_ino); ext4_debug("deleting unreferenced inode %lu\n", inode->i_ino); (*nr_orphans)++; } iput(inode); /* The delete magic happens here! */ } /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at * the superblock) which were deleted from all directories, but held open by * a process at the time of a crash. We walk the list and try to delete these * inodes at recovery time (only with a read-write filesystem). * * In order to keep the orphan inode chain consistent during traversal (in * case of crash during recovery), we link each inode into the superblock * orphan list_head and handle it the same way as an inode deletion during * normal operation (which journals the operations for us). * * We only do an iget() and an iput() on each inode, which is very safe if we * accidentally point at an in-use or already deleted inode. The worst that * can happen in this case is that we get a "bit already cleared" message from * ext4_free_inode(). The only reason we would point at a wrong inode is if * e2fsck was run on this filesystem, and it must have already done the orphan * inode cleanup for us, so we can safely abort without any further action. */ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; struct inode *inode; int i, j; #ifdef CONFIG_QUOTA int quota_update = 0; #endif __le32 *bdata; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!es->s_last_orphan && !oi->of_blocks) { ext4_debug("no orphan inodes to clean up\n"); return; } if (bdev_read_only(sb->s_bdev)) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, skipping orphan cleanup"); return; } /* Check if feature set would not allow a r/w mount */ if (!ext4_feature_set_ok(sb, 0)) { ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " "unknown ROCOMPAT features"); return; } if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list."); es->s_last_orphan = 0; } ext4_debug("Skipping orphan recovery on fs with errors.\n"); return; } if (s_flags & SB_RDONLY) { ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. */ if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { int ret = ext4_enable_quotas(sb); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on quotas: error %d", ret); } /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " "quota: type %d: error %d", i, ret); } } #endif while (es->s_last_orphan) { /* * We may have encountered an error during cleanup; if * so, skip the rest. */ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { ext4_debug("Skipping orphan recovery on fs with errors.\n"); es->s_last_orphan = 0; break; } inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); if (IS_ERR(inode)) { es->s_last_orphan = 0; break; } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } for (i = 0; i < oi->of_blocks; i++) { bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); for (j = 0; j < inodes_per_ob; j++) { if (!bdata[j]) continue; inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j])); if (IS_ERR(inode)) continue; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } } #define PLURAL(x) (x), ((x) == 1) ? "" : "s" if (nr_orphans) ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", PLURAL(nr_orphans)); if (nr_truncates) ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn off quotas if they were enabled for orphan cleanup */ if (quota_update) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } } #endif sb->s_flags = s_flags; /* Restore SB_RDONLY status */ } void ext4_release_orphan_info(struct super_block *sb) { int i; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; if (!oi->of_blocks) return; for (i = 0; i < oi->of_blocks; i++) brelse(oi->of_binfo[i].ob_bh); kfree(oi->of_binfo); } static struct ext4_orphan_block_tail *ext4_orphan_block_tail( struct super_block *sb, struct buffer_head *bh) { return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)); } static int ext4_orphan_file_block_csum_verify(struct super_block *sb, struct buffer_head *bh) { __u32 calculated; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); if (!ext4_has_metadata_csum(sb)) return 1; ot = ext4_orphan_block_tail(sb, bh); calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } /* This gets called only when checksumming is enabled */ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { struct super_block *sb = EXT4_TRIGGER(triggers)->sb; __u32 csum; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } int ext4_init_orphan_info(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct inode *inode; int i, j; int ret; int free; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_block_tail *ot; ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); if (!ext4_has_feature_orphan_file(sb)) return 0; inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_msg(sb, KERN_ERR, "get orphan inode failed"); return PTR_ERR(inode); } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), GFP_KERNEL); if (!oi->of_binfo) { ret = -ENOMEM; goto out_put; } for (i = 0; i < oi->of_blocks; i++) { oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0); if (IS_ERR(oi->of_binfo[i].ob_bh)) { ret = PTR_ERR(oi->of_binfo[i].ob_bh); goto out_free; } if (!oi->of_binfo[i].ob_bh) { ret = -EIO; goto out_free; } ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh); if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) { ext4_error(sb, "orphan file block %d: bad magic", i); ret = -EIO; goto out_free; } if (!ext4_orphan_file_block_csum_verify(sb, oi->of_binfo[i].ob_bh)) { ext4_error(sb, "orphan file block %d: bad checksum", i); ret = -EIO; goto out_free; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); free = 0; for (j = 0; j < inodes_per_ob; j++) if (bdata[j] == 0) free++; atomic_set(&oi->of_binfo[i].ob_free_entries, free); } iput(inode); return 0; out_free: for (i--; i >= 0; i--) brelse(oi->of_binfo[i].ob_bh); kfree(oi->of_binfo); out_put: iput(inode); return ret; } int ext4_orphan_file_empty(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int i; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!ext4_has_feature_orphan_file(sb)) return 1; for (i = 0; i < oi->of_blocks; i++) if (atomic_read(&oi->of_binfo[i].ob_free_entries) != inodes_per_ob) return 0; return 1; } |
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 | // SPDX-License-Identifier: GPL-2.0-or-later /* * UDPLITE An implementation of the UDP-Lite protocol (RFC 3828). * * Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk> * * Changes: * Fixes: */ #define pr_fmt(fmt) "UDPLite: " fmt #include <linux/export.h> #include <linux/proc_fs.h> #include "udp_impl.h" struct udp_table udplite_table __read_mostly; EXPORT_SYMBOL(udplite_table); /* Designate sk as UDP-Lite socket */ static int udplite_sk_init(struct sock *sk) { udp_init_sock(sk); pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, " "please contact the netdev mailing list\n"); return 0; } static int udplite_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); } static int udplite_err(struct sk_buff *skb, u32 info) { return __udp4_lib_err(skb, info, &udplite_table); } static const struct net_protocol udplite_protocol = { .handler = udplite_rcv, .err_handler = udplite_err, .no_policy = 1, }; struct proto udplite_prot = { .name = "UDP-Lite", .owner = THIS_MODULE, .close = udp_lib_close, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udplite_sk_init, .destroy = udp_destroy_sock, .setsockopt = udp_setsockopt, .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, .memory_allocated = &udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), .h.udp_table = &udplite_table, }; EXPORT_SYMBOL(udplite_prot); static struct inet_protosw udplite4_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_UDPLITE, .prot = &udplite_prot, .ops = &inet_dgram_ops, .flags = INET_PROTOSW_PERMANENT, }; #ifdef CONFIG_PROC_FS static struct udp_seq_afinfo udplite4_seq_afinfo = { .family = AF_INET, .udp_table = &udplite_table, }; static int __net_init udplite4_proc_init_net(struct net *net) { if (!proc_create_net_data("udplite", 0444, net->proc_net, &udp_seq_ops, sizeof(struct udp_iter_state), &udplite4_seq_afinfo)) return -ENOMEM; return 0; } static void __net_exit udplite4_proc_exit_net(struct net *net) { remove_proc_entry("udplite", net->proc_net); } static struct pernet_operations udplite4_net_ops = { .init = udplite4_proc_init_net, .exit = udplite4_proc_exit_net, }; static __init int udplite4_proc_init(void) { return register_pernet_subsys(&udplite4_net_ops); } #else static inline int udplite4_proc_init(void) { return 0; } #endif void __init udplite4_register(void) { udp_table_init(&udplite_table, "UDP-Lite"); if (proto_register(&udplite_prot, 1)) goto out_register_err; if (inet_add_protocol(&udplite_protocol, IPPROTO_UDPLITE) < 0) goto out_unregister_proto; inet_register_protosw(&udplite4_protosw); if (udplite4_proc_init()) pr_err("%s: Cannot register /proc!\n", __func__); return; out_unregister_proto: proto_unregister(&udplite_prot); out_register_err: pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__); } |
36 36 36 36 36 36 36 1 36 36 36 36 36 36 36 36 36 36 36 36 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 | // SPDX-License-Identifier: GPL-2.0-only /* * Scanning implementation * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright 2016-2017 Intel Deutschland GmbH * Copyright (C) 2018-2023 Intel Corporation */ #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/sch_generic.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/random.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "mesh.h" #define IEEE80211_PROBE_DELAY (HZ / 33) #define IEEE80211_CHANNEL_TIME (HZ / 33) #define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 9) void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss) { if (!bss) return; cfg80211_put_bss(local->hw.wiphy, container_of((void *)bss, struct cfg80211_bss, priv)); } static bool is_uapsd_supported(struct ieee802_11_elems *elems) { u8 qos_info; if (elems->wmm_info && elems->wmm_info_len == 7 && elems->wmm_info[5] == 1) qos_info = elems->wmm_info[6]; else if (elems->wmm_param && elems->wmm_param_len == 24 && elems->wmm_param[5] == 1) qos_info = elems->wmm_param[6]; else /* no valid wmm information or parameter element found */ return false; return qos_info & IEEE80211_WMM_IE_AP_QOSINFO_UAPSD; } struct inform_bss_update_data { struct ieee80211_rx_status *rx_status; bool beacon; }; void ieee80211_inform_bss(struct wiphy *wiphy, struct cfg80211_bss *cbss, const struct cfg80211_bss_ies *ies, void *data) { struct ieee80211_local *local = wiphy_priv(wiphy); struct inform_bss_update_data *update_data = data; struct ieee80211_bss *bss = (void *)cbss->priv; struct ieee80211_rx_status *rx_status; struct ieee802_11_elems *elems; int clen, srlen; /* This happens while joining an IBSS */ if (!update_data) return; elems = ieee802_11_parse_elems(ies->data, ies->len, false, NULL); if (!elems) return; rx_status = update_data->rx_status; if (update_data->beacon) bss->device_ts_beacon = rx_status->device_timestamp; else bss->device_ts_presp = rx_status->device_timestamp; if (elems->parse_error) { if (update_data->beacon) bss->corrupt_data |= IEEE80211_BSS_CORRUPT_BEACON; else bss->corrupt_data |= IEEE80211_BSS_CORRUPT_PROBE_RESP; } else { if (update_data->beacon) bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_BEACON; else bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_PROBE_RESP; } /* save the ERP value so that it is available at association time */ if (elems->erp_info && (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_ERP))) { bss->erp_value = elems->erp_info[0]; bss->has_erp_value = true; if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_ERP; } /* replace old supported rates if we get new values */ if (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_RATES)) { srlen = 0; if (elems->supp_rates) { clen = IEEE80211_MAX_SUPP_RATES; if (clen > elems->supp_rates_len) clen = elems->supp_rates_len; memcpy(bss->supp_rates, elems->supp_rates, clen); srlen += clen; } if (elems->ext_supp_rates) { clen = IEEE80211_MAX_SUPP_RATES - srlen; if (clen > elems->ext_supp_rates_len) clen = elems->ext_supp_rates_len; memcpy(bss->supp_rates + srlen, elems->ext_supp_rates, clen); srlen += clen; } if (srlen) { bss->supp_rates_len = srlen; if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_RATES; } } if (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_WMM)) { bss->wmm_used = elems->wmm_param || elems->wmm_info; bss->uapsd_supported = is_uapsd_supported(elems); if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_WMM; } if (update_data->beacon) { struct ieee80211_supported_band *sband = local->hw.wiphy->bands[rx_status->band]; if (!(rx_status->encoding == RX_ENC_HT) && !(rx_status->encoding == RX_ENC_VHT)) bss->beacon_rate = &sband->bitrates[rx_status->rate_idx]; } if (elems->vht_cap_elem) bss->vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); else bss->vht_cap_info = 0; kfree(elems); } struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_channel *channel) { bool beacon = ieee80211_is_beacon(mgmt->frame_control) || ieee80211_is_s1g_beacon(mgmt->frame_control); struct cfg80211_bss *cbss; struct inform_bss_update_data update_data = { .rx_status = rx_status, .beacon = beacon, }; struct cfg80211_inform_bss bss_meta = { .boottime_ns = rx_status->boottime_ns, .drv_data = (void *)&update_data, }; bool signal_valid; struct ieee80211_sub_if_data *scan_sdata; if (rx_status->flag & RX_FLAG_NO_SIGNAL_VAL) bss_meta.signal = 0; /* invalid signal indication */ else if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) bss_meta.signal = rx_status->signal * 100; else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal; bss_meta.chan = channel; rcu_read_lock(); scan_sdata = rcu_dereference(local->scan_sdata); if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION && scan_sdata->vif.cfg.assoc && ieee80211_have_rx_timestamp(rx_status)) { struct ieee80211_bss_conf *link_conf = NULL; /* for an MLO connection, set the TSF data only in case we have * an indication on which of the links the frame was received */ if (ieee80211_vif_is_mld(&scan_sdata->vif)) { if (rx_status->link_valid) { s8 link_id = rx_status->link_id; link_conf = rcu_dereference(scan_sdata->vif.link_conf[link_id]); } } else { link_conf = &scan_sdata->vif.bss_conf; } if (link_conf) { bss_meta.parent_tsf = ieee80211_calculate_rx_timestamp(local, rx_status, len + FCS_LEN, 24); ether_addr_copy(bss_meta.parent_bssid, link_conf->bssid); } } rcu_read_unlock(); cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, len, GFP_ATOMIC); if (!cbss) return NULL; /* In case the signal is invalid update the status */ signal_valid = channel == cbss->channel; if (!signal_valid) rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL; return (void *)cbss->priv; } static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, u32 scan_flags, const u8 *da) { if (!sdata) return false; /* accept broadcast for OCE */ if (scan_flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP && is_broadcast_ether_addr(da)) return true; if (scan_flags & NL80211_SCAN_FLAG_RANDOM_ADDR) return true; return ether_addr_equal(da, sdata->vif.addr); } void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); struct ieee80211_sub_if_data *sdata1, *sdata2; struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_bss *bss; struct ieee80211_channel *channel; size_t min_hdr_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); if (!ieee80211_is_probe_resp(mgmt->frame_control) && !ieee80211_is_beacon(mgmt->frame_control) && !ieee80211_is_s1g_beacon(mgmt->frame_control)) return; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_short_beacon.variable); else min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon); } if (skb->len < min_hdr_len) return; sdata1 = rcu_dereference(local->scan_sdata); sdata2 = rcu_dereference(local->sched_scan_sdata); if (likely(!sdata1 && !sdata2)) return; if (test_and_clear_bit(SCAN_BEACON_WAIT, &local->scanning)) { /* * we were passive scanning because of radar/no-IR, but * the beacon/proberesp rx gives us an opportunity to upgrade * to active scan */ set_bit(SCAN_BEACON_DONE, &local->scanning); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } if (ieee80211_is_probe_resp(mgmt->frame_control)) { struct cfg80211_scan_request *scan_req; struct cfg80211_sched_scan_request *sched_scan_req; u32 scan_req_flags = 0, sched_scan_req_flags = 0; scan_req = rcu_dereference(local->scan_req); sched_scan_req = rcu_dereference(local->sched_scan_req); if (scan_req) scan_req_flags = scan_req->flags; if (sched_scan_req) sched_scan_req_flags = sched_scan_req->flags; /* ignore ProbeResp to foreign address or non-bcast (OCE) * unless scanning with randomised address */ if (!ieee80211_scan_accept_presp(sdata1, scan_req_flags, mgmt->da) && !ieee80211_scan_accept_presp(sdata2, sched_scan_req_flags, mgmt->da)) return; } channel = ieee80211_get_channel_khz(local->hw.wiphy, ieee80211_rx_status_to_khz(rx_status)); if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; bss = ieee80211_bss_info_update(local, rx_status, mgmt, skb->len, channel); if (bss) ieee80211_rx_bss_put(local, bss); } static void ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef) { memset(chandef, 0, sizeof(*chandef)); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; } /* return false if no more work */ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; int i, ielen, n_chans; u32 flags = 0; req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); if (test_bit(SCAN_HW_CANCELLED, &local->scanning)) return false; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { for (i = 0; i < req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } n_chans = req->n_channels; } else { do { if (local->hw_scan_band == NUM_NL80211_BANDS) return false; n_chans = 0; for (i = 0; i < req->n_channels; i++) { if (req->channels[i]->band != local->hw_scan_band) continue; local->hw_scan_req->req.channels[n_chans] = req->channels[i]; n_chans++; bands_used |= BIT(req->channels[i]->band); } local->hw_scan_band++; } while (!n_chans); } local->hw_scan_req->req.n_channels = n_chans; ieee80211_prepare_scan_chandef(&chandef); if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; ielen = ieee80211_build_preq_ies(sdata, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, req->ie, req->ie_len, bands_used, req->rates, &chandef, flags); local->hw_scan_req->req.ie_len = ielen; local->hw_scan_req->req.no_cck = req->no_cck; ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); ether_addr_copy(local->hw_scan_req->req.mac_addr_mask, req->mac_addr_mask); ether_addr_copy(local->hw_scan_req->req.bssid, req->bssid); return true; } static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) { struct ieee80211_local *local = hw_to_local(hw); bool hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning); bool was_scanning = local->scanning; struct cfg80211_scan_request *scan_req; struct ieee80211_sub_if_data *scan_sdata; struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); /* * It's ok to abort a not-yet-running scan (that * we have one at all will be verified by checking * local->scan_req next), but not to complete it * successfully. */ if (WARN_ON(!local->scanning && !aborted)) aborted = true; if (WARN_ON(!local->scan_req)) return; scan_sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); if (hw_scan && !aborted && !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && ieee80211_prep_hw_scan(scan_sdata)) { int rc; rc = drv_hw_scan(local, rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)), local->hw_scan_req); if (rc == 0) return; /* HW scan failed and is going to be reported as aborted, * so clear old scan info. */ memset(&local->scan_info, 0, sizeof(local->scan_info)); aborted = true; } kfree(local->hw_scan_req); local->hw_scan_req = NULL; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; local->scan_chandef.chan = NULL; synchronize_rcu(); if (scan_req != local->int_scan_req) { local->scan_info.aborted = aborted; cfg80211_scan_done(scan_req, &local->scan_info); } /* Set power back to normal operating levels. */ ieee80211_hw_config(local, 0); if (!hw_scan && was_scanning) { ieee80211_configure_filter(local); drv_sw_scan_complete(local, scan_sdata); ieee80211_offchannel_return(local); } ieee80211_recalc_idle(local); ieee80211_mlme_notify_scan_completed(local); ieee80211_ibss_notify_scan_completed(local); /* Requeue all the work that might have been ignored while * the scan was in progress; if there was none this will * just be a no-op for the particular interface. */ list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (ieee80211_sdata_running(sdata)) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } if (was_scanning) ieee80211_start_next_roc(local); } void ieee80211_scan_completed(struct ieee80211_hw *hw, struct cfg80211_scan_info *info) { struct ieee80211_local *local = hw_to_local(hw); trace_api_scan_completed(local, info->aborted); set_bit(SCAN_COMPLETED, &local->scanning); if (info->aborted) set_bit(SCAN_ABORTED, &local->scanning); memcpy(&local->scan_info, info, sizeof(*info)); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } EXPORT_SYMBOL(ieee80211_scan_completed); static int ieee80211_start_sw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { /* Software scan is not supported in multi-channel cases */ if (local->use_chanctx) return -EOPNOTSUPP; /* * Hardware/driver doesn't support hw_scan, so use software * scanning instead. First send a nullfunc frame with power save * bit on so that AP will buffer the frames for us while we are not * listening, then send probe requests to each channel and wait for * the responses. After all channels are scanned, tune back to the * original channel and send a nullfunc frame with power save bit * off to trigger the AP to send us all the buffered frames. * * Note that while local->sw_scanning is true everything else but * nullfunc frames and probe requests will be dropped in * ieee80211_tx_h_check_assoc(). */ drv_sw_scan_start(local, sdata, local->scan_addr); local->leave_oper_channel_time = jiffies; local->next_scan_state = SCAN_DECISION; local->scan_channel_idx = 0; ieee80211_offchannel_stop_vifs(local); /* ensure nullfunc is transmitted before leaving operating channel */ ieee80211_flush_queues(local, NULL, false); ieee80211_configure_filter(local); /* We need to set power level at maximum rate for scanning. */ ieee80211_hw_config(local, 0); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); return 0; } static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *sdata_iter; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_is_radar_required(local)) return true; if (!regulatory_pre_cac_allowed(local->hw.wiphy)) return false; list_for_each_entry(sdata_iter, &local->interfaces, list) { if (sdata_iter->wdev.cac_started) return false; } return true; } static bool ieee80211_can_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { if (!__ieee80211_can_leave_ch(sdata)) return false; if (!list_empty(&local->roc_list)) return false; if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.flags & IEEE80211_STA_CONNECTION_POLL) return false; return true; } void ieee80211_run_deferred_scan(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); if (!local->scan_req || local->scanning) return; if (!ieee80211_can_scan(local, rcu_dereference_protected( local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)))) return; wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, round_jiffies_relative(0)); } static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 ratemask, u32 flags, u32 tx_flags, struct ieee80211_channel *channel) { struct sk_buff *skb; skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, ssid, ssid_len, ie, ie_len, flags); if (skb) { if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 sn = get_random_u16(); info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; hdr->seq_ctrl = cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); } IEEE80211_SKB_CB(skb)->flags |= tx_flags; ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); } } static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, unsigned long *next_delay) { int i; struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; enum nl80211_band band = local->hw.conf.chandef.chan->band; u32 flags = 0, tx_flags; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (scan_req->no_cck) tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE; if (scan_req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; if (scan_req->flags & NL80211_SCAN_FLAG_RANDOM_SN) flags |= IEEE80211_PROBE_FLAG_RANDOM_SN; sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); for (i = 0; i < scan_req->n_ssids; i++) ieee80211_send_scan_probe_req( sdata, local->scan_addr, scan_req->bssid, scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, scan_req->ie, scan_req->ie_len, scan_req->rates[band], flags, tx_flags, local->hw.conf.chandef.chan); /* * After sending probe requests, wait for probe responses * on the channel. */ *next_delay = IEEE80211_CHANNEL_TIME; local->next_scan_state = SCAN_DECISION; } static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { struct ieee80211_local *local = sdata->local; bool hw_scan = local->ops->hw_scan; int rc; lockdep_assert_wiphy(local->hw.wiphy); if (local->scan_req) return -EBUSY; /* For an MLO connection, if a link ID was specified, validate that it * is indeed active. If no link ID was specified, select one of the * active links. */ if (ieee80211_vif_is_mld(&sdata->vif)) { if (req->tsf_report_link_id >= 0) { if (!(sdata->vif.active_links & BIT(req->tsf_report_link_id))) return -EINVAL; } else { req->tsf_report_link_id = __ffs(sdata->vif.active_links); } } if (!__ieee80211_can_leave_ch(sdata)) return -EBUSY; if (!ieee80211_can_scan(local, sdata)) { /* wait for the work to finish/time out */ rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); return 0; } again: if (hw_scan) { u8 *ies; local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { int i, n_bands = 0; u8 bands_counted = 0; for (i = 0; i < req->n_channels; i++) { if (bands_counted & BIT(req->channels[i]->band)) continue; bands_counted |= BIT(req->channels[i]->band); n_bands++; } local->hw_scan_ies_bufsize *= n_bands; } local->hw_scan_req = kmalloc( sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]) + local->hw_scan_ies_bufsize, GFP_KERNEL); if (!local->hw_scan_req) return -ENOMEM; local->hw_scan_req->req.ssids = req->ssids; local->hw_scan_req->req.n_ssids = req->n_ssids; ies = (u8 *)local->hw_scan_req + sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]); local->hw_scan_req->req.ie = ies; local->hw_scan_req->req.flags = req->flags; eth_broadcast_addr(local->hw_scan_req->req.bssid); local->hw_scan_req->req.duration = req->duration; local->hw_scan_req->req.duration_mandatory = req->duration_mandatory; local->hw_scan_req->req.tsf_report_link_id = req->tsf_report_link_id; local->hw_scan_band = 0; local->hw_scan_req->req.n_6ghz_params = req->n_6ghz_params; local->hw_scan_req->req.scan_6ghz_params = req->scan_6ghz_params; local->hw_scan_req->req.scan_6ghz = req->scan_6ghz; /* * After allocating local->hw_scan_req, we must * go through until ieee80211_prep_hw_scan(), so * anything that might be changed here and leave * this function early must not go after this * allocation. */ } rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); if (req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) get_random_mask_addr(local->scan_addr, req->mac_addr, req->mac_addr_mask); else memcpy(local->scan_addr, sdata->vif.addr, ETH_ALEN); if (hw_scan) { __set_bit(SCAN_HW_SCANNING, &local->scanning); } else if ((req->n_channels == 1) && (req->channels[0] == local->_oper_chandef.chan)) { /* * If we are scanning only on the operating channel * then we do not need to stop normal activities */ unsigned long next_delay; __set_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning); ieee80211_recalc_idle(local); /* Notify driver scan is starting, keep order of operations * same as normal software scan, in case that matters. */ drv_sw_scan_start(local, sdata, local->scan_addr); ieee80211_configure_filter(local); /* accept probe-responses */ /* We need to ensure power level is at max for scanning. */ ieee80211_hw_config(local, 0); if ((req->channels[0]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !req->n_ssids) { next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; if (req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); } else { ieee80211_scan_state_send_probe(local, &next_delay); next_delay = IEEE80211_CHANNEL_TIME; } /* Now, just wait a bit and we are all done! */ wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, next_delay); return 0; } else { /* Do normal software scan */ __set_bit(SCAN_SW_SCANNING, &local->scanning); } ieee80211_recalc_idle(local); if (hw_scan) { WARN_ON(!ieee80211_prep_hw_scan(sdata)); rc = drv_hw_scan(local, sdata, local->hw_scan_req); } else { rc = ieee80211_start_sw_scan(local, sdata); } if (rc) { kfree(local->hw_scan_req); local->hw_scan_req = NULL; local->scanning = 0; ieee80211_recalc_idle(local); local->scan_req = NULL; RCU_INIT_POINTER(local->scan_sdata, NULL); } if (hw_scan && rc == 1) { /* * we can't fall back to software for P2P-GO * as it must update NoA etc. */ if (ieee80211_vif_type_p2p(&sdata->vif) == NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; hw_scan = false; goto again; } return rc; } static unsigned long ieee80211_scan_get_channel_time(struct ieee80211_channel *chan) { /* * TODO: channel switching also consumes quite some time, * add that delay as well to get a better estimation */ if (chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) return IEEE80211_PASSIVE_CHANNEL_TIME; return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME; } static void ieee80211_scan_state_decision(struct ieee80211_local *local, unsigned long *next_delay) { bool associated = false; bool tx_empty = true; bool bad_latency; struct ieee80211_sub_if_data *sdata; struct ieee80211_channel *next_chan; enum mac80211_scan_state next_scan_state; struct cfg80211_scan_request *scan_req; lockdep_assert_wiphy(local->hw.wiphy); /* * check if at least one STA interface is associated, * check if at least one STA interface has pending tx frames * and grab the lowest used beacon interval */ list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) { if (sdata->u.mgd.associated) { associated = true; if (!qdisc_all_tx_empty(sdata->dev)) { tx_empty = false; break; } } } } scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); next_chan = scan_req->channels[local->scan_channel_idx]; /* * we're currently scanning a different channel, let's * see if we can scan another channel without interfering * with the current traffic situation. * * Keep good latency, do not stay off-channel more than 125 ms. */ bad_latency = time_after(jiffies + ieee80211_scan_get_channel_time(next_chan), local->leave_oper_channel_time + HZ / 8); if (associated && !tx_empty) { if (scan_req->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) next_scan_state = SCAN_ABORT; else next_scan_state = SCAN_SUSPEND; } else if (associated && bad_latency) { next_scan_state = SCAN_SUSPEND; } else { next_scan_state = SCAN_SET_CHANNEL; } local->next_scan_state = next_scan_state; *next_delay = 0; } static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, unsigned long *next_delay) { int skip; struct ieee80211_channel *chan; struct cfg80211_scan_request *scan_req; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); skip = 0; chan = scan_req->channels[local->scan_channel_idx]; local->scan_chandef.chan = chan; local->scan_chandef.center_freq1 = chan->center_freq; local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; /* For scanning on the S1G band, detect the channel width according to * the channel being scanned. */ if (chan->band == NL80211_BAND_S1GHZ) { local->scan_chandef.width = ieee80211_s1g_channel_width(chan); goto set_channel; } /* If scanning on oper channel, use whatever channel-type * is currently in use. */ if (chan == local->_oper_chandef.chan) local->scan_chandef = local->_oper_chandef; else local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT; set_channel: if (ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL)) skip = 1; /* advance state machine to next channel/band */ local->scan_channel_idx++; if (skip) { /* if we skip this channel return to the decision state */ local->next_scan_state = SCAN_DECISION; return; } /* * Probe delay is used to update the NAV, cf. 11.1.3.2.2 * (which unfortunately doesn't say _why_ step a) is done, * but it waits for the probe delay or until a frame is * received - and the received frame would update the NAV). * For now, we do not support waiting until a frame is * received. * * In any case, it is not necessary for a passive scan. */ if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !scan_req->n_ssids) { *next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; local->next_scan_state = SCAN_DECISION; if (scan_req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); return; } /* active scan, send probes */ *next_delay = IEEE80211_PROBE_DELAY; local->next_scan_state = SCAN_SEND_PROBE; } static void ieee80211_scan_state_suspend(struct ieee80211_local *local, unsigned long *next_delay) { /* switch back to the operating channel */ local->scan_chandef.chan = NULL; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); /* disable PS */ ieee80211_offchannel_return(local); *next_delay = HZ / 5; /* afterwards, resume scan & go to next channel */ local->next_scan_state = SCAN_RESUME; } static void ieee80211_scan_state_resume(struct ieee80211_local *local, unsigned long *next_delay) { ieee80211_offchannel_stop_vifs(local); if (local->ops->flush) { ieee80211_flush_queues(local, NULL, false); *next_delay = 0; } else *next_delay = HZ / 10; /* remember when we left the operating channel */ local->leave_oper_channel_time = jiffies; /* advance to the next channel to be scanned */ local->next_scan_state = SCAN_SET_CHANNEL; } void ieee80211_scan_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, scan_work.work); struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; unsigned long next_delay = 0; bool aborted; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_can_run_worker(local)) { aborted = true; goto out_complete; } sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); /* When scanning on-channel, the first-callback means completed. */ if (test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); goto out_complete; } if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); goto out_complete; } if (!sdata || !scan_req) return; if (!local->scanning) { int rc; RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); rc = __ieee80211_start_scan(sdata, scan_req); if (!rc) return; /* need to complete scan in cfg80211 */ rcu_assign_pointer(local->scan_req, scan_req); aborted = true; goto out_complete; } clear_bit(SCAN_BEACON_WAIT, &local->scanning); /* * as long as no delay is required advance immediately * without scheduling a new work */ do { if (!ieee80211_sdata_running(sdata)) { aborted = true; goto out_complete; } if (test_and_clear_bit(SCAN_BEACON_DONE, &local->scanning) && local->next_scan_state == SCAN_DECISION) local->next_scan_state = SCAN_SEND_PROBE; switch (local->next_scan_state) { case SCAN_DECISION: /* if no more bands/channels left, complete scan */ if (local->scan_channel_idx >= scan_req->n_channels) { aborted = false; goto out_complete; } ieee80211_scan_state_decision(local, &next_delay); break; case SCAN_SET_CHANNEL: ieee80211_scan_state_set_channel(local, &next_delay); break; case SCAN_SEND_PROBE: ieee80211_scan_state_send_probe(local, &next_delay); break; case SCAN_SUSPEND: ieee80211_scan_state_suspend(local, &next_delay); break; case SCAN_RESUME: ieee80211_scan_state_resume(local, &next_delay); break; case SCAN_ABORT: aborted = true; goto out_complete; } } while (next_delay == 0); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, next_delay); return; out_complete: __ieee80211_scan_completed(&local->hw, aborted); } int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { lockdep_assert_wiphy(sdata->local->hw.wiphy); return __ieee80211_start_scan(sdata, req); } int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, struct ieee80211_channel **channels, unsigned int n_channels) { struct ieee80211_local *local = sdata->local; int ret = -EBUSY, i, n_ch = 0; enum nl80211_band band; lockdep_assert_wiphy(local->hw.wiphy); /* busy scanning */ if (local->scan_req) goto unlock; /* fill internal scan request */ if (!channels) { int max_n; for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band] || band == NL80211_BAND_6GHZ) continue; max_n = local->hw.wiphy->bands[band]->n_channels; for (i = 0; i < max_n; i++) { struct ieee80211_channel *tmp_ch = &local->hw.wiphy->bands[band]->channels[i]; if (tmp_ch->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED)) continue; local->int_scan_req->channels[n_ch] = tmp_ch; n_ch++; } } if (WARN_ON_ONCE(n_ch == 0)) goto unlock; local->int_scan_req->n_channels = n_ch; } else { for (i = 0; i < n_channels; i++) { if (channels[i]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED)) continue; local->int_scan_req->channels[n_ch] = channels[i]; n_ch++; } if (WARN_ON_ONCE(n_ch == 0)) goto unlock; local->int_scan_req->n_channels = n_ch; } local->int_scan_req->ssids = &local->scan_ssid; local->int_scan_req->n_ssids = 1; memcpy(local->int_scan_req->ssids[0].ssid, ssid, IEEE80211_MAX_SSID_LEN); local->int_scan_req->ssids[0].ssid_len = ssid_len; ret = __ieee80211_start_scan(sdata, sdata->local->int_scan_req); unlock: return ret; } void ieee80211_scan_cancel(struct ieee80211_local *local) { /* ensure a new scan cannot be queued */ lockdep_assert_wiphy(local->hw.wiphy); /* * We are canceling software scan, or deferred scan that was not * yet really started (see __ieee80211_start_scan ). * * Regarding hardware scan: * - we can not call __ieee80211_scan_completed() as when * SCAN_HW_SCANNING bit is set this function change * local->hw_scan_req to operate on 5G band, what race with * driver which can use local->hw_scan_req * * - we can not cancel scan_work since driver can schedule it * by ieee80211_scan_completed(..., true) to finish scan * * Hence we only call the cancel_hw_scan() callback, but the low-level * driver is still responsible for calling ieee80211_scan_completed() * after the scan was completed/aborted. */ if (!local->scan_req) return; /* * We have a scan running and the driver already reported completion, * but the worker hasn't run yet or is stuck on the mutex - mark it as * cancelled. */ if (test_bit(SCAN_HW_SCANNING, &local->scanning) && test_bit(SCAN_COMPLETED, &local->scanning)) { set_bit(SCAN_HW_CANCELLED, &local->scanning); return; } if (test_bit(SCAN_HW_SCANNING, &local->scanning)) { /* * Make sure that __ieee80211_scan_completed doesn't trigger a * scan on another band. */ set_bit(SCAN_HW_CANCELLED, &local->scanning); if (local->ops->cancel_hw_scan) drv_cancel_hw_scan(local, rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx))); return; } wiphy_delayed_work_cancel(local->hw.wiphy, &local->scan_work); /* and clean up */ memset(&local->scan_info, 0, sizeof(local->scan_info)); __ieee80211_scan_completed(&local->hw, true); } int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_scan_ies sched_scan_ies = {}; struct cfg80211_chan_def chandef; int ret, i, iebufsz, num_bands = 0; u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; u32 flags = 0; lockdep_assert_wiphy(local->hw.wiphy); iebufsz = local->scan_ies_len + req->ie_len; if (!local->ops->sched_scan_start) return -EOPNOTSUPP; for (i = 0; i < NUM_NL80211_BANDS; i++) { if (local->hw.wiphy->bands[i]) { bands_used |= BIT(i); rate_masks[i] = (u32) -1; num_bands++; } } if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; ie = kcalloc(iebufsz, num_bands, GFP_KERNEL); if (!ie) { ret = -ENOMEM; goto out; } ieee80211_prepare_scan_chandef(&chandef); ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, req->ie_len, bands_used, rate_masks, &chandef, flags); ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { rcu_assign_pointer(local->sched_scan_sdata, sdata); rcu_assign_pointer(local->sched_scan_req, req); } kfree(ie); out: if (ret) { /* Clean in case of failure after HW restart or upon resume. */ RCU_INIT_POINTER(local->sched_scan_sdata, NULL); RCU_INIT_POINTER(local->sched_scan_req, NULL); } return ret; } int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (rcu_access_pointer(local->sched_scan_sdata)) return -EBUSY; return __ieee80211_request_sched_scan_start(sdata, req); } int ieee80211_request_sched_scan_stop(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sched_scan_sdata; int ret = -ENOENT; lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->sched_scan_stop) return -EOPNOTSUPP; /* We don't want to restart sched scan anymore. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); if (sched_scan_sdata) { ret = drv_sched_scan_stop(local, sched_scan_sdata); if (!ret) RCU_INIT_POINTER(local->sched_scan_sdata, NULL); } return ret; } void ieee80211_sched_scan_results(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_sched_scan_results(local); cfg80211_sched_scan_results(hw->wiphy, 0); } EXPORT_SYMBOL(ieee80211_sched_scan_results); void ieee80211_sched_scan_end(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); if (!rcu_access_pointer(local->sched_scan_sdata)) return; RCU_INIT_POINTER(local->sched_scan_sdata, NULL); /* If sched scan was aborted by the driver. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); } void ieee80211_sched_scan_stopped_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, sched_scan_stopped_work); ieee80211_sched_scan_end(local); } void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_sched_scan_stopped(local); /* * this shouldn't really happen, so for simplicity * simply ignore it, and let mac80211 reconfigure * the sched scan later on. */ if (local->in_reconfig) return; wiphy_work_queue(hw->wiphy, &local->sched_scan_stopped_work); } EXPORT_SYMBOL(ieee80211_sched_scan_stopped); |
950 950 826 146 43 1725 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/xdp.h> #include <net/xdp_sock.h> #include <net/netdev_rx_queue.h> #include <net/busy_poll.h> #include "netdev-genl-gen.h" #include "dev.h" struct netdev_nl_dump_ctx { unsigned long ifindex; unsigned int rxq_idx; unsigned int txq_idx; unsigned int napi_id; }; static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) { NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx); return (struct netdev_nl_dump_ctx *)cb->ctx; } static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { u64 xsk_features = 0; u64 xdp_rx_meta = 0; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; #define XDP_METADATA_KFUNC(_, flag, __, xmo) \ if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \ xdp_rx_meta |= flag; XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC if (netdev->xsk_tx_metadata_ops) { if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; } if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, netdev->xdp_features, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, xdp_rx_meta, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, xsk_features, NETDEV_A_DEV_PAD)) { genlmsg_cancel(rsp, hdr); return -EINVAL; } if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, netdev->xdp_zc_max_segs)) { genlmsg_cancel(rsp, hdr); return -EINVAL; } } genlmsg_end(rsp, hdr); return 0; } static void netdev_genl_dev_notify(struct net_device *netdev, int cmd) { struct genl_info info; struct sk_buff *ntf; if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), NETDEV_NLGRP_MGMT)) return; genl_info_init_ntf(&info, &netdev_nl_family, cmd); ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; if (netdev_nl_dev_fill(netdev, ntf, &info)) { nlmsg_free(ntf); return; } genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf, 0, NETDEV_NLGRP_MGMT, GFP_KERNEL); } int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_device *netdev; struct sk_buff *rsp; u32 ifindex; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; rtnl_lock(); netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (netdev) err = netdev_nl_dev_fill(netdev, rsp, info); else err = -ENODEV; rtnl_unlock(); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; int err = 0; rtnl_lock(); for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) break; } rtnl_unlock(); if (err != -EMSGSIZE) return err; return skb->len; } static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { void *hdr; pid_t pid; if (WARN_ON_ONCE(!napi->dev)) return -EINVAL; if (!(napi->dev->flags & IFF_UP)) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (napi->napi_id >= MIN_NAPI_ID && nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) goto nla_put_failure; if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) goto nla_put_failure; if (napi->thread) { pid = task_pid_nr(napi->thread); if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) goto nla_put_failure; } genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) { struct napi_struct *napi; struct sk_buff *rsp; u32 napi_id; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) return -EINVAL; napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; rtnl_lock(); napi = napi_by_id(napi_id); if (napi) err = netdev_nl_napi_fill_one(rsp, napi, info); else err = -EINVAL; rtnl_unlock(); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { struct napi_struct *napi; int err = 0; if (!(netdev->flags & IFF_UP)) return err; list_for_each_entry(napi, &netdev->napi_list, dev_list) { if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; err = netdev_nl_napi_fill_one(rsp, napi, info); if (err) return err; ctx->napi_id = napi->napi_id; } return err; } int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_NAPI_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); rtnl_lock(); if (ifindex) { netdev = __dev_get_by_index(net, ifindex); if (netdev) err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); else err = -ENODEV; } else { for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->napi_id = 0; } } rtnl_unlock(); if (err != -EMSGSIZE) return err; return skb->len; } static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) || nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) || nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex)) goto nla_put_failure; switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, rxq->napi->napi_id)) goto nla_put_failure; break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, txq->napi->napi_id)) goto nla_put_failure; } genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id, u32 q_type) { switch (q_type) { case NETDEV_QUEUE_TYPE_RX: if (q_id >= netdev->real_num_rx_queues) return -EINVAL; return 0; case NETDEV_QUEUE_TYPE_TX: if (q_id >= netdev->real_num_tx_queues) return -EINVAL; } return 0; } static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { int err = 0; if (!(netdev->flags & IFF_UP)) return err; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) return err; return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info); } int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) { u32 q_id, q_type, ifindex; struct net_device *netdev; struct sk_buff *rsp; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX)) return -EINVAL; q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]); q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]); ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; rtnl_lock(); netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (netdev) err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); else err = -ENODEV; rtnl_unlock(); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { int err = 0; int i; if (!(netdev->flags & IFF_UP)) return err; for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) { err = netdev_nl_queue_fill_one(rsp, netdev, i, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; ctx->rxq_idx = i++; } for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) { err = netdev_nl_queue_fill_one(rsp, netdev, i, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; ctx->txq_idx = i++; } return err; } int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); rtnl_lock(); if (ifindex) { netdev = __dev_get_by_index(net, ifindex); if (netdev) err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); else err = -ENODEV; } else { for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->rxq_idx = 0; ctx->txq_idx = 0; } } rtnl_unlock(); if (err != -EMSGSIZE) return err; return skb->len; } static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REGISTER: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); break; case NETDEV_UNREGISTER: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); break; } return NOTIFY_OK; } static struct notifier_block netdev_genl_nb = { .notifier_call = netdev_genl_netdevice_event, }; static int __init netdev_genl_init(void) { int err; err = register_netdevice_notifier(&netdev_genl_nb); if (err) return err; err = genl_register_family(&netdev_nl_family); if (err) goto err_unreg_ntf; return 0; err_unreg_ntf: unregister_netdevice_notifier(&netdev_genl_nb); return err; } subsys_initcall(netdev_genl_init); |
616 616 112 616 616 112 615 615 615 615 614 614 614 614 614 614 615 615 615 615 103 103 615 615 615 171 615 615 615 103 103 103 615 179 615 615 615 615 615 615 615 22 22 616 615 616 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/readpage.c * * Copyright (C) 2002, Linus Torvalds. * Copyright (C) 2015, Google, Inc. * * This was originally taken from fs/mpage.c * * The ext4_mpage_readpages() function here is intended to * replace mpage_readahead() in the general case, not just for * encrypted files. It has some limitations (see below), where it * will fall back to read_block_full_page(), but these limitations * should only be hit when page_size != block_size. * * This will allow us to attach a callback function to support ext4 * encryption. * * If anything unusual happens, such as: * * - encountering a page which has buffers * - encountering a page which has a non-hole after a hole * - encountering a page with non-contiguous blocks * * then this code just gives up and calls the buffer_head-based read function. * It does handle a page which has holes at the end - that is a common case: * the end-of-file on blocksize < PAGE_SIZE setups. * */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/kdev_t.h> #include <linux/gfp.h> #include <linux/bio.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/highmem.h> #include <linux/prefetch.h> #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include "ext4.h" #define NUM_PREALLOC_POST_READ_CTXS 128 static struct kmem_cache *bio_post_read_ctx_cache; static mempool_t *bio_post_read_ctx_pool; /* postprocessing steps for read bios */ enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY, STEP_MAX, }; struct bio_post_read_ctx { struct bio *bio; struct work_struct work; unsigned int cur_step; unsigned int enabled_steps; }; static void __read_end_io(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) folio_end_read(fi.folio, bio->bi_status == 0); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx); static void decrypt_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; if (fscrypt_decrypt_bio(bio)) bio_post_read_processing(ctx); else __read_end_io(bio); } static void verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; /* * fsverity_verify_bio() may call readahead() again, and although verity * will be disabled for that, decryption may still be needed, causing * another bio_post_read_ctx to be allocated. So to guarantee that * mempool_alloc() never deadlocks we must free the current ctx first. * This is safe because verity is the last post-read step. */ BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX); mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; fsverity_verify_bio(bio); __read_end_io(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) { /* * We use different work queues for decryption and for verity because * verity may require reading metadata pages that need decryption, and * we shouldn't recurse to the same workqueue. */ switch (++ctx->cur_step) { case STEP_DECRYPT: if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { INIT_WORK(&ctx->work, decrypt_work); fscrypt_enqueue_decrypt_work(&ctx->work); return; } ctx->cur_step++; fallthrough; case STEP_VERITY: if (ctx->enabled_steps & (1 << STEP_VERITY)) { INIT_WORK(&ctx->work, verity_work); fsverity_enqueue_verify_work(&ctx->work); return; } ctx->cur_step++; fallthrough; default: __read_end_io(ctx->bio); } } static bool bio_post_read_required(struct bio *bio) { return bio->bi_private && !bio->bi_status; } /* * I/O completion handler for multipage BIOs. * * The mpage code never puts partial pages into a BIO (except for end-of-file). * If a page does not map to a contiguous run of blocks then it simply falls * back to block_read_full_folio(). * * Why is this? If a page's completion depends on a number of different BIOs * which can complete in any order (or at the same time) then determining the * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ static void mpage_end_io(struct bio *bio) { if (bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; ctx->cur_step = STEP_INITIAL; bio_post_read_processing(ctx); return; } __read_end_io(bio); } static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) { return fsverity_active(inode) && idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } static void ext4_set_bio_post_read_ctx(struct bio *bio, const struct inode *inode, pgoff_t first_idx) { unsigned int post_read_steps = 0; if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= 1 << STEP_DECRYPT; if (ext4_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { /* Due to the mempool, this never fails. */ struct bio_post_read_ctx *ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } } static inline loff_t ext4_readpage_limit(struct inode *inode) { if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); } int ext4_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t next_block; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t blocks[MAX_BUF_PER_PAGE]; unsigned page_block; struct block_device *bdev = inode->i_sb->s_bdev; int length; unsigned relative_block = 0; struct ext4_map_blocks map; unsigned int nr_pages = rac ? readahead_count(rac) : 1; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; for (; nr_pages; nr_pages--) { int fully_mapped = 1; unsigned first_hole = blocks_per_page; if (rac) folio = readahead_folio(rac); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; block_in_file = next_block = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the previous result first. */ if ((map.m_flags & EXT4_MAP_MAPPED) && block_in_file > map.m_lblk && block_in_file < (map.m_lblk + map.m_len)) { unsigned map_offset = block_in_file - map.m_lblk; unsigned last = map.m_len - map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } if (page_block == blocks_per_page) break; blocks[page_block] = map.m_pblk + map_offset + relative_block; page_block++; block_in_file++; } } /* * Then do more ext4_map_blocks() calls until we are * done with this folio. */ while (page_block < blocks_per_page) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: folio_set_error(folio); folio_zero_segment(folio, 0, folio_size(folio)); folio_unlock(folio); goto next_page; } } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; if (first_hole == blocks_per_page) first_hole = page_block; page_block++; block_in_file++; continue; } if (first_hole != blocks_per_page) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ if (page_block && blocks[page_block-1] != map.m_pblk-1) goto confused; for (relative_block = 0; ; relative_block++) { if (relative_block == map.m_len) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } else if (page_block == blocks_per_page) break; blocks[page_block] = map.m_pblk+relative_block; page_block++; block_in_file++; } } if (first_hole != blocks_per_page) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { if (ext4_need_verity(inode, folio->index) && !fsverity_verify_folio(folio)) goto set_error_page; folio_end_read(folio, true); continue; } } else if (fully_mapped) { folio_set_mappedtodisk(folio); } /* * This folio will go to BIO. Do we need to send this * BIO off first? */ if (bio && (last_block_in_bio != blocks[0] - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: submit_bio(bio); bio = NULL; } if (bio == NULL) { /* * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ bio = bio_alloc(bdev, bio_max_segs(nr_pages), REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, folio->index); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) bio->bi_opf |= REQ_RAHEAD; } length = first_hole << blkbits; if (!bio_add_folio(bio, folio, length, 0)) goto submit_and_realloc; if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { submit_bio(bio); bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; continue; confused: if (bio) { submit_bio(bio); bio = NULL; } if (!folio_test_uptodate(folio)) block_read_full_folio(folio, ext4_get_block); else folio_unlock(folio); next_page: ; /* A label shall be followed by a statement until C23 */ } if (bio) submit_bio(bio); return 0; } int __init ext4_init_post_read_processing(void) { bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT); if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, bio_post_read_ctx_cache); if (!bio_post_read_ctx_pool) goto fail_free_cache; return 0; fail_free_cache: kmem_cache_destroy(bio_post_read_ctx_cache); fail: return -ENOMEM; } void ext4_exit_post_read_processing(void) { mempool_destroy(bio_post_read_ctx_pool); kmem_cache_destroy(bio_post_read_ctx_cache); } |
1717 1717 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 | // SPDX-License-Identifier: GPL-2.0 /* net/sched/sch_taprio.c Time Aware Priority Scheduler * * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com> * */ #include <linux/ethtool.h> #include <linux/ethtool_netlink.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/list.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/math64.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/time.h> #include <net/gso.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/sch_generic.h> #include <net/sock.h> #include <net/tcp.h> #define TAPRIO_STAT_NOT_SET (~0ULL) #include "sch_mqprio_lib.h" static LIST_HEAD(taprio_list); static struct static_key_false taprio_have_broken_mqprio; static struct static_key_false taprio_have_working_mqprio; #define TAPRIO_ALL_GATES_OPEN -1 #define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) #define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { /* Durations between this GCL entry and the GCL entry where the * respective traffic class gate closes */ u64 gate_duration[TC_MAX_QUEUE]; atomic_t budget[TC_MAX_QUEUE]; /* The qdisc makes some effort so that no packet leaves * after this time */ ktime_t gate_close_time[TC_MAX_QUEUE]; struct list_head list; /* Used to calculate when to advance the schedule */ ktime_t end_time; ktime_t next_txtime; int index; u32 gate_mask; u32 interval; u8 command; }; struct sched_gate_list { /* Longest non-zero contiguous gate durations per traffic class, * or 0 if a traffic class gate never opens during the schedule. */ u64 max_open_gate_duration[TC_MAX_QUEUE]; u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ u32 max_sdu[TC_MAX_QUEUE]; /* for dump */ struct rcu_head rcu; struct list_head entries; size_t num_entries; ktime_t cycle_end_time; s64 cycle_time; s64 cycle_time_extension; s64 base_time; }; struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; u32 flags; enum tk_offsets tk_offset; int clockid; bool offloaded; bool detected_mqprio; bool broken_mqprio; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ /* Protects the update side of the RCU protected current_entry */ spinlock_t current_entry_lock; struct sched_entry __rcu *current_entry; struct sched_gate_list __rcu *oper_sched; struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; int cur_txq[TC_MAX_QUEUE]; u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */ u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */ u32 txtime_delay; }; struct __tc_taprio_qopt_offload { refcount_t users; struct tc_taprio_qopt_offload offload; }; static void taprio_calculate_gate_durations(struct taprio_sched *q, struct sched_gate_list *sched) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *cur; int tc; list_for_each_entry(entry, &sched->entries, list) { u32 gates_still_open = entry->gate_mask; /* For each traffic class, calculate each open gate duration, * starting at this schedule entry and ending at the schedule * entry containing a gate close event for that TC. */ cur = entry; do { if (!gates_still_open) break; for (tc = 0; tc < num_tc; tc++) { if (!(gates_still_open & BIT(tc))) continue; if (cur->gate_mask & BIT(tc)) entry->gate_duration[tc] += cur->interval; else gates_still_open &= ~BIT(tc); } cur = list_next_entry_circular(cur, &sched->entries, list); } while (cur != entry); /* Keep track of the maximum gate duration for each traffic * class, taking care to not confuse a traffic class which is * temporarily closed with one that is always closed. */ for (tc = 0; tc < num_tc; tc++) if (entry->gate_duration[tc] && sched->max_open_gate_duration[tc] < entry->gate_duration[tc]) sched->max_open_gate_duration[tc] = entry->gate_duration[tc]; } } static bool taprio_entry_allows_tx(ktime_t skb_end_time, struct sched_entry *entry, int tc) { return ktime_before(skb_end_time, entry->gate_close_time[tc]); } static ktime_t sched_base_time(const struct sched_gate_list *sched) { if (!sched) return KTIME_MAX; return ns_to_ktime(sched->base_time); } static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) { /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); switch (tk_offset) { case TK_OFFS_MAX: return mono; default: return ktime_mono_to_any(mono, tk_offset); } } static ktime_t taprio_get_time(const struct taprio_sched *q) { return taprio_mono_to_any(q, ktime_get()); } static void taprio_free_sched_cb(struct rcu_head *head) { struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); struct sched_entry *entry, *n; list_for_each_entry_safe(entry, n, &sched->entries, list) { list_del(&entry->list); kfree(entry); } kfree(sched); } static void switch_schedules(struct taprio_sched *q, struct sched_gate_list **admin, struct sched_gate_list **oper) { rcu_assign_pointer(q->oper_sched, *admin); rcu_assign_pointer(q->admin_sched, NULL); if (*oper) call_rcu(&(*oper)->rcu, taprio_free_sched_cb); *oper = *admin; *admin = NULL; } /* Get how much time has been already elapsed in the current cycle. */ static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time) { ktime_t time_since_sched_start; s32 time_elapsed; time_since_sched_start = ktime_sub(time, sched->base_time); div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed); return time_elapsed; } static ktime_t get_interval_end_time(struct sched_gate_list *sched, struct sched_gate_list *admin, struct sched_entry *entry, ktime_t intv_start) { s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start); ktime_t intv_end, cycle_ext_end, cycle_end; cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed); intv_end = ktime_add_ns(intv_start, entry->interval); cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension); if (ktime_before(intv_end, cycle_end)) return intv_end; else if (admin && admin != sched && ktime_after(admin->base_time, cycle_end) && ktime_before(admin->base_time, cycle_ext_end)) return admin->base_time; else return cycle_end; } static int length_to_duration(struct taprio_sched *q, int len) { return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC); } static int duration_to_length(struct taprio_sched *q, u64 duration) { return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); } /* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the * q->max_sdu[] requested by the user and the max_sdu dynamically determined by * the maximum open gate durations at the given link speed. */ static void taprio_update_queue_max_sdu(struct taprio_sched *q, struct sched_gate_list *sched, struct qdisc_size_table *stab) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); u32 max_sdu_from_user; u32 max_sdu_dynamic; u32 max_sdu; int tc; for (tc = 0; tc < num_tc; tc++) { max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX; /* TC gate never closes => keep the queueMaxSDU * selected by the user */ if (sched->max_open_gate_duration[tc] == sched->cycle_time) { max_sdu_dynamic = U32_MAX; } else { u32 max_frm_len; max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]); /* Compensate for L1 overhead from size table, * but don't let the frame size go negative */ if (stab) { max_frm_len -= stab->szopts.overhead; max_frm_len = max_t(int, max_frm_len, dev->hard_header_len + 1); } max_sdu_dynamic = max_frm_len - dev->hard_header_len; if (max_sdu_dynamic > dev->max_mtu) max_sdu_dynamic = U32_MAX; } max_sdu = min(max_sdu_dynamic, max_sdu_from_user); if (max_sdu != U32_MAX) { sched->max_frm_len[tc] = max_sdu + dev->hard_header_len; sched->max_sdu[tc] = max_sdu; } else { sched->max_frm_len[tc] = U32_MAX; /* never oversized */ sched->max_sdu[tc] = 0; } } } /* Returns the entry corresponding to next available interval. If * validate_interval is set, it only validates whether the timestamp occurs * when the gate corresponding to the skb's traffic class is open. */ static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, struct Qdisc *sch, struct sched_gate_list *sched, struct sched_gate_list *admin, ktime_t time, ktime_t *interval_start, ktime_t *interval_end, bool validate_interval) { ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time; ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time; struct sched_entry *entry = NULL, *entry_found = NULL; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); bool entry_available = false; s32 cycle_elapsed; int tc, n; tc = netdev_get_prio_tc_map(dev, skb->priority); packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb)); *interval_start = 0; *interval_end = 0; if (!sched) return NULL; cycle = sched->cycle_time; cycle_elapsed = get_cycle_time_elapsed(sched, time); curr_intv_end = ktime_sub_ns(time, cycle_elapsed); cycle_end = ktime_add_ns(curr_intv_end, cycle); list_for_each_entry(entry, &sched->entries, list) { curr_intv_start = curr_intv_end; curr_intv_end = get_interval_end_time(sched, admin, entry, curr_intv_start); if (ktime_after(curr_intv_start, cycle_end)) break; if (!(entry->gate_mask & BIT(tc)) || packet_transmit_time > entry->interval) continue; txtime = entry->next_txtime; if (ktime_before(txtime, time) || validate_interval) { transmit_end_time = ktime_add_ns(time, packet_transmit_time); if ((ktime_before(curr_intv_start, time) && ktime_before(transmit_end_time, curr_intv_end)) || (ktime_after(curr_intv_start, time) && !validate_interval)) { entry_found = entry; *interval_start = curr_intv_start; *interval_end = curr_intv_end; break; } else if (!entry_available && !validate_interval) { /* Here, we are just trying to find out the * first available interval in the next cycle. */ entry_available = true; entry_found = entry; *interval_start = ktime_add_ns(curr_intv_start, cycle); *interval_end = ktime_add_ns(curr_intv_end, cycle); } } else if (ktime_before(txtime, earliest_txtime) && !entry_available) { earliest_txtime = txtime; entry_found = entry; n = div_s64(ktime_sub(txtime, curr_intv_start), cycle); *interval_start = ktime_add(curr_intv_start, n * cycle); *interval_end = ktime_add(curr_intv_end, n * cycle); } } return entry_found; } static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t interval_start, interval_end; struct sched_entry *entry; rcu_read_lock(); sched = rcu_dereference(q->oper_sched); admin = rcu_dereference(q->admin_sched); entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp, &interval_start, &interval_end, true); rcu_read_unlock(); return entry; } static bool taprio_flags_valid(u32 flags) { /* Make sure no other flag bits are set. */ if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) return false; /* txtime-assist and full offload are mutually exclusive */ if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && (flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) return false; return true; } /* This returns the tstamp value set by TCP in terms of the set clock. */ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) { unsigned int offset = skb_network_offset(skb); const struct ipv6hdr *ipv6h; const struct iphdr *iph; struct ipv6hdr _ipv6h; ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h) return 0; if (ipv6h->version == 4) { iph = (struct iphdr *)ipv6h; offset += iph->ihl * 4; /* special-case 6in4 tunnelling, as that is a common way to get * v6 connectivity in the home */ if (iph->protocol == IPPROTO_IPV6) { ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) return 0; } else if (iph->protocol != IPPROTO_TCP) { return 0; } } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) { return 0; } return taprio_mono_to_any(q, skb->skb_mstamp_ns); } /* There are a few scenarios where we will have to modify the txtime from * what is read from next_txtime in sched_entry. They are: * 1. If txtime is in the past, * a. The gate for the traffic class is currently open and packet can be * transmitted before it closes, schedule the packet right away. * b. If the gate corresponding to the traffic class is going to open later * in the cycle, set the txtime of packet to the interval start. * 2. If txtime is in the future, there are packets corresponding to the * current traffic class waiting to be transmitted. So, the following * possibilities exist: * a. We can transmit the packet before the window containing the txtime * closes. * b. The window might close before the transmission can be completed * successfully. So, schedule the packet in the next open window. */ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) { ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp; struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t minimum_time, now, txtime; int len, packet_transmit_time; struct sched_entry *entry; bool sched_changed; now = taprio_get_time(q); minimum_time = ktime_add_ns(now, q->txtime_delay); tcp_tstamp = get_tcp_tstamp(q, skb); minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp); rcu_read_lock(); admin = rcu_dereference(q->admin_sched); sched = rcu_dereference(q->oper_sched); if (admin && ktime_after(minimum_time, admin->base_time)) switch_schedules(q, &admin, &sched); /* Until the schedule starts, all the queues are open */ if (!sched || ktime_before(minimum_time, sched->base_time)) { txtime = minimum_time; goto done; } len = qdisc_pkt_len(skb); packet_transmit_time = length_to_duration(q, len); do { sched_changed = false; entry = find_entry_to_transmit(skb, sch, sched, admin, minimum_time, &interval_start, &interval_end, false); if (!entry) { txtime = 0; goto done; } txtime = entry->next_txtime; txtime = max_t(ktime_t, txtime, minimum_time); txtime = max_t(ktime_t, txtime, interval_start); if (admin && admin != sched && ktime_after(txtime, admin->base_time)) { sched = admin; sched_changed = true; continue; } transmit_end_time = ktime_add(txtime, packet_transmit_time); minimum_time = transmit_end_time; /* Update the txtime of current entry to the next time it's * interval starts. */ if (ktime_after(transmit_end_time, interval_end)) entry->next_txtime = ktime_add(interval_start, sched->cycle_time); } while (sched_changed || ktime_after(transmit_end_time, interval_end)); entry->next_txtime = transmit_end_time; done: rcu_read_unlock(); return txtime; } /* Devices with full offload are expected to honor this in hardware */ static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *sched; int prio = skb->priority; bool exceeds = false; u8 tc; tc = netdev_get_prio_tc_map(dev, prio); rcu_read_lock(); sched = rcu_dereference(q->oper_sched); if (sched && skb->len > sched->max_frm_len[tc]) exceeds = true; rcu_read_unlock(); return exceeds; } static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); /* sk_flags are only safe to use on full sockets. */ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) return qdisc_drop(skb, sch, to_free); } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { skb->tstamp = get_packet_txtime(skb, sch); if (!skb->tstamp) return qdisc_drop(skb, sch, to_free); } qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return qdisc_enqueue(skb, child, to_free); } static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); netdev_features_t features = netif_skb_features(skb); struct sk_buff *segs, *nskb; int ret; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; slen += segs->len; /* FIXME: we should be segmenting to a smaller size * rather than dropping these */ if (taprio_skb_exceeds_queue_max_sdu(sch, segs)) ret = qdisc_drop(segs, sch, to_free); else ret = taprio_enqueue_one(segs, sch, child, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); } else { numsegs++; } } if (numsegs > 1) qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); consume_skb(skb); return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); struct Qdisc *child; int queue; queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) { /* Large packets might not be transmitted when the transmission * duration exceeds any configured interval. Therefore, segment * the skb into smaller chunks. Drivers with full offload are * expected to handle this in hardware. */ if (skb_is_gso(skb)) return taprio_enqueue_segmented(skb, sch, child, to_free); return qdisc_drop(skb, sch, to_free); } return taprio_enqueue_one(skb, sch, child, to_free); } static struct sk_buff *taprio_peek(struct Qdisc *sch) { WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented"); return NULL; } static void taprio_set_budgets(struct taprio_sched *q, struct sched_gate_list *sched, struct sched_entry *entry) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); int tc, budget; for (tc = 0; tc < num_tc; tc++) { /* Traffic classes which never close have infinite budget */ if (entry->gate_duration[tc] == sched->cycle_time) budget = INT_MAX; else budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); atomic_set(&entry->budget[tc], budget); } } /* When an skb is sent, it consumes from the budget of all traffic classes */ static int taprio_update_budgets(struct sched_entry *entry, size_t len, int tc_consumed, int num_tc) { int tc, budget, new_budget = 0; for (tc = 0; tc < num_tc; tc++) { budget = atomic_read(&entry->budget[tc]); /* Don't consume from infinite budget */ if (budget == INT_MAX) { if (tc == tc_consumed) new_budget = budget; continue; } if (tc == tc_consumed) new_budget = atomic_sub_return(len, &entry->budget[tc]); else atomic_sub(len, &entry->budget[tc]); } return new_budget; } static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct Qdisc *child = q->qdiscs[txq]; int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; ktime_t guard; int prio; int len; u8 tc; if (unlikely(!child)) return NULL; if (TXTIME_ASSIST_IS_ENABLED(q->flags)) goto skip_peek_checks; skb = child->ops->peek(child); if (!skb) return NULL; prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); if (!(gate_mask & BIT(tc))) return NULL; len = qdisc_pkt_len(skb); guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); /* In the case that there's no gate entry, there's no * guard band ... */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && !taprio_entry_allows_tx(guard, entry, tc)) return NULL; /* ... and no budget. */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && taprio_update_budgets(entry, len, tc, num_tc) < 0) return NULL; skip_peek_checks: skb = child->ops->dequeue(child); if (unlikely(!skb)) return NULL; qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq) { int offset = dev->tc_to_txq[tc].offset; int count = dev->tc_to_txq[tc].count; (*txq)++; if (*txq == offset + count) *txq = offset; } /* Prioritize higher traffic classes, and select among TXQs belonging to the * same TC using round robin */ static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; int tc; for (tc = num_tc - 1; tc >= 0; tc--) { int first_txq = q->cur_txq[tc]; if (!(gate_mask & BIT(tc))) continue; do { skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc], entry, gate_mask); taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]); if (q->cur_txq[tc] >= dev->num_tx_queues) q->cur_txq[tc] = first_txq; if (skb) return skb; } while (q->cur_txq[tc] != first_txq); } return NULL; } /* Broken way of prioritizing smaller TXQ indices and ignoring the traffic * class other than to determine whether the gate is open or not */ static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct net_device *dev = qdisc_dev(sch); struct sk_buff *skb; int i; for (i = 0; i < dev->num_tx_queues; i++) { skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); if (skb) return skb; } return NULL; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct sched_entry *entry; u32 gate_mask; rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't * start yet, so force all gates to be open, this is in * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5 * "AdminGateStates" */ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; if (!gate_mask) goto done; if (static_branch_unlikely(&taprio_have_broken_mqprio) && !static_branch_likely(&taprio_have_working_mqprio)) { /* Single NIC kind which is broken */ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); } else if (static_branch_likely(&taprio_have_working_mqprio) && !static_branch_unlikely(&taprio_have_broken_mqprio)) { /* Single NIC kind which prioritizes properly */ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } else { /* Mixed NIC kinds present in system, need dynamic testing */ if (q->broken_mqprio) skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); else skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } done: rcu_read_unlock(); return skb; } static bool should_restart_cycle(const struct sched_gate_list *oper, const struct sched_entry *entry) { if (list_is_last(&entry->list, &oper->entries)) return true; if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0) return true; return false; } static bool should_change_schedules(const struct sched_gate_list *admin, const struct sched_gate_list *oper, ktime_t end_time) { ktime_t next_base_time, extension_time; if (!admin) return false; next_base_time = sched_base_time(admin); /* This is the simple case, the end_time would fall after * the next schedule base_time. */ if (ktime_compare(next_base_time, end_time) <= 0) return true; /* This is the cycle_time_extension case, if the end_time * plus the amount that can be extended would fall after the * next schedule base_time, we can extend the current schedule * for that amount. */ extension_time = ktime_add_ns(end_time, oper->cycle_time_extension); /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about * how precisely the extension should be made. So after * conformance testing, this logic may change. */ if (ktime_compare(next_base_time, extension_time) <= 0) return true; return false; } static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, advance_timer); struct net_device *dev = qdisc_dev(q->root); struct sched_gate_list *oper, *admin; int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *next; struct Qdisc *sch = q->root; ktime_t end_time; int tc; spin_lock(&q->current_entry_lock); entry = rcu_dereference_protected(q->current_entry, lockdep_is_held(&q->current_entry_lock)); oper = rcu_dereference_protected(q->oper_sched, lockdep_is_held(&q->current_entry_lock)); admin = rcu_dereference_protected(q->admin_sched, lockdep_is_held(&q->current_entry_lock)); if (!oper) switch_schedules(q, &admin, &oper); /* This can happen in two cases: 1. this is the very first run * of this function (i.e. we weren't running any schedule * previously); 2. The previous schedule just ended. The first * entry of all schedules are pre-calculated during the * schedule initialization. */ if (unlikely(!entry || entry->end_time == oper->base_time)) { next = list_first_entry(&oper->entries, struct sched_entry, list); end_time = next->end_time; goto first_run; } if (should_restart_cycle(oper, entry)) { next = list_first_entry(&oper->entries, struct sched_entry, list); oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time, oper->cycle_time); } else { next = list_next_entry(entry, list); } end_time = ktime_add_ns(entry->end_time, next->interval); end_time = min_t(ktime_t, end_time, oper->cycle_end_time); for (tc = 0; tc < num_tc; tc++) { if (next->gate_duration[tc] == oper->cycle_time) next->gate_close_time[tc] = KTIME_MAX; else next->gate_close_time[tc] = ktime_add_ns(entry->end_time, next->gate_duration[tc]); } if (should_change_schedules(admin, oper, end_time)) { /* Set things so the next time this runs, the new * schedule runs. */ end_time = sched_base_time(admin); switch_schedules(q, &admin, &oper); } next->end_time = end_time; taprio_set_budgets(q, oper, next); first_run: rcu_assign_pointer(q->current_entry, next); spin_unlock(&q->current_entry_lock); hrtimer_set_expires(&q->advance_timer, end_time); rcu_read_lock(); __netif_schedule(sch); rcu_read_unlock(); return HRTIMER_RESTART; } static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 }, [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, }; static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { [TCA_TAPRIO_TC_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), }; static const struct netlink_range_validation_signed taprio_cycle_time_range = { .min = 0, .max = INT_MAX, }; static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) }, [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range), [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED }, }; static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb, struct sched_entry *entry, struct netlink_ext_ack *extack) { int min_duration = length_to_duration(q, ETH_ZLEN); u32 interval = 0; if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD]) entry->command = nla_get_u8( tb[TCA_TAPRIO_SCHED_ENTRY_CMD]); if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]) entry->gate_mask = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]); if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]) interval = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]); /* The interval should allow at least the minimum ethernet * frame to go out. */ if (interval < min_duration) { NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL; } entry->interval = interval; return 0; } static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n, struct sched_entry *entry, int index, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; int err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, entry_policy, NULL); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; } entry->index = index; return fill_sched_entry(q, tb, entry, extack); } static int parse_sched_list(struct taprio_sched *q, struct nlattr *list, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { struct nlattr *n; int err, rem; int i = 0; if (!list) return -EINVAL; nla_for_each_nested(n, list, rem) { struct sched_entry *entry; if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) { NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'"); continue; } entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); return -ENOMEM; } err = parse_sched_entry(q, n, entry, i, extack); if (err < 0) { kfree(entry); return err; } list_add_tail(&entry->list, &sched->entries); i++; } sched->num_entries = i; return i; } static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, struct sched_gate_list *new, struct netlink_ext_ack *extack) { int err = 0; if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) { NL_SET_ERR_MSG(extack, "Adding a single entry is not supported"); return -ENOTSUPP; } if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]) new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]) new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack); if (err < 0) return err; if (!new->cycle_time) { struct sched_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); if (!cycle) { NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0"); return -EINVAL; } if (cycle < 0 || cycle > INT_MAX) { NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); return -EINVAL; } new->cycle_time = cycle; } taprio_calculate_gate_durations(q, new); return 0; } static int taprio_parse_mqprio_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, struct netlink_ext_ack *extack, u32 taprio_flags) { bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags); if (!qopt && !dev->num_tc) { NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); return -EINVAL; } /* If num_tc is already set, it means that the user already * configured the mqprio part */ if (dev->num_tc) return 0; /* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) { NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); return -EINVAL; } /* For some reason, in txtime-assist mode, we allow TXQ ranges for * different TCs to overlap, and just validate the TXQ ranges. */ return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs, extack); } static int taprio_get_start_time(struct Qdisc *sch, struct sched_gate_list *sched, ktime_t *start) { struct taprio_sched *q = qdisc_priv(sch); ktime_t now, base, cycle; s64 n; base = sched_base_time(sched); now = taprio_get_time(q); if (ktime_after(base, now)) { *start = base; return 0; } cycle = sched->cycle_time; /* The qdisc is expected to have at least one sched_entry. Moreover, * any entry must have 'interval' > 0. Thus if the cycle time is zero, * something went really wrong. In that case, we should warn about this * inconsistent state and return error. */ if (WARN_ON(!cycle)) return -EFAULT; /* Schedule the start time for the beginning of the next * cycle. */ n = div64_s64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); return 0; } static void setup_first_end_time(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *first; ktime_t cycle; int tc; first = list_first_entry(&sched->entries, struct sched_entry, list); cycle = sched->cycle_time; /* FIXME: find a better place to do this */ sched->cycle_end_time = ktime_add_ns(base, cycle); first->end_time = ktime_add_ns(base, first->interval); taprio_set_budgets(q, sched, first); for (tc = 0; tc < num_tc; tc++) { if (first->gate_duration[tc] == sched->cycle_time) first->gate_close_time[tc] = KTIME_MAX; else first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]); } rcu_assign_pointer(q->current_entry, NULL); } static void taprio_start_sched(struct Qdisc *sch, ktime_t start, struct sched_gate_list *new) { struct taprio_sched *q = qdisc_priv(sch); ktime_t expires; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) return; expires = hrtimer_get_expires(&q->advance_timer); if (expires == 0) expires = KTIME_MAX; /* If the new schedule starts before the next expiration, we * reprogram it to the earliest one, so we change the admin * schedule to the operational one at the right time. */ start = min_t(ktime_t, start, expires); hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); } static void taprio_set_picos_per_byte(struct net_device *dev, struct taprio_sched *q) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; int picos_per_byte; int err; err = __ethtool_get_link_ksettings(dev, &ecmd); if (err < 0) goto skip; if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) speed = ecmd.base.speed; skip: picos_per_byte = (USEC_PER_SEC * 8) / speed; atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", dev->name, (long long)atomic64_read(&q->picos_per_byte), ecmd.base.speed); } static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct sched_gate_list *oper, *admin; struct qdisc_size_table *stab; struct taprio_sched *q; ASSERT_RTNL(); if (event != NETDEV_UP && event != NETDEV_CHANGE) return NOTIFY_DONE; list_for_each_entry(q, &taprio_list, taprio_list) { if (dev != qdisc_dev(q->root)) continue; taprio_set_picos_per_byte(dev, q); stab = rtnl_dereference(q->root->stab); oper = rtnl_dereference(q->oper_sched); if (oper) taprio_update_queue_max_sdu(q, oper, stab); admin = rtnl_dereference(q->admin_sched); if (admin) taprio_update_queue_max_sdu(q, admin, stab); break; } return NOTIFY_DONE; } static void setup_txtime(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct sched_entry *entry; u64 interval = 0; list_for_each_entry(entry, &sched->entries, list) { entry->next_txtime = ktime_add_ns(base, interval); interval += entry->interval; } } static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) { struct __tc_taprio_qopt_offload *__offload; __offload = kzalloc(struct_size(__offload, offload.entries, num_entries), GFP_KERNEL); if (!__offload) return NULL; refcount_set(&__offload->users, 1); return &__offload->offload; } struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); refcount_inc(&__offload->users); return offload; } EXPORT_SYMBOL_GPL(taprio_offload_get); void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); if (!refcount_dec_and_test(&__offload->users)) return; kfree(__offload); } EXPORT_SYMBOL_GPL(taprio_offload_free); /* The function will only serve to keep the pointers to the "oper" and "admin" * schedules valid in relation to their base times, so when calling dump() the * users looks at the right schedules. * When using full offload, the admin configuration is promoted to oper at the * base_time in the PHC time domain. But because the system time is not * necessarily in sync with that, we can't just trigger a hrtimer to call * switch_schedules at the right hardware time. * At the moment we call this by hand right away from taprio, but in the future * it will be useful to create a mechanism for drivers to notify taprio of the * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump(). * This is left as TODO. */ static void taprio_offload_config_changed(struct taprio_sched *q) { struct sched_gate_list *oper, *admin; oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); switch_schedules(q, &admin, &oper); } static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) { u32 i, queue_mask = 0; for (i = 0; i < dev->num_tc; i++) { u32 offset, count; if (!(tc_mask & BIT(i))) continue; offset = dev->tc_to_txq[i].offset; count = dev->tc_to_txq[i].count; queue_mask |= GENMASK(offset + count - 1, offset); } return queue_mask; } static void taprio_sched_to_offload(struct net_device *dev, struct sched_gate_list *sched, struct tc_taprio_qopt_offload *offload, const struct tc_taprio_caps *caps) { struct sched_entry *entry; int i = 0; offload->base_time = sched->base_time; offload->cycle_time = sched->cycle_time; offload->cycle_time_extension = sched->cycle_time_extension; list_for_each_entry(entry, &sched->entries, list) { struct tc_taprio_sched_entry *e = &offload->entries[i]; e->command = entry->command; e->interval = entry->interval; if (caps->gate_mask_per_txq) e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask); else e->gate_mask = entry->gate_mask; i++; } offload->num_entries = i; } static void taprio_detect_broken_mqprio(struct taprio_sched *q) { struct net_device *dev = qdisc_dev(q->root); struct tc_taprio_caps caps; qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); q->broken_mqprio = caps.broken_mqprio; if (q->broken_mqprio) static_branch_inc(&taprio_have_broken_mqprio); else static_branch_inc(&taprio_have_working_mqprio); q->detected_mqprio = true; } static void taprio_cleanup_broken_mqprio(struct taprio_sched *q) { if (!q->detected_mqprio) return; if (q->broken_mqprio) static_branch_dec(&taprio_have_broken_mqprio); else static_branch_dec(&taprio_have_working_mqprio); } static int taprio_enable_offload(struct net_device *dev, struct taprio_sched *q, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; struct tc_taprio_caps caps; int tc, err = 0; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Device does not support taprio offload"); return -EOPNOTSUPP; } qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); if (!caps.supports_queue_max_sdu) { for (tc = 0; tc < TC_MAX_QUEUE; tc++) { if (q->max_sdu[tc]) { NL_SET_ERR_MSG_MOD(extack, "Device does not handle queueMaxSDU"); return -EOPNOTSUPP; } } } offload = taprio_offload_alloc(sched->num_entries); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory for enabling offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_REPLACE; offload->extack = extack; mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt); offload->mqprio.extack = extack; taprio_sched_to_offload(dev, sched, offload, &caps); mqprio_fp_to_offload(q->fp, &offload->mqprio); for (tc = 0; tc < TC_MAX_QUEUE; tc++) offload->max_sdu[tc] = q->max_sdu[tc]; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG_WEAK(extack, "Device failed to setup taprio offload"); goto done; } q->offloaded = true; done: /* The offload structure may linger around via a reference taken by the * device driver, so clear up the netlink extack pointer so that the * driver isn't tempted to dereference data which stopped being valid */ offload->extack = NULL; offload->mqprio.extack = NULL; taprio_offload_free(offload); return err; } static int taprio_disable_offload(struct net_device *dev, struct taprio_sched *q, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; int err; if (!q->offloaded) return 0; offload = taprio_offload_alloc(0); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory to disable offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_DESTROY; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG(extack, "Device failed to disable offload"); goto out; } q->offloaded = false; out: taprio_offload_free(offload); return err; } /* If full offload is enabled, the only possible clockid is the net device's * PHC. For that reason, specifying a clockid through netlink is incorrect. * For txtime-assist, it is implicitly assumed that the device's PHC is kept * in sync with the specified clockid via a user space daemon such as phc2sys. * For both software taprio and txtime-assist, the clockid is used for the * hrtimer that advances the schedule and hence mandatory. */ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int err = -EINVAL; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_ts_info info = { .cmd = ETHTOOL_GET_TS_INFO, .phc_index = -1, }; if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { NL_SET_ERR_MSG(extack, "The 'clockid' cannot be specified for full offload"); goto out; } if (ops && ops->get_ts_info) err = ops->get_ts_info(dev, &info); if (err || info.phc_index < 0) { NL_SET_ERR_MSG(extack, "Device does not have a PTP clock"); err = -ENOTSUPP; goto out; } } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); enum tk_offsets tk_offset; /* We only support static clockids and we don't allow * for it to be modified after the first init. */ if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) { NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported"); err = -ENOTSUPP; goto out; } switch (clockid) { case CLOCK_REALTIME: tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); err = -EINVAL; goto out; } /* This pairs with READ_ONCE() in taprio_mono_to_any */ WRITE_ONCE(q->tk_offset, tk_offset); q->clockid = clockid; } else { NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); goto out; } /* Everything went ok, return success. */ err = 0; out: return err; } static int taprio_parse_tc_entry(struct Qdisc *sch, struct nlattr *opt, u32 max_sdu[TC_QOPT_MAX_QUEUE], u32 fp[TC_QOPT_MAX_QUEUE], unsigned long *seen_tcs, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { }; struct net_device *dev = qdisc_dev(sch); int err, tc; u32 val; err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt, taprio_tc_policy, extack); if (err < 0) return err; if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) { NL_SET_ERR_MSG_MOD(extack, "TC entry index missing"); return -EINVAL; } tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); if (tc >= TC_QOPT_MAX_QUEUE) { NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); return -ERANGE; } if (*seen_tcs & BIT(tc)) { NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry"); return -EINVAL; } *seen_tcs |= BIT(tc); if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) { val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]); if (val > dev->max_mtu) { NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU"); return -ERANGE; } max_sdu[tc] = val; } if (tb[TCA_TAPRIO_TC_ENTRY_FP]) fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]); return 0; } static int taprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); u32 max_sdu[TC_QOPT_MAX_QUEUE]; bool have_preemption = false; unsigned long seen_tcs = 0; u32 fp[TC_QOPT_MAX_QUEUE]; struct nlattr *n; int tc, rem; int err = 0; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { max_sdu[tc] = q->max_sdu[tc]; fp[tc] = q->fp[tc]; } nla_for_each_nested(n, opt, rem) { if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY) continue; err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs, extack); if (err) return err; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { q->max_sdu[tc] = max_sdu[tc]; q->fp[tc] = fp[tc]; if (fp[tc] != TC_FP_EXPRESS) have_preemption = true; } if (have_preemption) { if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG(extack, "Preemption only supported with full offload"); return -EOPNOTSUPP; } if (!ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG(extack, "Device does not support preemption"); return -EOPNOTSUPP; } } return err; } static int taprio_mqprio_cmp(const struct net_device *dev, const struct tc_mqprio_qopt *mqprio) { int i; if (!mqprio || mqprio->num_tc != dev->num_tc) return -1; for (i = 0; i < mqprio->num_tc; i++) if (dev->tc_to_txq[i].count != mqprio->count[i] || dev->tc_to_txq[i].offset != mqprio->offset[i]) return -1; for (i = 0; i <= TC_BITMASK; i++) if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i]) return -1; return 0; } /* The semantics of the 'flags' argument in relation to 'change()' * requests, are interpreted following two rules (which are applied in * this order): (1) an omitted 'flags' argument is interpreted as * zero; (2) the 'flags' of a "running" taprio instance cannot be * changed. */ static int taprio_new_flags(const struct nlattr *attr, u32 old, struct netlink_ext_ack *extack) { u32 new = 0; if (attr) new = nla_get_u32(attr); if (old != TAPRIO_FLAGS_INVALID && old != new) { NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); return -EOPNOTSUPP; } if (!taprio_flags_valid(new)) { NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); return -EINVAL; } return new; } static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct qdisc_size_table *stab = rtnl_dereference(sch->stab); struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; struct sched_gate_list *oper, *admin, *new_admin; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; unsigned long flags; ktime_t start; int i, err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt, taprio_policy, extack); if (err < 0) return err; if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); err = taprio_new_flags(tb[TCA_TAPRIO_ATTR_FLAGS], q->flags, extack); if (err < 0) return err; q->flags = err; err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) return err; err = taprio_parse_tc_entries(sch, opt, extack); if (err) return err; new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); if (!new_admin) { NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); return -ENOMEM; } INIT_LIST_HEAD(&new_admin->entries); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); /* no changes - no new mqprio settings */ if (!taprio_mqprio_cmp(dev, mqprio)) mqprio = NULL; if (mqprio && (oper || admin)) { NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported"); err = -ENOTSUPP; goto free_sched; } if (mqprio) { err = netdev_set_num_tc(dev, mqprio->num_tc); if (err) goto free_sched; for (i = 0; i < mqprio->num_tc; i++) { netdev_set_tc_queue(dev, i, mqprio->count[i], mqprio->offset[i]); q->cur_txq[i] = mqprio->offset[i]; } /* Always use supplied priority mappings */ for (i = 0; i <= TC_BITMASK; i++) netdev_set_prio_tc_map(dev, i, mqprio->prio_tc_map[i]); } err = parse_taprio_schedule(q, tb, new_admin, extack); if (err < 0) goto free_sched; if (new_admin->num_entries == 0) { NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule"); err = -EINVAL; goto free_sched; } err = taprio_parse_clockid(sch, tb, extack); if (err < 0) goto free_sched; taprio_set_picos_per_byte(dev, q); taprio_update_queue_max_sdu(q, new_admin, stab); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) err = taprio_enable_offload(dev, q, new_admin, extack); else err = taprio_disable_offload(dev, q, extack); if (err) goto free_sched; /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled"); err = -EINVAL; goto unlock; } q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); } if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; } err = taprio_get_start_time(sch, new_admin, &start); if (err < 0) { NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); goto unlock; } setup_txtime(q, new_admin, start); if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { if (!oper) { rcu_assign_pointer(q->oper_sched, new_admin); err = 0; new_admin = NULL; goto unlock; } rcu_assign_pointer(q->admin_sched, new_admin); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); } else { setup_first_end_time(q, new_admin, start); /* Protects against advance_sched() */ spin_lock_irqsave(&q->current_entry_lock, flags); taprio_start_sched(sch, start, new_admin); rcu_assign_pointer(q->admin_sched, new_admin); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); spin_unlock_irqrestore(&q->current_entry_lock, flags); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) taprio_offload_config_changed(q); } new_admin = NULL; err = 0; if (!stab) NL_SET_ERR_MSG_MOD(extack, "Size table not specified, frame length estimations may be inaccurate"); unlock: spin_unlock_bh(qdisc_lock(sch)); free_sched: if (new_admin) call_rcu(&new_admin->rcu, taprio_free_sched_cb); return err; } static void taprio_reset(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i; hrtimer_cancel(&q->advance_timer); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) if (q->qdiscs[i]) qdisc_reset(q->qdiscs[i]); } } static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; unsigned int i; list_del(&q->taprio_list); /* Note that taprio_reset() might not be called if an error * happens in qdisc_create(), after taprio_init() has been called. */ hrtimer_cancel(&q->advance_timer); qdisc_synchronize(sch); taprio_disable_offload(dev, q, NULL); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) qdisc_put(q->qdiscs[i]); kfree(q->qdiscs); } q->qdiscs = NULL; netdev_reset_tc(dev); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); if (oper) call_rcu(&oper->rcu, taprio_free_sched_cb); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); taprio_cleanup_broken_mqprio(q); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i, tc; spin_lock_init(&q->current_entry_lock); hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; q->root = sch; /* We only support static clockids. Use an invalid value as default * and get the valid one on taprio_change(). */ q->clockid = -1; q->flags = TAPRIO_FLAGS_INVALID; list_add(&q->taprio_list, &taprio_list); if (sch->parent != TC_H_ROOT) { NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc"); return -EOPNOTSUPP; } if (!netif_is_multiqueue(dev)) { NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required"); return -EOPNOTSUPP; } q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]), GFP_KERNEL); if (!q->qdiscs) return -ENOMEM; if (!opt) return -EINVAL; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; struct Qdisc *qdisc; dev_queue = netdev_get_tx_queue(dev, i); qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1)), extack); if (!qdisc) return -ENOMEM; if (i < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); q->qdiscs[i] = qdisc; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) q->fp[tc] = TC_FP_EXPRESS; taprio_detect_broken_mqprio(q); return taprio_change(sch, opt, extack); } static void taprio_attach(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx; /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); struct Qdisc *old, *dev_queue_qdisc; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { struct Qdisc *qdisc = q->qdiscs[ntx]; /* In offload mode, the root taprio qdisc is bypassed * and the netdev TX queues see the children directly */ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue_qdisc = qdisc; } else { /* In software mode, attach the root taprio qdisc * to all netdev TX queues, so that dev_qdisc_enqueue() * goes through taprio_enqueue(). */ dev_queue_qdisc = sch; } old = dev_graft_qdisc(dev_queue, dev_queue_qdisc); /* The qdisc's refcount requires to be elevated once * for each netdev TX queue it is grafted onto */ qdisc_refcount_inc(dev_queue_qdisc); if (old) qdisc_put(old); } } static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return netdev_get_tx_queue(dev, ntx); } static int taprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); if (!dev_queue) return -EINVAL; if (dev->flags & IFF_UP) dev_deactivate(dev); /* In offload mode, the child Qdisc is directly attached to the netdev * TX queue, and thus, we need to keep its refcount elevated in order * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue. * However, save the reference to the new qdisc in the private array in * both software and offload cases, to have an up-to-date reference to * our children. */ *old = q->qdiscs[cl - 1]; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old); if (new) qdisc_refcount_inc(new); if (*old) qdisc_put(*old); } q->qdiscs[cl - 1] = new; if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; } static int dump_entry(struct sk_buff *msg, const struct sched_entry *entry) { struct nlattr *item; item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY); if (!item) return -ENOSPC; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, entry->gate_mask)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL, entry->interval)) goto nla_put_failure; return nla_nest_end(msg, item); nla_put_failure: nla_nest_cancel(msg, item); return -1; } static int dump_schedule(struct sk_buff *msg, const struct sched_gate_list *root) { struct nlattr *entry_list; struct sched_entry *entry; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, root->base_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, root->cycle_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, root->cycle_time_extension, TCA_TAPRIO_PAD)) return -1; entry_list = nla_nest_start_noflag(msg, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); if (!entry_list) goto error_nest; list_for_each_entry(entry, &root->entries, list) { if (dump_entry(msg, entry) < 0) goto error_nest; } nla_nest_end(msg, entry_list); return 0; error_nest: nla_nest_cancel(msg, entry_list); return -1; } static int taprio_dump_tc_entries(struct sk_buff *skb, struct taprio_sched *q, struct sched_gate_list *sched) { struct nlattr *n; int tc; for (tc = 0; tc < TC_MAX_QUEUE; tc++) { n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY); if (!n) return -EMSGSIZE; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc)) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU, sched->max_sdu[tc])) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc])) goto nla_put_failure; nla_nest_end(skb, n); } return 0; nla_put_failure: nla_nest_cancel(skb, n); return -EMSGSIZE; } static int taprio_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) { if (val == TAPRIO_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, TCA_TAPRIO_OFFLOAD_STATS_PAD)) return -EMSGSIZE; return 0; } static int taprio_dump_xstats(struct Qdisc *sch, struct gnet_dump *d, struct tc_taprio_qopt_offload *offload, struct tc_taprio_qopt_stats *stats) { struct net_device *dev = qdisc_dev(sch); const struct net_device_ops *ops; struct sk_buff *skb = d->skb; struct nlattr *xstats; int err; ops = qdisc_dev(sch)->netdev_ops; /* FIXME I could use qdisc_offload_dump_helper(), but that messes * with sch->flags depending on whether the device reports taprio * stats, and I'm not sure whether that's a good idea, considering * that stats are optional to the offload itself */ if (!ops->ndo_setup_tc) return 0; memset(stats, 0xff, sizeof(*stats)); err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err == -EOPNOTSUPP) return 0; if (err) return err; xstats = nla_nest_start(skb, TCA_STATS_APP); if (!xstats) goto err; if (taprio_put_stat(skb, stats->window_drops, TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) || taprio_put_stat(skb, stats->tx_overruns, TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS)) goto err_cancel; nla_nest_end(skb, xstats); return 0; err_cancel: nla_nest_cancel(skb, xstats); err: return -EMSGSIZE; } static int taprio_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_STATS, }; return taprio_dump_xstats(sch, d, &offload, &offload.stats); } static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; struct tc_mqprio_qopt opt = { 0 }; struct nlattr *nest, *sched_nest; oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); mqprio_qopt_reconstruct(dev, &opt); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto start_error; if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) goto options_error; if (!FULL_OFFLOAD_IS_ENABLED(q->flags) && nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) goto options_error; if (q->txtime_delay && nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; if (oper && taprio_dump_tc_entries(skb, q, oper)) goto options_error; if (oper && dump_schedule(skb, oper)) goto options_error; if (!admin) goto done; sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED); if (!sched_nest) goto options_error; if (dump_schedule(skb, admin)) goto admin_error; nla_nest_end(skb, sched_nest); done: return nla_nest_end(skb, nest); admin_error: nla_nest_cancel(skb, sched_nest); options_error: nla_nest_cancel(skb, nest); start_error: return -ENOSPC; } static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return q->qdiscs[ntx]; } static unsigned long taprio_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); if (!taprio_queue_get(sch, ntx)) return 0; return ntx; } static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct Qdisc *child = taprio_leaf(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = child->handle; return 0; } static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) __releases(d->lock) __acquires(d->lock) { struct Qdisc *child = taprio_leaf(sch, cl); struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_QUEUE_STATS, .queue_stats = { .queue = cl - 1, }, }; if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 || qdisc_qstats_copy(d, child) < 0) return -1; return taprio_dump_xstats(sch, d, &offload, &offload.queue_stats.stats); } static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx; if (arg->stop) return; arg->count = arg->skip; for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { if (!tc_qdisc_stats_dump(sch, ntx + 1, arg)) break; } } static struct netdev_queue *taprio_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static const struct Qdisc_class_ops taprio_class_ops = { .graft = taprio_graft, .leaf = taprio_leaf, .find = taprio_find, .walk = taprio_walk, .dump = taprio_dump_class, .dump_stats = taprio_dump_class_stats, .select_queue = taprio_select_queue, }; static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .cl_ops = &taprio_class_ops, .id = "taprio", .priv_size = sizeof(struct taprio_sched), .init = taprio_init, .change = taprio_change, .destroy = taprio_destroy, .reset = taprio_reset, .attach = taprio_attach, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, .dump = taprio_dump, .dump_stats = taprio_dump_stats, .owner = THIS_MODULE, }; static struct notifier_block taprio_device_notifier = { .notifier_call = taprio_dev_notifier, }; static int __init taprio_module_init(void) { int err = register_netdevice_notifier(&taprio_device_notifier); if (err) return err; return register_qdisc(&taprio_qdisc_ops); } static void __exit taprio_module_exit(void) { unregister_qdisc(&taprio_qdisc_ops); unregister_netdevice_notifier(&taprio_device_notifier); } module_init(taprio_module_init); module_exit(taprio_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Time Aware Priority qdisc"); |
11451 18 43 1 43 43 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor file mediation function definitions. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #ifndef __AA_FILE_H #define __AA_FILE_H #include <linux/spinlock.h> #include "domain.h" #include "match.h" #include "perms.h" struct aa_policydb; struct aa_profile; struct path; #define mask_mode_t(X) (X & (MAY_EXEC | MAY_WRITE | MAY_READ | MAY_APPEND)) #define AA_AUDIT_FILE_MASK (MAY_READ | MAY_WRITE | MAY_EXEC | MAY_APPEND |\ AA_MAY_CREATE | AA_MAY_DELETE | \ AA_MAY_GETATTR | AA_MAY_SETATTR | \ AA_MAY_CHMOD | AA_MAY_CHOWN | AA_MAY_LOCK | \ AA_EXEC_MMAP | AA_MAY_LINK) static inline struct aa_file_ctx *file_ctx(struct file *file) { return file->f_security + apparmor_blob_sizes.lbs_file; } /* struct aa_file_ctx - the AppArmor context the file was opened in * @lock: lock to update the ctx * @label: label currently cached on the ctx * @perms: the permission the file was opened with */ struct aa_file_ctx { spinlock_t lock; struct aa_label __rcu *label; u32 allow; }; /* * The xindex is broken into 3 parts * - index - an index into either the exec name table or the variable table * - exec type - which determines how the executable name and index are used * - flags - which modify how the destination name is applied */ #define AA_X_INDEX_MASK AA_INDEX_MASK #define AA_X_TYPE_MASK 0x0c000000 #define AA_X_NONE AA_INDEX_NONE #define AA_X_NAME 0x04000000 /* use executable name px */ #define AA_X_TABLE 0x08000000 /* use a specified name ->n# */ #define AA_X_UNSAFE 0x10000000 #define AA_X_CHILD 0x20000000 #define AA_X_INHERIT 0x40000000 #define AA_X_UNCONFINED 0x80000000 /* need to make conditional which ones are being set */ struct path_cond { kuid_t uid; umode_t mode; }; #define COMBINED_PERM_MASK(X) ((X).allow | (X).audit | (X).quiet | (X).kill) int aa_audit_file(const struct cred *cred, struct aa_profile *profile, struct aa_perms *perms, const char *op, u32 request, const char *name, const char *target, struct aa_label *tlabel, kuid_t ouid, const char *info, int error); struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, aa_state_t state, struct path_cond *cond); aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, const char *name, struct path_cond *cond, struct aa_perms *perms); int aa_path_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, u32 request, struct path_cond *cond); int aa_path_link(const struct cred *subj_cred, struct aa_label *label, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry); int aa_file_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, struct file *file, u32 request, bool in_atomic); void aa_inherit_files(const struct cred *cred, struct files_struct *files); /** * aa_map_file_perms - map file flags to AppArmor permissions * @file: open file to map flags to AppArmor permissions * * Returns: apparmor permission set for the file */ static inline u32 aa_map_file_to_perms(struct file *file) { int flags = file->f_flags; u32 perms = 0; if (file->f_mode & FMODE_WRITE) perms |= MAY_WRITE; if (file->f_mode & FMODE_READ) perms |= MAY_READ; if ((flags & O_APPEND) && (perms & MAY_WRITE)) perms = (perms & ~MAY_WRITE) | MAY_APPEND; /* trunc implies write permission */ if (flags & O_TRUNC) perms |= MAY_WRITE; if (flags & O_CREAT) perms |= AA_MAY_CREATE; return perms; } #endif /* __AA_FILE_H */ |
10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 | // SPDX-License-Identifier: GPL-2.0-only /* * Common interrupt code for 32 and 64 bit */ #include <linux/cpu.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/of.h> #include <linux/seq_file.h> #include <linux/smp.h> #include <linux/ftrace.h> #include <linux/delay.h> #include <linux/export.h> #include <linux/irq.h> #include <asm/irq_stack.h> #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/irq.h> #include <asm/mce.h> #include <asm/hw_irq.h> #include <asm/desc.h> #include <asm/traps.h> #include <asm/thermal.h> #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); atomic_t irq_err_count; /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. */ void ack_bad_irq(unsigned int irq) { if (printk_ratelimit()) pr_err("unexpected IRQ trap at vector %02x\n", irq); /* * Currently unexpected vectors happen only on SMP and APIC. * We _must_ ack these because every local APIC has only N * irq slots per priority level, and a 'hanging, unacked' IRQ * holds up an irq slot - in excessive cases (when multiple * unexpected vectors occur) that might lock up the APIC * completely. * But only ack when the APIC is enabled -AK */ apic_eoi(); } #define irq_stats(x) (&per_cpu(irq_stat, x)) /* * /proc/interrupts printing for arch specific interrupts */ int arch_show_interrupts(struct seq_file *p, int prec) { int j; seq_printf(p, "%*s: ", prec, "NMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); seq_puts(p, " Non-maskable interrupts\n"); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "%*s: ", prec, "LOC"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_puts(p, " Local timer interrupts\n"); seq_printf(p, "%*s: ", prec, "SPU"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_puts(p, " Spurious interrupts\n"); seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_puts(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "IWI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); seq_puts(p, " IRQ work interrupts\n"); seq_printf(p, "%*s: ", prec, "RTR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); seq_puts(p, " APIC ICR read retries\n"); if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_puts(p, " Platform interrupts\n"); } #endif #ifdef CONFIG_SMP seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); seq_puts(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); seq_puts(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); seq_puts(p, " TLB shootdowns\n"); #endif #ifdef CONFIG_X86_THERMAL_VECTOR seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_puts(p, " Thermal event interrupts\n"); #endif #ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_puts(p, " Threshold APIC interrupts\n"); #endif #ifdef CONFIG_X86_MCE_AMD seq_printf(p, "%*s: ", prec, "DFR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count); seq_puts(p, " Deferred Error APIC interrupts\n"); #endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); seq_puts(p, " Machine check exceptions\n"); seq_printf(p, "%*s: ", prec, "MCP"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); seq_puts(p, " Machine check polls\n"); #endif #ifdef CONFIG_X86_HV_CALLBACK_VECTOR if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) { seq_printf(p, "%*s: ", prec, "HYP"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); seq_puts(p, " Hypervisor callback interrupts\n"); } #endif #if IS_ENABLED(CONFIG_HYPERV) if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) { seq_printf(p, "%*s: ", prec, "HRE"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_hv_reenlightenment_count); seq_puts(p, " Hyper-V reenlightenment interrupts\n"); } if (test_bit(HYPERV_STIMER0_VECTOR, system_vectors)) { seq_printf(p, "%*s: ", prec, "HVS"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->hyperv_stimer0_count); seq_puts(p, " Hyper-V stimer0 interrupts\n"); } #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); #endif #ifdef CONFIG_HAVE_KVM seq_printf(p, "%*s: ", prec, "PIN"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); seq_puts(p, " Posted-interrupt notification event\n"); seq_printf(p, "%*s: ", prec, "NPI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_nested_ipis); seq_puts(p, " Nested posted-interrupt event\n"); seq_printf(p, "%*s: ", prec, "PIW"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_wakeup_ipis); seq_puts(p, " Posted-interrupt wakeup event\n"); #endif return 0; } /* * /proc/stat helpers */ u64 arch_irq_stat_cpu(unsigned int cpu) { u64 sum = irq_stats(cpu)->__nmi_count; #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_irq_work_irqs; sum += irq_stats(cpu)->icr_read_retry_count; if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; #endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; #endif #ifdef CONFIG_X86_THERMAL_VECTOR sum += irq_stats(cpu)->irq_thermal_count; #endif #ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; #endif #ifdef CONFIG_X86_HV_CALLBACK_VECTOR sum += irq_stats(cpu)->irq_hv_callback_count; #endif #if IS_ENABLED(CONFIG_HYPERV) sum += irq_stats(cpu)->irq_hv_reenlightenment_count; sum += irq_stats(cpu)->hyperv_stimer0_count; #endif #ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); #endif return sum; } u64 arch_irq_stat(void) { u64 sum = atomic_read(&irq_err_count); return sum; } static __always_inline void handle_irq(struct irq_desc *desc, struct pt_regs *regs) { if (IS_ENABLED(CONFIG_X86_64)) generic_handle_irq_desc(desc); else __handle_irq(desc, regs); } /* * common_interrupt() handles all normal device IRQ's (the special SMP * cross-CPU interrupts have their own entry points). */ DEFINE_IDTENTRY_IRQ(common_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; /* entry code tells RCU that we're not quiescent. Check it. */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { handle_irq(desc, regs); } else { apic_eoi(); if (desc == VECTOR_UNUSED) { pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", __func__, smp_processor_id(), vector); } else { __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); } } set_irq_regs(old_regs); } #ifdef CONFIG_X86_LOCAL_APIC /* Function pointer for generic interrupt vector handling */ void (*x86_platform_ipi_callback)(void) = NULL; /* * Handler for X86_PLATFORM_IPI_VECTOR. */ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) { struct pt_regs *old_regs = set_irq_regs(regs); apic_eoi(); trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) x86_platform_ipi_callback(); trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); set_irq_regs(old_regs); } #endif #ifdef CONFIG_HAVE_KVM static void dummy_handler(void) {} static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) { if (handler) kvm_posted_intr_wakeup_handler = handler; else { kvm_posted_intr_wakeup_handler = dummy_handler; synchronize_rcu(); } } EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); /* * Handler for POSTED_INTERRUPT_VECTOR. */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) { apic_eoi(); inc_irq_stat(kvm_posted_intr_ipis); } /* * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. */ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) { apic_eoi(); inc_irq_stat(kvm_posted_intr_wakeup_ipis); kvm_posted_intr_wakeup_handler(); } /* * Handler for POSTED_INTERRUPT_NESTED_VECTOR. */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) { apic_eoi(); inc_irq_stat(kvm_posted_intr_nested_ipis); } #endif #ifdef CONFIG_HOTPLUG_CPU /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) { unsigned int irr, vector; struct irq_desc *desc; struct irq_data *data; struct irq_chip *chip; irq_migrate_all_off_this_cpu(); /* * We can remove mdelay() and then send spurious interrupts to * new cpu targets for all the irqs that were handled previously by * this cpu. While it works, I have seen spurious interrupt messages * (nothing wrong but still...). * * So for now, retain mdelay(1) and check the IRR and then send those * interrupts to new targets as this cpu is already offlined... */ mdelay(1); /* * We can walk the vector array of this cpu without holding * vector_lock because the cpu is already marked !online, so * nothing else will touch it. */ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) continue; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); if (irr & (1 << (vector % 32))) { desc = __this_cpu_read(vector_irq[vector]); raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); chip = irq_data_get_irq_chip(data); if (chip->irq_retrigger) { chip->irq_retrigger(data); __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED); } raw_spin_unlock(&desc->lock); } if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED) __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); } } #endif #ifdef CONFIG_X86_THERMAL_VECTOR static void smp_thermal_vector(void) { if (x86_thermal_enabled()) intel_thermal_interrupt(); else pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); } DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) { trace_thermal_apic_entry(THERMAL_APIC_VECTOR); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); trace_thermal_apic_exit(THERMAL_APIC_VECTOR); apic_eoi(); } #endif |
1717 1718 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 | // SPDX-License-Identifier: GPL-2.0-or-later /* * vrf.c: device driver to encapsulate a VRF space * * Copyright (c) 2015 Cumulus Networks. All rights reserved. * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> * * Based on dummy, team and ipvlan drivers */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ip.h> #include <linux/init.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <linux/u64_stats_sync.h> #include <linux/hashtable.h> #include <linux/spinlock_types.h> #include <linux/inetdevice.h> #include <net/arp.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #include <net/fib_rules.h> #include <net/sch_generic.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack.h> #define DRV_NAME "vrf" #define DRV_VERSION "1.1" #define FIB_RULE_PREF 1000 /* default preference for FIB rules */ #define HT_MAP_BITS 4 #define HASH_INITVAL ((u32)0xcafef00d) struct vrf_map { DECLARE_HASHTABLE(ht, HT_MAP_BITS); spinlock_t vmap_lock; /* shared_tables: * count how many distinct tables do not comply with the strict mode * requirement. * shared_tables value must be 0 in order to enable the strict mode. * * example of the evolution of shared_tables: * | time * add vrf0 --> table 100 shared_tables = 0 | t0 * add vrf1 --> table 101 shared_tables = 0 | t1 * add vrf2 --> table 100 shared_tables = 1 | t2 * add vrf3 --> table 100 shared_tables = 1 | t3 * add vrf4 --> table 101 shared_tables = 2 v t4 * * shared_tables is a "step function" (or "staircase function") * and it is increased by one when the second vrf is associated to a * table. * * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1. * * at t3, another dev (vrf3) is bound to the same table 100 but the * value of shared_tables is still 1. * This means that no matter how many new vrfs will register on the * table 100, the shared_tables will not increase (considering only * table 100). * * at t4, vrf4 is bound to table 101, and shared_tables = 2. * * Looking at the value of shared_tables we can immediately know if * the strict_mode can or cannot be enforced. Indeed, strict_mode * can be enforced iff shared_tables = 0. * * Conversely, shared_tables is decreased when a vrf is de-associated * from a table with exactly two associated vrfs. */ u32 shared_tables; bool strict_mode; }; struct vrf_map_elem { struct hlist_node hnode; struct list_head vrf_list; /* VRFs registered to this table */ u32 table_id; int users; int ifindex; }; static unsigned int vrf_net_id; /* per netns vrf data */ struct netns_vrf { /* protected by rtnl lock */ bool add_fib_rules; struct vrf_map vmap; struct ctl_table_header *ctl_hdr; }; struct net_vrf { struct rtable __rcu *rth; struct rt6_info __rcu *rt6; #if IS_ENABLED(CONFIG_IPV6) struct fib6_table *fib6_table; #endif u32 tb_id; struct list_head me_list; /* entry in vrf_map_elem */ int ifindex; }; static void vrf_rx_stats(struct net_device *dev, int len) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); dstats->rx_packets++; dstats->rx_bytes += len; u64_stats_update_end(&dstats->syncp); } static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) { vrf_dev->stats.tx_errors++; kfree_skb(skb); } static void vrf_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { int i; for_each_possible_cpu(i) { const struct pcpu_dstats *dstats; u64 tbytes, tpkts, tdrops, rbytes, rpkts; unsigned int start; dstats = per_cpu_ptr(dev->dstats, i); do { start = u64_stats_fetch_begin(&dstats->syncp); tbytes = dstats->tx_bytes; tpkts = dstats->tx_packets; tdrops = dstats->tx_drops; rbytes = dstats->rx_bytes; rpkts = dstats->rx_packets; } while (u64_stats_fetch_retry(&dstats->syncp, start)); stats->tx_bytes += tbytes; stats->tx_packets += tpkts; stats->tx_dropped += tdrops; stats->rx_bytes += rbytes; stats->rx_packets += rpkts; } } static struct vrf_map *netns_vrf_map(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); return &nn_vrf->vmap; } static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev) { return netns_vrf_map(dev_net(dev)); } static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me) { struct list_head *me_head = &me->vrf_list; struct net_vrf *vrf; if (list_empty(me_head)) return -ENODEV; vrf = list_first_entry(me_head, struct net_vrf, me_list); return vrf->ifindex; } static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags) { struct vrf_map_elem *me; me = kmalloc(sizeof(*me), flags); if (!me) return NULL; return me; } static void vrf_map_elem_free(struct vrf_map_elem *me) { kfree(me); } static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id, int ifindex, int users) { me->table_id = table_id; me->ifindex = ifindex; me->users = users; INIT_LIST_HEAD(&me->vrf_list); } static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap, u32 table_id) { struct vrf_map_elem *me; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_for_each_possible(vmap->ht, me, hnode, key) { if (me->table_id == table_id) return me; } return NULL; } static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me) { u32 table_id = me->table_id; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_add(vmap->ht, &me->hnode, key); } static void vrf_map_del_elem(struct vrf_map_elem *me) { hash_del(&me->hnode); } static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock) { spin_lock(&vmap->vmap_lock); } static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock) { spin_unlock(&vmap->vmap_lock); } /* called with rtnl lock held */ static int vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); struct vrf_map_elem *new_me, *me; u32 table_id = vrf->tb_id; bool free_new_me = false; int users; int res; /* we pre-allocate elements used in the spin-locked section (so that we * keep the spinlock as short as possible). */ new_me = vrf_map_elem_alloc(GFP_KERNEL); if (!new_me) return -ENOMEM; vrf_map_elem_init(new_me, table_id, dev->ifindex, 0); vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) { me = new_me; vrf_map_add_elem(vmap, me); goto link_vrf; } /* we already have an entry in the vrf_map, so it means there is (at * least) a vrf registered on the specific table. */ free_new_me = true; if (vmap->strict_mode) { /* vrfs cannot share the same table */ NL_SET_ERR_MSG(extack, "Table is used by another VRF"); res = -EBUSY; goto unlock; } link_vrf: users = ++me->users; if (users == 2) ++vmap->shared_tables; list_add(&vrf->me_list, &me->vrf_list); res = 0; unlock: vrf_map_unlock(vmap); /* clean-up, if needed */ if (free_new_me) vrf_map_elem_free(new_me); return res; } /* called with rtnl lock held */ static void vrf_map_unregister_dev(struct net_device *dev) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); u32 table_id = vrf->tb_id; struct vrf_map_elem *me; int users; vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) goto unlock; list_del(&vrf->me_list); users = --me->users; if (users == 1) { --vmap->shared_tables; } else if (users == 0) { vrf_map_del_elem(me); /* no one will refer to this element anymore */ vrf_map_elem_free(me); } unlock: vrf_map_unlock(vmap); } /* return the vrf device index associated with the table_id */ static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id) { struct vrf_map *vmap = netns_vrf_map(net); struct vrf_map_elem *me; int ifindex; vrf_map_lock(vmap); if (!vmap->strict_mode) { ifindex = -EPERM; goto unlock; } me = vrf_map_lookup_elem(vmap, table_id); if (!me) { ifindex = -ENODEV; goto unlock; } ifindex = vrf_map_elem_get_vrf_ifindex(me); unlock: vrf_map_unlock(vmap); return ifindex; } /* by default VRF devices do not have a qdisc and are expected * to be created with only a single queue. */ static bool qdisc_tx_is_default(const struct net_device *dev) { struct netdev_queue *txq; struct Qdisc *qdisc; if (dev->num_tx_queues > 1) return false; txq = netdev_get_tx_queue(dev, 0); qdisc = rcu_access_pointer(txq->qdisc); return !qdisc->enqueue; } /* Local traffic destined to local address. Reinsert the packet to rx * path, similar to loopback handling. */ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst) { int len = skb->len; skb_orphan(skb); skb_dst_set(skb, dst); /* set pkt_type to avoid skb hitting packet taps twice - * once on Tx and again in Rx processing */ skb->pkt_type = PACKET_LOOPBACK; skb->protocol = eth_type_trans(skb, dev); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) vrf_rx_stats(dev, len); else this_cpu_inc(dev->dstats->rx_drops); return NETDEV_TX_OK; } static void vrf_nf_set_untracked(struct sk_buff *skb) { if (skb_get_nfct(skb) == 0) nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } static void vrf_nf_reset_ct(struct sk_buff *skb) { if (skb_get_nfct(skb) == IP_CT_UNTRACKED) nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) static int vrf_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { const struct ipv6hdr *iph; struct net *net = dev_net(skb->dev); struct flowi6 fl6; int ret = NET_XMIT_DROP; struct dst_entry *dst; struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) goto err; iph = ipv6_hdr(skb); memset(&fl6, 0, sizeof(fl6)); /* needed to match OIF rule */ fl6.flowi6_l3mdev = dev->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = iph->nexthdr; dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst) || dst == dst_null) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. */ if (dst->dev == dev) return vrf_local_xmit(skb, dev, dst); skb_dst_set(skb, dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; return ret; err: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #else static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #endif /* based on ip_local_out; can't use it b/c the dst is switched pointing to us */ static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { struct iphdr *ip4h; int ret = NET_XMIT_DROP; struct flowi4 fl4; struct net *net = dev_net(vrf_dev); struct rtable *rt; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) goto err; ip4h = ip_hdr(skb); memset(&fl4, 0, sizeof(fl4)); /* needed to match OIF rule */ fl4.flowi4_l3mdev = vrf_dev->ifindex; fl4.flowi4_iif = LOOPBACK_IFINDEX; fl4.flowi4_tos = RT_TOS(ip4h->tos); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = ip4h->protocol; fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. */ if (rt->dst.dev == vrf_dev) return vrf_local_xmit(skb, vrf_dev, &rt->dst); skb_dst_set(skb, &rt->dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); if (!ip4h->saddr) { ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, RT_SCOPE_LINK); } memset(IPCB(skb), 0, sizeof(*IPCB(skb))); ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; out: return ret; err: vrf_tx_error(vrf_dev, skb); goto out; } static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) { switch (skb->protocol) { case htons(ETH_P_IP): return vrf_process_v4_outbound(skb, dev); case htons(ETH_P_IPV6): return vrf_process_v6_outbound(skb, dev); default: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } } static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) { int len = skb->len; netdev_tx_t ret = is_ip_tx_frame(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); dstats->tx_packets++; dstats->tx_bytes += len; u64_stats_update_end(&dstats->syncp); } else { this_cpu_inc(dev->dstats->tx_drops); } return ret; } static void vrf_finish_direct(struct sk_buff *skb) { struct net_device *vrf_dev = skb->dev; if (!list_empty(&vrf_dev->ptype_all) && likely(skb_headroom(skb) >= ETH_HLEN)) { struct ethhdr *eth = skb_push(skb, ETH_HLEN); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth_zero_addr(eth->h_dest); eth->h_proto = skb->protocol; dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, ETH_HLEN); } vrf_nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) /* modelled after ip6_finish_output2 */ static int vrf_finish_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; const struct in6_addr *nexthop; struct neighbour *neigh; int ret; vrf_nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; rcu_read_lock(); nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); rcu_read_unlock(); return ret; } rcu_read_unlock(); IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EINVAL; } /* modelled after ip6_output */ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb_dst(skb)->dev, vrf_finish_output6, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rt6_info *rt6; rcu_read_lock(); rt6 = rcu_dereference(vrf->rt6); if (likely(rt6)) { dst = &rt6->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output6_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip6_local_out(net, sk, skb); } static int vrf_output6_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IPV6); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output6_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip6_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); if (likely(err == 1)) err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert link scope packets */ if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return vrf_ip6_out_direct(vrf_dev, sk, skb); return vrf_ip6_out_redirect(vrf_dev, skb); } /* holding rtnl */ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { struct rt6_info *rt6 = rtnl_dereference(vrf->rt6); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rt6, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rt6) { dst = &rt6->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rt6_create(struct net_device *dev) { int flags = DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); struct rt6_info *rt6; int rc = -ENOMEM; /* IPv6 can be CONFIG enabled and then disabled runtime */ if (!ipv6_mod_enabled()) return 0; vrf->fib6_table = fib6_new_table(net, vrf->tb_id); if (!vrf->fib6_table) goto out; /* create a dst for routing packets out a VRF device */ rt6 = ip6_dst_alloc(net, dev, flags); if (!rt6) goto out; rt6->dst.output = vrf_output6; rcu_assign_pointer(vrf->rt6, rt6); rc = 0; out: return rc; } #else static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { return skb; } static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } static int vrf_rt6_create(struct net_device *dev) { return 0; } #endif /* modelled after ip_finish_output2 */ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; vrf_nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) { dev->stats.tx_errors++; return -ENOMEM; } } rcu_read_lock(); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int ret; sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ ret = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock(); return ret; } rcu_read_unlock(); vrf_tx_error(skb->dev, skb); return -EINVAL; } static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, vrf_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rtable *rth; rcu_read_lock(); rth = rcu_dereference(vrf->rth); if (likely(rth)) { dst = &rth->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip_local_out(net, sk, skb); } static int vrf_output_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IP); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip_out_direct_finish); if (likely(err == 1)) err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert multicast or local broadcast */ if (ipv4_is_multicast(ip_hdr(skb)->daddr) || ipv4_is_lbcast(ip_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return vrf_ip_out_direct(vrf_dev, sk, skb); return vrf_ip_out_redirect(vrf_dev, skb); } /* called with rcu lock held */ static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_out(vrf_dev, sk, skb); case AF_INET6: return vrf_ip6_out(vrf_dev, sk, skb); } return skb; } /* holding rtnl */ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) { struct rtable *rth = rtnl_dereference(vrf->rth); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rth, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rth) { dst = &rth->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rtable_create(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); struct rtable *rth; if (!fib_new_table(dev_net(dev), vrf->tb_id)) return -ENOMEM; /* create a dst for routing packets out through a VRF device */ rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1); if (!rth) return -ENOMEM; rth->dst.output = vrf_output; rcu_assign_pointer(vrf->rth, rth); return 0; } /**************************** device handling ********************/ /* cycle interface to flush neighbor cache and move routes across tables */ static void cycle_netdev(struct net_device *dev, struct netlink_ext_ack *extack) { unsigned int flags = dev->flags; int ret; if (!netif_running(dev)) return; ret = dev_change_flags(dev, flags & ~IFF_UP, extack); if (ret >= 0) ret = dev_change_flags(dev, flags, extack); if (ret < 0) { netdev_err(dev, "Failed to cycle device %s; route tables might be wrong!\n", dev->name); } } static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { int ret; /* do not allow loopback device to be enslaved to a VRF. * The vrf device acts as the loopback for the vrf. */ if (port_dev == dev_net(dev)->loopback_dev) { NL_SET_ERR_MSG(extack, "Can not enslave loopback device to a VRF"); return -EOPNOTSUPP; } port_dev->priv_flags |= IFF_L3MDEV_SLAVE; ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); if (ret < 0) goto err; cycle_netdev(port_dev, extack); return 0; err: port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; return ret; } static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { if (netif_is_l3_master(port_dev)) { NL_SET_ERR_MSG(extack, "Can not enslave an L3 master device to a VRF"); return -EINVAL; } if (netif_is_l3_slave(port_dev)) return -EINVAL; return do_vrf_add_slave(dev, port_dev, extack); } /* inverse of do_vrf_add_slave */ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { netdev_upper_dev_unlink(port_dev, dev); port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; cycle_netdev(port_dev, NULL); return 0; } static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { return do_vrf_del_slave(dev, port_dev); } static void vrf_dev_uninit(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); vrf_rtable_release(dev, vrf); vrf_rt6_release(dev, vrf); } static int vrf_dev_init(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); /* create the default dst which points back to us */ if (vrf_rtable_create(dev) != 0) goto out_nomem; if (vrf_rt6_create(dev) != 0) goto out_rth; dev->flags = IFF_MASTER | IFF_NOARP; /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; netdev_lockdep_set_classes(dev); return 0; out_rth: vrf_rtable_release(dev, vrf); out_nomem: return -ENOMEM; } static const struct net_device_ops vrf_netdev_ops = { .ndo_init = vrf_dev_init, .ndo_uninit = vrf_dev_uninit, .ndo_start_xmit = vrf_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_get_stats64 = vrf_get_stats64, .ndo_add_slave = vrf_add_slave, .ndo_del_slave = vrf_del_slave, }; static u32 vrf_fib_table(const struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); return vrf->tb_id; } static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook, struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1) skb = NULL; /* kfree_skb(skb) handled by nf code */ return skb; } static int vrf_prepare_mac_header(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto) { struct ethhdr *eth; int err; /* in general, we do not know if there is enough space in the head of * the packet for hosting the mac header. */ err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev)); if (unlikely(err)) /* no space in the skb head */ return -ENOBUFS; __skb_push(skb, ETH_HLEN); eth = (struct ethhdr *)skb->data; skb_reset_mac_header(skb); skb_reset_mac_len(skb); /* we set the ethernet destination and the source addresses to the * address of the VRF device. */ ether_addr_copy(eth->h_dest, vrf_dev->dev_addr); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth->h_proto = htons(proto); /* the destination address of the Ethernet frame corresponds to the * address set on the VRF interface; therefore, the packet is intended * to be processed locally. */ skb->protocol = eth->h_proto; skb->pkt_type = PACKET_HOST; skb_postpush_rcsum(skb, skb->data, ETH_HLEN); skb_pull_inline(skb, ETH_HLEN); return 0; } /* prepare and add the mac header to the packet if it was not set previously. * In this way, packet sniffers such as tcpdump can parse the packet correctly. * If the mac header was already set, the original mac header is left * untouched and the function returns immediately. */ static int vrf_add_mac_header_if_unset(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto, struct net_device *orig_dev) { if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev)) return 0; return vrf_prepare_mac_header(skb, vrf_dev, proto); } #if IS_ENABLED(CONFIG_IPV6) /* neighbor handling is done with actual device; do not want * to flip skb->dev for those ndisc packets. This really fails * for multiple next protocols (e.g., NEXTHDR_HOP). But it is * a start. */ static bool ipv6_ndisc_frame(const struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); bool rc = false; if (iph->nexthdr == NEXTHDR_ICMP) { const struct icmp6hdr *icmph; struct icmp6hdr _icmph; icmph = skb_header_pointer(skb, sizeof(*iph), sizeof(_icmph), &_icmph); if (!icmph) goto out; switch (icmph->icmp6_type) { case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: rc = true; break; } } out: return rc; } static struct rt6_info *vrf_ip6_route_lookup(struct net *net, const struct net_device *dev, struct flowi6 *fl6, int ifindex, const struct sk_buff *skb, int flags) { struct net_vrf *vrf = netdev_priv(dev); return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags); } static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, int ifindex) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct flowi6 fl6 = { .flowi6_iif = ifindex, .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), }; struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb, RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); if (unlikely(!rt6)) return; if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst)) return; skb_dst_set(skb, &rt6->dst); } static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = skb->skb_iif; bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); bool is_ndisc = ipv6_ndisc_frame(skb); /* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. * For non-loopback strict packets, determine the dst using the original * ifindex. */ if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; if (skb->pkt_type == PACKET_LOOPBACK) skb->pkt_type = PACKET_HOST; else vrf_ip6_input_dst(skb, vrf_dev, orig_iif); goto out; } /* if packet is NDISC then keep the ingress interface */ if (!is_ndisc) { struct net_device *orig_dev = skb->dev; vrf_rx_stats(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IPV6, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } IP6CB(skb)->flags |= IP6SKB_L3SLAVE; } if (need_strict) vrf_ip6_input_dst(skb, vrf_dev, orig_iif); skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } #else static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { return skb; } #endif static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_device *orig_dev = skb->dev; skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IPCB(skb)->flags |= IPSKB_L3SLAVE; if (ipv4_is_multicast(ip_hdr(skb)->daddr)) goto out; /* loopback traffic; do not push through packet taps again. * Reset pkt_type for upper layers to process skb */ if (skb->pkt_type == PACKET_LOOPBACK) { skb->pkt_type = PACKET_HOST; goto out; } vrf_rx_stats(vrf_dev, skb->len); if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } /* called with rcu lock held */ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_rcv(vrf_dev, skb); case AF_INET6: return vrf_ip6_rcv(vrf_dev, skb); } return skb; } #if IS_ENABLED(CONFIG_IPV6) /* send to link-local or multicast address via interface enslaved to * VRF device. Force lookup to VRF table without changing flow struct * Note: Caller to this function must hold rcu_read_lock() and no refcnt * is taken on the dst by this function. */ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, struct flowi6 *fl6) { struct net *net = dev_net(dev); int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; struct dst_entry *dst = NULL; struct rt6_info *rt; /* VRF device does not have a link-local address and * sending packets to link-local or mcast addresses over * a VRF device does not make sense */ if (fl6->flowi6_oif == dev->ifindex) { dst = &net->ipv6.ip6_null_entry->dst; return dst; } if (!ipv6_addr_any(&fl6->saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags); if (rt) dst = &rt->dst; return dst; } #endif static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_link_scope_lookup = vrf_link_scope_lookup, #endif }; static void vrf_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); } static const struct ethtool_ops vrf_ethtool_ops = { .get_drvinfo = vrf_get_drvinfo, }; static inline size_t vrf_fib_rule_nl_size(void) { size_t sz; sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)); sz += nla_total_size(sizeof(u8)); /* FRA_L3MDEV */ sz += nla_total_size(sizeof(u32)); /* FRA_PRIORITY */ sz += nla_total_size(sizeof(u8)); /* FRA_PROTOCOL */ return sz; } static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) { struct fib_rule_hdr *frh; struct nlmsghdr *nlh; struct sk_buff *skb; int err; if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) && !ipv6_mod_enabled()) return 0; skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0); if (!nlh) goto nla_put_failure; /* rule only needs to appear once */ nlh->nlmsg_flags |= NLM_F_EXCL; frh = nlmsg_data(nlh); memset(frh, 0, sizeof(*frh)); frh->family = family; frh->action = FR_ACT_TO_TBL; if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL)) goto nla_put_failure; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF)) goto nla_put_failure; nlmsg_end(skb, nlh); /* fib_nl_{new,del}rule handling looks for net from skb->sk */ skb->sk = dev_net(dev)->rtnl; if (add_it) { err = fib_nl_newrule(skb, nlh, NULL); if (err == -EEXIST) err = 0; } else { err = fib_nl_delrule(skb, nlh, NULL); if (err == -ENOENT) err = 0; } nlmsg_free(skb); return err; nla_put_failure: nlmsg_free(skb); return -EMSGSIZE; } static int vrf_add_fib_rules(const struct net_device *dev) { int err; err = vrf_fib_rule(dev, AF_INET, true); if (err < 0) goto out_err; err = vrf_fib_rule(dev, AF_INET6, true); if (err < 0) goto ipv6_err; #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true); if (err < 0) goto ipmr_err; #endif #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); if (err < 0) goto ip6mr_err; #endif return 0; #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) ip6mr_err: vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); #endif #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) ipmr_err: vrf_fib_rule(dev, AF_INET6, false); #endif ipv6_err: vrf_fib_rule(dev, AF_INET, false); out_err: netdev_err(dev, "Failed to add FIB rules.\n"); return err; } static void vrf_setup(struct net_device *dev) { ether_setup(dev); /* Initialize the device structure. */ dev->netdev_ops = &vrf_netdev_ops; dev->l3mdev_ops = &vrf_l3mdev_ops; dev->ethtool_ops = &vrf_ethtool_ops; dev->needs_free_netdev = true; /* Fill in device structure with ethernet-generic values. */ eth_hw_addr_random(dev); /* don't acquire vrf device's netif_tx_lock when transmitting */ dev->features |= NETIF_F_LLTX; /* don't allow vrf devices to change network namespaces. */ dev->features |= NETIF_F_NETNS_LOCAL; /* does not make sense for a VLAN to be added to a vrf device */ dev->features |= NETIF_F_VLAN_CHALLENGED; /* enable offload features */ dev->features |= NETIF_F_GSO_SOFTWARE; dev->features |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC; dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->hw_features = dev->features; dev->hw_enc_features = dev->features; /* default to no qdisc; user can add if desired */ dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_NO_RX_HANDLER; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; /* VRF devices do not care about MTU, but if the MTU is set * too low then the ipv4 and ipv6 protocols are disabled * which breaks networking. */ dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU; dev->mtu = dev->max_mtu; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Invalid hardware address"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG(extack, "Invalid hardware address"); return -EADDRNOTAVAIL; } } return 0; } static void vrf_dellink(struct net_device *dev, struct list_head *head) { struct net_device *port_dev; struct list_head *iter; netdev_for_each_lower_dev(dev, port_dev, iter) vrf_del_slave(dev, port_dev); vrf_map_unregister_dev(dev); unregister_netdevice_queue(dev, head); } static int vrf_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net_vrf *vrf = netdev_priv(dev); struct netns_vrf *nn_vrf; bool *add_fib_rules; struct net *net; int err; if (!data || !data[IFLA_VRF_TABLE]) { NL_SET_ERR_MSG(extack, "VRF table id is missing"); return -EINVAL; } vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); if (vrf->tb_id == RT_TABLE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE], "Invalid VRF table id"); return -EINVAL; } dev->priv_flags |= IFF_L3MDEV_MASTER; err = register_netdevice(dev); if (err) goto out; /* mapping between table_id and vrf; * note: such binding could not be done in the dev init function * because dev->ifindex id is not available yet. */ vrf->ifindex = dev->ifindex; err = vrf_map_register_dev(dev, extack); if (err) { unregister_netdevice(dev); goto out; } net = dev_net(dev); nn_vrf = net_generic(net, vrf_net_id); add_fib_rules = &nn_vrf->add_fib_rules; if (*add_fib_rules) { err = vrf_add_fib_rules(dev); if (err) { vrf_map_unregister_dev(dev); unregister_netdevice(dev); goto out; } *add_fib_rules = false; } out: return err; } static size_t vrf_nl_getsize(const struct net_device *dev) { return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */ } static int vrf_fillinfo(struct sk_buff *skb, const struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id); } static size_t vrf_get_slave_size(const struct net_device *bond_dev, const struct net_device *slave_dev) { return nla_total_size(sizeof(u32)); /* IFLA_VRF_PORT_TABLE */ } static int vrf_fill_slave_info(struct sk_buff *skb, const struct net_device *vrf_dev, const struct net_device *slave_dev) { struct net_vrf *vrf = netdev_priv(vrf_dev); if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id)) return -EMSGSIZE; return 0; } static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = { [IFLA_VRF_TABLE] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vrf_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct net_vrf), .get_size = vrf_nl_getsize, .policy = vrf_nl_policy, .validate = vrf_validate, .fill_info = vrf_fillinfo, .get_slave_size = vrf_get_slave_size, .fill_slave_info = vrf_fill_slave_info, .newlink = vrf_newlink, .dellink = vrf_dellink, .setup = vrf_setup, .maxtype = IFLA_VRF_MAX, }; static int vrf_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* only care about unregister events to drop slave references */ if (event == NETDEV_UNREGISTER) { struct net_device *vrf_dev; if (!netif_is_l3_slave(dev)) goto out; vrf_dev = netdev_master_upper_dev_get(dev); vrf_del_slave(vrf_dev, dev); } out: return NOTIFY_DONE; } static struct notifier_block vrf_notifier_block __read_mostly = { .notifier_call = vrf_device_event, }; static int vrf_map_init(struct vrf_map *vmap) { spin_lock_init(&vmap->vmap_lock); hash_init(vmap->ht); vmap->strict_mode = false; return 0; } #ifdef CONFIG_SYSCTL static bool vrf_strict_mode(struct vrf_map *vmap) { bool strict_mode; vrf_map_lock(vmap); strict_mode = vmap->strict_mode; vrf_map_unlock(vmap); return strict_mode; } static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode) { bool *cur_mode; int res = 0; vrf_map_lock(vmap); cur_mode = &vmap->strict_mode; if (*cur_mode == new_mode) goto unlock; if (*cur_mode) { /* disable strict mode */ *cur_mode = false; } else { if (vmap->shared_tables) { /* we cannot allow strict_mode because there are some * vrfs that share one or more tables. */ res = -EBUSY; goto unlock; } /* no tables are shared among vrfs, so we can go back * to 1:1 association between a vrf with its table. */ *cur_mode = true; } unlock: vrf_map_unlock(vmap); return res; } static int vrf_shared_table_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)table->extra1; struct vrf_map *vmap = netns_vrf_map(net); int proc_strict_mode = 0; struct ctl_table tmp = { .procname = table->procname, .data = &proc_strict_mode, .maxlen = sizeof(int), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }; int ret; if (!write) proc_strict_mode = vrf_strict_mode(vmap); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode); return ret; } static const struct ctl_table vrf_table[] = { { .procname = "strict_mode", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = vrf_shared_table_handler, /* set by the vrf_netns_init */ .extra1 = NULL, }, }; static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) { struct ctl_table *table; table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL); if (!table) return -ENOMEM; /* init the extra1 parameter with the reference to current netns */ table[0].extra1 = net; nn_vrf->ctl_hdr = register_net_sysctl_sz(net, "net/vrf", table, ARRAY_SIZE(vrf_table)); if (!nn_vrf->ctl_hdr) { kfree(table); return -ENOMEM; } return 0; } static void vrf_netns_exit_sysctl(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); struct ctl_table *table; table = nn_vrf->ctl_hdr->ctl_table_arg; unregister_net_sysctl_table(nn_vrf->ctl_hdr); kfree(table); } #else static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) { return 0; } static void vrf_netns_exit_sysctl(struct net *net) { } #endif /* Initialize per network namespace state */ static int __net_init vrf_netns_init(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); nn_vrf->add_fib_rules = true; vrf_map_init(&nn_vrf->vmap); return vrf_netns_init_sysctl(net, nn_vrf); } static void __net_exit vrf_netns_exit(struct net *net) { vrf_netns_exit_sysctl(net); } static struct pernet_operations vrf_net_ops __net_initdata = { .init = vrf_netns_init, .exit = vrf_netns_exit, .id = &vrf_net_id, .size = sizeof(struct netns_vrf), }; static int __init vrf_init_module(void) { int rc; register_netdevice_notifier(&vrf_notifier_block); rc = register_pernet_subsys(&vrf_net_ops); if (rc < 0) goto error; rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF, vrf_ifindex_lookup_by_table_id); if (rc < 0) goto unreg_pernet; rc = rtnl_link_register(&vrf_link_ops); if (rc < 0) goto table_lookup_unreg; return 0; table_lookup_unreg: l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF, vrf_ifindex_lookup_by_table_id); unreg_pernet: unregister_pernet_subsys(&vrf_net_ops); error: unregister_netdevice_notifier(&vrf_notifier_block); return rc; } module_init(vrf_init_module); MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); MODULE_VERSION(DRV_VERSION); |
18 17 16 15 18 14 14 1 8 7 6 6 6 4 4 3 4 47 12 12 11 11 50 51 10 51 51 51 1 51 51 10 9 9 49 49 1 49 49 50 50 50 50 50 1 1 1 1 1 51 8 8 8 8 2 2 1 1 3 2 2 2 4 3 4 3 1 3 4 2 2 1 4 13 13 12 11 2 9 2 13 55 1 23 53 58 2 56 58 58 3 55 1 54 54 53 13 40 55 57 44 11 46 24 19 46 1 1 10 10 3 39 4 39 39 40 40 40 5 39 39 40 7 3 2 6 5 4 7 27 26 25 24 26 23 22 22 21 20 20 1 6 6 6 2 1 64 19 45 12 3 12 1 4 2 1 5 4 10 2 5 9 5 4 2 1 5 5 12 2 10 13 13 13 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/bpf.h> #include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/net.h> #include <linux/workqueue.h> #include <linux/skmsg.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/sock_diag.h> #include <net/udp.h> struct bpf_stab { struct bpf_map map; struct sock **sks; struct sk_psock_progs progs; spinlock_t lock; }; #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; if (attr->max_entries == 0 || attr->key_size != 4 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); if (!stab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); spin_lock_init(&stab->lock); stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { bpf_map_area_free(stab); return ERR_PTR(-ENOMEM); } return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { u32 ufd = attr->target_fd; struct bpf_map *map; struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); fdput(f); return ret; } int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { u32 ufd = attr->target_fd; struct bpf_prog *prog; struct bpf_map *map; struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); prog = bpf_prog_get(attr->attach_bpf_fd); if (IS_ERR(prog)) { ret = PTR_ERR(prog); goto put_map; } if (prog->type != ptype) { ret = -EINVAL; goto put_prog; } ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); put_prog: bpf_prog_put(prog); put_map: fdput(f); return ret; } static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); rcu_read_lock(); } static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); release_sock(sk); } static void sock_map_add_link(struct sk_psock *psock, struct sk_psock_link *link, struct bpf_map *map, void *link_raw) { link->link_raw = link_raw; link->map = map; spin_lock_bh(&psock->link_lock); list_add_tail(&link->list, &psock->link); spin_unlock_bh(&psock->link_lock); } static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; struct sk_psock_progs *progs = sock_map_progs(map); if (psock->saved_data_ready && progs->stream_parser) strp_stop = true; if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } } spin_unlock_bh(&psock->link_lock); if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); if (strp_stop) sk_psock_stop_strp(sk, psock); if (verdict_stop) sk_psock_stop_verdict(sk, psock); if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, false); write_unlock_bh(&sk->sk_callback_lock); } } static void sock_map_unref(struct sock *sk, void *link_raw) { struct sk_psock *psock = sk_psock(sk); if (likely(psock)) { sock_map_del_link(sk, psock, link_raw); sk_psock_put(sk, psock); } } static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock) { if (sk->sk_prot->close != sock_map_close) { psock = ERR_PTR(-EBUSY); goto out; } if (!refcount_inc_not_zero(&psock->refcnt)) psock = ERR_PTR(-EBUSY); } out: rcu_read_unlock(); return psock; } static int sock_map_link(struct bpf_map *map, struct sock *sk) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; stream_verdict = READ_ONCE(progs->stream_verdict); if (stream_verdict) { stream_verdict = bpf_prog_inc_not_zero(stream_verdict); if (IS_ERR(stream_verdict)) return PTR_ERR(stream_verdict); } stream_parser = READ_ONCE(progs->stream_parser); if (stream_parser) { stream_parser = bpf_prog_inc_not_zero(stream_parser); if (IS_ERR(stream_parser)) { ret = PTR_ERR(stream_parser); goto out_put_stream_verdict; } } msg_parser = READ_ONCE(progs->msg_parser); if (msg_parser) { msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); goto out_put_stream_parser; } } skb_verdict = READ_ONCE(progs->skb_verdict); if (skb_verdict) { skb_verdict = bpf_prog_inc_not_zero(skb_verdict); if (IS_ERR(skb_verdict)) { ret = PTR_ERR(skb_verdict); goto out_put_msg_parser; } } psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && READ_ONCE(psock->progs.stream_parser)) || (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; } } else { psock = sk_psock_init(sk, map->numa_node); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } } if (msg_parser) psock_set_prog(&psock->progs.msg_parser, msg_parser); if (stream_parser) psock_set_prog(&psock->progs.stream_parser, stream_parser); if (stream_verdict) psock_set_prog(&psock->progs.stream_verdict, stream_verdict); if (skb_verdict) psock_set_prog(&psock->progs.skb_verdict, skb_verdict); /* msg_* and stream_* programs references tracked in psock after this * point. Reference dec and cleanup will occur through psock destructor */ ret = sock_map_init_proto(sk, psock); if (ret < 0) { sk_psock_put(sk, psock); goto out; } write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) { write_unlock_bh(&sk->sk_callback_lock); sk_psock_put(sk, psock); goto out; } sk_psock_start_strp(sk, psock); } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk,psock); } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; out_progs: if (skb_verdict) bpf_prog_put(skb_verdict); out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: if (stream_parser) bpf_prog_put(stream_parser); out_put_stream_verdict: if (stream_verdict) bpf_prog_put(stream_verdict); out: return ret; } static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; sk = xchg(psk, NULL); if (sk) { sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); sock_put(sk); } } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(stab->sks); bpf_map_area_free(stab); } static void sock_map_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); } static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(key >= map->max_entries)) return NULL; return READ_ONCE(stab->sks[key]); } static void *sock_map_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock **psk) { struct sock *sk; int err = 0; spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) sk = xchg(psk, NULL); if (likely(sk)) sock_map_unref(sk, psk); else err = -EINVAL; spin_unlock_bh(&stab->lock); return err; } static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); __sock_map_delete(stab, sk, link_raw); } static long sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = *(u32 *)key; struct sock **psk; if (unlikely(i >= map->max_entries)) return -EINVAL; psk = &stab->sks[i]; return __sock_map_delete(stab, NULL, psk); } static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = key ? *(u32 *)key : U32_MAX; u32 *key_next = next; if (i == stab->map.max_entries - 1) return -ENOENT; if (i >= stab->map.max_entries) *key_next = 0; else *key_next = i + 1; return 0; } static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct sk_psock_link *link; struct sk_psock *psock; struct sock *osk; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); spin_lock_bh(&stab->lock); osk = stab->sks[idx]; if (osk && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!osk && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } sock_map_add_link(psock, link, map, &stab->sks[idx]); stab->sks[idx] = sk; if (osk) sock_map_unref(osk, &stab->sks[idx]); spin_unlock_bh(&stab->lock); return 0; out_unlock: spin_unlock_bh(&stab->lock); if (psock) sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) { return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; } static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return sk->sk_state != TCP_LISTEN; else return sk->sk_state == TCP_ESTABLISHED; } static bool sock_map_sk_is_suitable(const struct sock *sk) { return !!sk->sk_prot->psock_update_sk_prot; } static bool sock_map_sk_state_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags) { struct socket *sock; struct sock *sk; int ret; u64 ufd; if (map->value_size == sizeof(u64)) ufd = *(u64 *)value; else ufd = *(u32 *)value; if (ufd > S32_MAX) return -EINVAL; sock = sockfd_lookup(ufd, &ret); if (!sock) return ret; sk = sock->sk; if (!sk) { ret = -EINVAL; goto out; } if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: sockfd_put(sock); return ret; } static long sock_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct sock *sk = (struct sock *)value; int ret; if (unlikely(!sk || !sk_fullsock(sk))) return -EINVAL; if (!sock_map_sk_is_suitable(sk)) return -EOPNOTSUPP; local_bh_disable(); bh_lock_sock(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); bh_unlock_sock(sk); local_bh_enable(); return ret; } BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_map_update_common(map, *(u32 *)key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_map_update_proto = { .func = bpf_sock_map_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_map_proto = { .func = bpf_sk_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_map_proto = { .func = bpf_msg_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; struct sock_map_seq_info { struct bpf_map *map; struct sock *sk; u32 index; }; struct bpf_iter__sockmap { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(void *, key); __bpf_md_ptr(struct sock *, sk); }; DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, struct bpf_map *map, void *key, struct sock *sk) static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) { if (unlikely(info->index >= info->map->max_entries)) return NULL; info->sk = __sock_map_lookup_elem(info->map, info->index); /* can't return sk directly, since that might be NULL */ return info; } static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_map_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_map_seq_stop */ rcu_read_lock(); return sock_map_seq_lookup_elem(info); } static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; ++*pos; ++info->index; return sock_map_seq_lookup_elem(info); } static int sock_map_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !v); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (v) { ctx.key = &info->index; ctx.sk = info->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_map_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_map_seq_show(seq, NULL); /* pairs with sock_map_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_map_seq_ops = { .start = sock_map_seq_start, .next = sock_map_seq_next, .stop = sock_map_seq_stop, .show = sock_map_seq_show, }; static int sock_map_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_map_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; return 0; } static void sock_map_fini_seq_private(void *priv_data) { struct sock_map_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_map_mem_usage(const struct bpf_map *map) { u64 usage = sizeof(struct bpf_stab); usage += (u64)map->max_entries * sizeof(struct sock *); return usage; } static const struct bpf_iter_seq_info sock_map_iter_seq_info = { .seq_ops = &sock_map_seq_ops, .init_seq_private = sock_map_init_seq_private, .fini_seq_private = sock_map_fini_seq_private, .seq_priv_size = sizeof(struct sock_map_seq_info), }; BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab) const struct bpf_map_ops sock_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_map_alloc, .map_free = sock_map_free, .map_get_next_key = sock_map_get_next_key, .map_lookup_elem_sys_only = sock_map_lookup_sys, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_map_mem_usage, .map_btf_id = &sock_map_btf_ids[0], .iter_seq_info = &sock_map_iter_seq_info, }; struct bpf_shtab_elem { struct rcu_head rcu; u32 hash; struct sock *sk; struct hlist_node node; u8 key[]; }; struct bpf_shtab_bucket { struct hlist_head head; spinlock_t lock; }; struct bpf_shtab { struct bpf_map map; struct bpf_shtab_bucket *buckets; u32 buckets_num; u32 elem_size; struct sk_psock_progs progs; atomic_t count; }; static inline u32 sock_hash_bucket_hash(const void *key, u32 len) { return jhash(key, len, 0); } static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab, u32 hash) { return &htab->buckets[hash & (htab->buckets_num - 1)]; } static struct bpf_shtab_elem * sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { struct bpf_shtab_elem *elem; hlist_for_each_entry_rcu(elem, head, node) { if (elem->hash == hash && !memcmp(&elem->key, key, key_size)) return elem; } return NULL; } static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; WARN_ON_ONCE(!rcu_read_lock_held()); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); return elem ? elem->sk : NULL; } static void sock_hash_free_elem(struct bpf_shtab *htab, struct bpf_shtab_elem *elem) { atomic_dec(&htab->count); kfree_rcu(elem, rcu); } static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem_probe, *elem = link_raw; struct bpf_shtab_bucket *bucket; WARN_ON_ONCE(!rcu_read_lock_held()); bucket = sock_hash_select_bucket(htab, elem->hash); /* elem may be deleted in parallel from the map, but access here * is okay since it's going away only after RCU grace period. * However, we need to check whether it's still present. */ spin_lock_bh(&bucket->lock); elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, elem->key, map->key_size); if (elem_probe && elem_probe == elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); } static long sock_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 hash, key_size = map->key_size; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; int ret = -ENOENT; hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); ret = 0; } spin_unlock_bh(&bucket->lock); return ret; } static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, void *key, u32 key_size, u32 hash, struct sock *sk, struct bpf_shtab_elem *old) { struct bpf_shtab_elem *new; if (atomic_inc_return(&htab->count) > htab->map.max_entries) { if (!old) { atomic_dec(&htab->count); return ERR_PTR(-E2BIG); } } new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); if (!new) { atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); } memcpy(new->key, key, key_size); new->sk = sk; new->hash = hash; return new; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_elem *elem, *elem_new; struct bpf_shtab_bucket *bucket; struct sk_psock_link *link; struct sk_psock *psock; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!elem && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); if (IS_ERR(elem_new)) { ret = PTR_ERR(elem_new); goto out_unlock; } sock_map_add_link(psock, link, map, elem_new); /* Add new element to the head of the list, so that * concurrent search will find it before old elem. */ hlist_add_head_rcu(&elem_new->node, &bucket->head); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); return 0; out_unlock: spin_unlock_bh(&bucket->lock); sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static int sock_hash_get_next_key(struct bpf_map *map, void *key, void *key_next) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem, *elem_next; u32 hash, key_size = map->key_size; struct hlist_head *head; int i = 0; if (!key) goto find_first_elem; hash = sock_hash_bucket_hash(key, key_size); head = &sock_hash_select_bucket(htab, hash)->head; elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); if (!elem) goto find_first_elem; elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } i = hash & (htab->buckets_num - 1); i++; find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } } return -ENOENT; } static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { struct bpf_shtab *htab; int i, err; if (attr->max_entries == 0 || attr->key_size == 0 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&htab->map, attr); htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); htab->elem_size = sizeof(struct bpf_shtab_elem) + round_up(htab->map.key_size, 8); if (htab->buckets_num == 0 || htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) { err = -EINVAL; goto free_htab; } htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { err = -ENOMEM; goto free_htab; } for (i = 0; i < htab->buckets_num; i++) { INIT_HLIST_HEAD(&htab->buckets[i].head); spin_lock_init(&htab->buckets[i].lock); } return &htab->map; free_htab: bpf_map_area_free(htab); return ERR_PTR(err); } static void sock_hash_free(struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_bucket *bucket; struct hlist_head unlink_list; struct bpf_shtab_elem *elem; struct hlist_node *node; int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); /* We are racing with sock_hash_delete_from_link to * enter the spin-lock critical section. Every socket on * the list is still linked to sockhash. Since link * exists, psock exists and holds a ref to socket. That * lets us to grab a socket ref too. */ spin_lock_bh(&bucket->lock); hlist_for_each_entry(elem, &bucket->head, node) sock_hold(elem->sk); hlist_move_list(&bucket->head, &unlink_list); spin_unlock_bh(&bucket->lock); /* Process removed entries out of atomic context to * block for socket lock before deleting the psock's * link to sockhash. */ hlist_for_each_entry_safe(elem, node, &unlink_list, node) { hlist_del(&elem->node); lock_sock(elem->sk); rcu_read_lock(); sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); sock_put(elem->sk); sock_hash_free_elem(htab, elem); } } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(htab->buckets); bpf_map_area_free(htab); } static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_hash_lookup_elem(map, key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static void *sock_hash_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_hash_lookup_elem(map, key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void sock_hash_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs); } BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_hash_update_common(map, key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_hash_update_proto = { .func = bpf_sock_hash_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_hash_proto = { .func = bpf_sk_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .func = bpf_msg_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; struct sock_hash_seq_info { struct bpf_map *map; struct bpf_shtab *htab; u32 bucket_id; }; static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, struct bpf_shtab_elem *prev_elem) { const struct bpf_shtab *htab = info->htab; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; struct hlist_node *node; /* try to find next elem in the same bucket */ if (prev_elem) { node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; /* no more elements, continue in the next bucket */ info->bucket_id++; } for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { bucket = &htab->buckets[info->bucket_id]; node = rcu_dereference(hlist_first_rcu(&bucket->head)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; } return NULL; } static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_hash_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_hash_seq_stop */ rcu_read_lock(); return sock_hash_seq_find_next(info, NULL); } static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; ++*pos; return sock_hash_seq_find_next(info, v); } static int sock_hash_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_shtab_elem *elem = v; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !elem); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (elem) { ctx.key = elem->key; ctx.sk = elem->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_hash_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_hash_seq_show(seq, NULL); /* pairs with sock_hash_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_hash_seq_ops = { .start = sock_hash_seq_start, .next = sock_hash_seq_next, .stop = sock_hash_seq_stop, .show = sock_hash_seq_show, }; static int sock_hash_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_hash_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; info->htab = container_of(aux->map, struct bpf_shtab, map); return 0; } static void sock_hash_fini_seq_private(void *priv_data) { struct sock_hash_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_hash_mem_usage(const struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u64 usage = sizeof(*htab); usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket); usage += atomic_read(&htab->count) * (u64)htab->elem_size; return usage; } static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_ops = &sock_hash_seq_ops, .init_seq_private = sock_hash_init_seq_private, .fini_seq_private = sock_hash_fini_seq_private, .seq_priv_size = sizeof(struct sock_hash_seq_info), }; BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab) const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_lookup_elem = sock_hash_lookup, .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_hash_mem_usage, .map_btf_id = &sock_hash_map_btf_ids[0], .iter_seq_info = &sock_hash_iter_seq_info, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) { switch (map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return &container_of(map, struct bpf_stab, map)->progs; case BPF_MAP_TYPE_SOCKHASH: return &container_of(map, struct bpf_shtab, map)->progs; default: break; } return NULL; } static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: *pprog = &progs->msg_parser; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: *pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; *pprog = &progs->stream_verdict; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; *pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } return 0; } static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which) { struct bpf_prog **pprog; int ret; ret = sock_map_prog_lookup(map, &pprog, which); if (ret) return ret; if (old) return psock_replace_prog(pprog, prog, old); psock_set_prog(pprog, prog); return 0; } int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd; struct bpf_prog **pprog; struct bpf_prog *prog; struct bpf_map *map; struct fd f; u32 id = 0; int ret; if (attr->query.query_flags) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); rcu_read_lock(); ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); if (ret) goto end; prog = *pprog; prog_cnt = !prog ? 0 : 1; if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) goto end; /* we do not hold the refcnt, the bpf prog may be released * asynchronously and the id would be set to 0. */ id = data_race(prog->aux->id); if (id == 0) prog_cnt = 0; end: rcu_read_unlock(); if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) ret = -EFAULT; fdput(f); return ret; } static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return sock_map_delete_from_link(link->map, sk, link->link_raw); case BPF_MAP_TYPE_SOCKHASH: return sock_hash_delete_from_link(link->map, sk, link->link_raw); default: break; } } static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) { struct sk_psock_link *link; while ((link = sk_psock_link_pop(psock))) { sock_map_unlink(sk, link); sk_psock_free_link(link); } } void sock_map_unhash(struct sock *sk) { void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; } else { saved_unhash = psock->saved_unhash; sock_map_remove_links(sk, psock); rcu_read_unlock(); } if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) return; if (saved_unhash) saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); void sock_map_destroy(struct sock *sk) { void (*saved_destroy)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); } if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) return; if (saved_destroy) saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); void sock_map_close(struct sock *sk, long timeout) { void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; lock_sock(sk); rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); saved_close = READ_ONCE(sk->sk_prot)->close; } else { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); } /* Make sure we do not recurse. This is a bug. * Leak the socket instead of crashing on a stack overflow. */ if (WARN_ON_ONCE(saved_close == sock_map_close)) return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) { struct bpf_map *map; int err = -EINVAL; if (!linfo->map.map_fd) return -EBADF; map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) goto put_map; if (prog->aux->max_rdonly_access > map->key_size) { err = -EACCES; goto put_map; } aux->map = map; return 0; put_map: bpf_map_put_with_uref(map); return err; } static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) { bpf_map_put_with_uref(aux->map); } static struct bpf_iter_reg sock_map_iter_reg = { .target = "sockmap", .attach_target = sock_map_iter_attach_target, .detach_target = sock_map_iter_detach_target, .show_fdinfo = bpf_iter_map_show_fdinfo, .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, }; static int __init bpf_sockmap_iter_init(void) { sock_map_iter_reg.ctx_arg_info[1].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK]; return bpf_iter_reg_target(&sock_map_iter_reg); } late_initcall(bpf_sockmap_iter_init); |
14561 1343 1166 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * kref.h - library routines for handling generic reference counted objects * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Corp. * * based on kobject.h which was: * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org> * Copyright (C) 2002-2003 Open Source Development Labs */ #ifndef _KREF_H_ #define _KREF_H_ #include <linux/spinlock.h> #include <linux/refcount.h> struct kref { refcount_t refcount; }; #define KREF_INIT(n) { .refcount = REFCOUNT_INIT(n), } /** * kref_init - initialize object. * @kref: object in question. */ static inline void kref_init(struct kref *kref) { refcount_set(&kref->refcount, 1); } static inline unsigned int kref_read(const struct kref *kref) { return refcount_read(&kref->refcount); } /** * kref_get - increment refcount for object. * @kref: object. */ static inline void kref_get(struct kref *kref) { refcount_inc(&kref->refcount); } /** * kref_put - decrement refcount for object. * @kref: object. * @release: pointer to the function that will clean up the object when the * last reference to the object is released. * This pointer is required, and it is not acceptable to pass kfree * in as this function. * * Decrement the refcount, and if 0, call release(). * Return 1 if the object was removed, otherwise return 0. Beware, if this * function returns 0, you still can not count on the kref from remaining in * memory. Only use the return value if you want to see if the kref is now * gone, not present. */ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) { if (refcount_dec_and_test(&kref->refcount)) { release(kref); return 1; } return 0; } static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), struct mutex *lock) { if (refcount_dec_and_mutex_lock(&kref->refcount, lock)) { release(kref); return 1; } return 0; } static inline int kref_put_lock(struct kref *kref, void (*release)(struct kref *kref), spinlock_t *lock) { if (refcount_dec_and_lock(&kref->refcount, lock)) { release(kref); return 1; } return 0; } /** * kref_get_unless_zero - Increment refcount for object unless it is zero. * @kref: object. * * Return non-zero if the increment succeeded. Otherwise return 0. * * This function is intended to simplify locking around refcounting for * objects that can be looked up from a lookup structure, and which are * removed from that lookup structure in the object destructor. * Operations on such objects require at least a read lock around * lookup + kref_get, and a write lock around kref_put + remove from lookup * structure. Furthermore, RCU implementations become extremely tricky. * With a lookup followed by a kref_get_unless_zero *with return value check* * locking in the kref_put path can be deferred to the actual removal from * the lookup structure and RCU lookups become trivial. */ static inline int __must_check kref_get_unless_zero(struct kref *kref) { return refcount_inc_not_zero(&kref->refcount); } #endif /* _KREF_H_ */ |
3495 3493 3591 378 957 15 306 955 15 305 955 58 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM net #if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NET_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/tracepoint.h> TRACE_EVENT(net_dev_start_xmit, TP_PROTO(const struct sk_buff *skb, const struct net_device *dev), TP_ARGS(skb, dev), TP_STRUCT__entry( __string( name, dev->name ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( int, network_offset ) __field( bool, transport_offset_valid) __field( int, transport_offset) __field( u8, tx_flags ) __field( u16, gso_size ) __field( u16, gso_segs ) __field( u16, gso_type ) ), TP_fast_assign( __assign_str(name, dev->name); __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->network_offset = skb_network_offset(skb); __entry->transport_offset_valid = skb_transport_header_was_set(skb); __entry->transport_offset = skb_transport_header_was_set(skb) ? skb_transport_offset(skb) : 0; __entry->tx_flags = skb_shinfo(skb)->tx_flags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_segs = skb_shinfo(skb)->gso_segs; __entry->gso_type = skb_shinfo(skb)->gso_type; ), TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x", __get_str(name), __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->len, __entry->data_len, __entry->network_offset, __entry->transport_offset_valid, __entry->transport_offset, __entry->tx_flags, __entry->gso_size, __entry->gso_segs, __entry->gso_type) ); TRACE_EVENT(net_dev_xmit, TP_PROTO(struct sk_buff *skb, int rc, struct net_device *dev, unsigned int skb_len), TP_ARGS(skb, rc, dev, skb_len), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __field( int, rc ) __string( name, dev->name ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb_len; __entry->rc = rc; __assign_str(name, dev->name); ), TP_printk("dev=%s skbaddr=%p len=%u rc=%d", __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) ); TRACE_EVENT(net_dev_xmit_timeout, TP_PROTO(struct net_device *dev, int queue_index), TP_ARGS(dev, queue_index), TP_STRUCT__entry( __string( name, dev->name ) __string( driver, netdev_drivername(dev)) __field( int, queue_index ) ), TP_fast_assign( __assign_str(name, dev->name); __assign_str(driver, netdev_drivername(dev)); __entry->queue_index = queue_index; ), TP_printk("dev=%s driver=%s queue=%d", __get_str(name), __get_str(driver), __entry->queue_index) ); DECLARE_EVENT_CLASS(net_dev_template, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __string( name, skb->dev->name ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb->len; __assign_str(name, skb->dev->name); ), TP_printk("dev=%s skbaddr=%p len=%u", __get_str(name), __entry->skbaddr, __entry->len) ) DEFINE_EVENT(net_dev_template, net_dev_queue, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_receive_skb, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_rx, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __string( name, skb->dev->name ) __field( unsigned int, napi_id ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( u32, hash ) __field( bool, l4_hash ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( unsigned int, truesize ) __field( bool, mac_header_valid) __field( int, mac_header ) __field( unsigned char, nr_frags ) __field( u16, gso_size ) __field( u16, gso_type ) ), TP_fast_assign( __assign_str(name, skb->dev->name); #ifdef CONFIG_NET_RX_BUSY_POLL __entry->napi_id = skb->napi_id; #else __entry->napi_id = 0; #endif __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->hash = skb->hash; __entry->l4_hash = skb->l4_hash; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->truesize = skb->truesize; __entry->mac_header_valid = skb_mac_header_was_set(skb); __entry->mac_header = skb_mac_header(skb) - skb->data; __entry->nr_frags = skb_shinfo(skb)->nr_frags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_type = skb_shinfo(skb)->gso_type; ), TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x", __get_str(name), __entry->napi_id, __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->hash, __entry->l4_hash, __entry->len, __entry->data_len, __entry->truesize, __entry->mac_header_valid, __entry->mac_header, __entry->nr_frags, __entry->gso_size, __entry->gso_type) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_receive_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_exit_template, TP_PROTO(int ret), TP_ARGS(ret), TP_STRUCT__entry( __field(int, ret) ), TP_fast_assign( __entry->ret = ret; ), TP_printk("ret=%d", __entry->ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_frags_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_receive_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit, TP_PROTO(int ret), TP_ARGS(ret) ); #endif /* _TRACE_NET_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
1167 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TEXT_PATCHING_H #define _ASM_X86_TEXT_PATCHING_H #include <linux/types.h> #include <linux/stddef.h> #include <asm/ptrace.h> /* * Currently, the max observed size in the kernel code is * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. * Raise it if needed. */ #define POKE_MAX_OPCODE_SIZE 5 extern void text_poke_early(void *addr, const void *opcode, size_t len); /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. * Side-effect: any interrupt handler running between save and restore will have * the ability to write to read-only pages. * * Warning: * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and * no thread can be preempted in the instructions being modified (no iret to an * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. * On the local CPU you need to be protected against NMI or MCE handlers seeing * an inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_finish(void); #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC #define RET_INSN_SIZE 1 #define RET_INSN_OPCODE 0xC3 #define CALL_INSN_SIZE 5 #define CALL_INSN_OPCODE 0xE8 #define JMP32_INSN_SIZE 5 #define JMP32_INSN_OPCODE 0xE9 #define JMP8_INSN_SIZE 2 #define JMP8_INSN_OPCODE 0xEB #define DISP32_SIZE 4 static __always_inline int text_opcode_size(u8 opcode) { int size = 0; #define __CASE(insn) \ case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break switch(opcode) { __CASE(INT3); __CASE(RET); __CASE(CALL); __CASE(JMP32); __CASE(JMP8); } #undef __CASE return size; } union text_poke_insn { u8 text[POKE_MAX_OPCODE_SIZE]; struct { u8 opcode; s32 disp; } __attribute__((packed)); }; static __always_inline void __text_gen_insn(void *buf, u8 opcode, const void *addr, const void *dest, int size) { union text_poke_insn *insn = buf; BUG_ON(size < text_opcode_size(opcode)); /* * Hide the addresses to avoid the compiler folding in constants when * referencing code, these can mess up annotations like * ANNOTATE_NOENDBR. */ OPTIMIZER_HIDE_VAR(insn); OPTIMIZER_HIDE_VAR(addr); OPTIMIZER_HIDE_VAR(dest); insn->opcode = opcode; if (size > 1) { insn->disp = (long)dest - (long)(addr + size); if (size == 2) { /* * Ensure that for JMP8 the displacement * actually fits the signed byte. */ BUG_ON((insn->disp >> 31) != (insn->disp >> 7)); } } } static __always_inline void *text_gen_insn(u8 opcode, const void *addr, const void *dest) { static union text_poke_insn insn; /* per instance */ __text_gen_insn(&insn, opcode, addr, dest, text_opcode_size(opcode)); return &insn.text; } extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; #ifndef CONFIG_UML_X86 static __always_inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) { regs->ip = ip; } static __always_inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* * The int3 handler in entry_64.S adds a gap between the * stack where the break point happened, and the saving of * pt_regs. We can extend the original stack because of * this gap. See the idtentry macro's create_gap option. * * Similarly entry_32.S will have a gap on the stack for (any) hardware * exception and pt_regs; see FIXUP_FRAME. */ regs->sp -= sizeof(unsigned long); *(unsigned long *)regs->sp = val; } static __always_inline unsigned long int3_emulate_pop(struct pt_regs *regs) { unsigned long val = *(unsigned long *)regs->sp; regs->sp += sizeof(unsigned long); return val; } static __always_inline void int3_emulate_call(struct pt_regs *regs, unsigned long func) { int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE); int3_emulate_jmp(regs, func); } static __always_inline void int3_emulate_ret(struct pt_regs *regs) { unsigned long ip = int3_emulate_pop(regs); int3_emulate_jmp(regs, ip); } static __always_inline void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp) { static const unsigned long jcc_mask[6] = { [0] = X86_EFLAGS_OF, [1] = X86_EFLAGS_CF, [2] = X86_EFLAGS_ZF, [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF, [4] = X86_EFLAGS_SF, [5] = X86_EFLAGS_PF, }; bool invert = cc & 1; bool match; if (cc < 0xc) { match = regs->flags & jcc_mask[cc >> 1]; } else { match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); if (cc >= 0xe) match = match || (regs->flags & X86_EFLAGS_ZF); } if ((match && !invert) || (!match && invert)) ip += disp; int3_emulate_jmp(regs, ip); } #endif /* !CONFIG_UML_X86 */ #endif /* _ASM_X86_TEXT_PATCHING_H */ |
1 131 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | /* SPDX-License-Identifier: GPL-2.0 */ /* include/net/dsfield.h - Manipulation of the Differentiated Services field */ /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ #ifndef __NET_DSFIELD_H #define __NET_DSFIELD_H #include <linux/types.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <asm/byteorder.h> static inline __u8 ipv4_get_dsfield(const struct iphdr *iph) { return iph->tos; } static inline __u8 ipv6_get_dsfield(const struct ipv6hdr *ipv6h) { return ntohs(*(__force const __be16 *)ipv6h) >> 4; } static inline void ipv4_change_dsfield(struct iphdr *iph,__u8 mask, __u8 value) { __u32 check = ntohs((__force __be16)iph->check); __u8 dsfield; dsfield = (iph->tos & mask) | value; check += iph->tos; if ((check+1) >> 16) check = (check+1) & 0xffff; check -= dsfield; check += check >> 16; /* adjust carry */ iph->check = (__force __sum16)htons(check); iph->tos = dsfield; } static inline void ipv6_change_dsfield(struct ipv6hdr *ipv6h,__u8 mask, __u8 value) { __be16 *p = (__force __be16 *)ipv6h; *p = (*p & htons((((u16)mask << 4) | 0xf00f))) | htons((u16)value << 4); } #endif |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RATELIMIT_H #define _LINUX_RATELIMIT_H #include <linux/ratelimit_types.h> #include <linux/sched.h> #include <linux/spinlock.h> static inline void ratelimit_state_init(struct ratelimit_state *rs, int interval, int burst) { memset(rs, 0, sizeof(*rs)); raw_spin_lock_init(&rs->lock); rs->interval = interval; rs->burst = burst; } static inline void ratelimit_default_init(struct ratelimit_state *rs) { return ratelimit_state_init(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); } static inline void ratelimit_state_exit(struct ratelimit_state *rs) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) return; if (rs->missed) { pr_warn("%s: %d output lines suppressed due to ratelimiting\n", current->comm, rs->missed); rs->missed = 0; } } static inline void ratelimit_set_flags(struct ratelimit_state *rs, unsigned long flags) { rs->flags = flags; } extern struct ratelimit_state printk_ratelimit_state; #ifdef CONFIG_PRINTK #define WARN_ON_RATELIMIT(condition, state) ({ \ bool __rtn_cond = !!(condition); \ WARN_ON(__rtn_cond && __ratelimit(state)); \ __rtn_cond; \ }) #define WARN_RATELIMIT(condition, format, ...) \ ({ \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ int rtn = !!(condition); \ \ if (unlikely(rtn && __ratelimit(&_rs))) \ WARN(rtn, format, ##__VA_ARGS__); \ \ rtn; \ }) #else #define WARN_ON_RATELIMIT(condition, state) \ WARN_ON(condition) #define WARN_RATELIMIT(condition, format, ...) \ ({ \ int rtn = WARN(condition, format, ##__VA_ARGS__); \ rtn; \ }) #endif #endif /* _LINUX_RATELIMIT_H */ |
8996 1167 1186 38 250 2136 2136 2136 2136 4428 4429 5551 4737 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 | // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/tomoyo.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/lsm_hooks.h> #include <uapi/linux/lsm.h> #include "common.h" /** * tomoyo_domain - Get "struct tomoyo_domain_info" for current thread. * * Returns pointer to "struct tomoyo_domain_info" for current thread. */ struct tomoyo_domain_info *tomoyo_domain(void) { struct tomoyo_task *s = tomoyo_task(current); if (s->old_domain_info && !current->in_execve) { atomic_dec(&s->old_domain_info->users); s->old_domain_info = NULL; } return s->domain_info; } /** * tomoyo_cred_prepare - Target for security_prepare_creds(). * * @new: Pointer to "struct cred". * @old: Pointer to "struct cred". * @gfp: Memory allocation flags. * * Returns 0. */ static int tomoyo_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { /* Restore old_domain_info saved by previous execve() request. */ struct tomoyo_task *s = tomoyo_task(current); if (s->old_domain_info && !current->in_execve) { atomic_dec(&s->domain_info->users); s->domain_info = s->old_domain_info; s->old_domain_info = NULL; } return 0; } /** * tomoyo_bprm_committed_creds - Target for security_bprm_committed_creds(). * * @bprm: Pointer to "struct linux_binprm". */ static void tomoyo_bprm_committed_creds(const struct linux_binprm *bprm) { /* Clear old_domain_info saved by execve() request. */ struct tomoyo_task *s = tomoyo_task(current); atomic_dec(&s->old_domain_info->users); s->old_domain_info = NULL; } #ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER /** * tomoyo_bprm_creds_for_exec - Target for security_bprm_creds_for_exec(). * * @bprm: Pointer to "struct linux_binprm". * * Returns 0. */ static int tomoyo_bprm_creds_for_exec(struct linux_binprm *bprm) { /* * Load policy if /sbin/tomoyo-init exists and /sbin/init is requested * for the first time. */ if (!tomoyo_policy_loaded) tomoyo_load_policy(bprm->filename); return 0; } #endif /** * tomoyo_bprm_check_security - Target for security_bprm_check(). * * @bprm: Pointer to "struct linux_binprm". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_bprm_check_security(struct linux_binprm *bprm) { struct tomoyo_task *s = tomoyo_task(current); /* * Execute permission is checked against pathname passed to execve() * using current domain. */ if (!s->old_domain_info) { const int idx = tomoyo_read_lock(); const int err = tomoyo_find_next_domain(bprm); tomoyo_read_unlock(idx); return err; } /* * Read permission is checked against interpreters using next domain. */ return tomoyo_check_open_permission(s->domain_info, &bprm->file->f_path, O_RDONLY); } /** * tomoyo_inode_getattr - Target for security_inode_getattr(). * * @path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_inode_getattr(const struct path *path) { return tomoyo_path_perm(TOMOYO_TYPE_GETATTR, path, NULL); } /** * tomoyo_path_truncate - Target for security_path_truncate(). * * @path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_truncate(const struct path *path) { return tomoyo_path_perm(TOMOYO_TYPE_TRUNCATE, path, NULL); } /** * tomoyo_file_truncate - Target for security_file_truncate(). * * @file: Pointer to "struct file". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_truncate(struct file *file) { return tomoyo_path_truncate(&file->f_path); } /** * tomoyo_path_unlink - Target for security_path_unlink(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_unlink(const struct path *parent, struct dentry *dentry) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_perm(TOMOYO_TYPE_UNLINK, &path, NULL); } /** * tomoyo_path_mkdir - Target for security_path_mkdir(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * @mode: DAC permission mode. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_mkdir(const struct path *parent, struct dentry *dentry, umode_t mode) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_number_perm(TOMOYO_TYPE_MKDIR, &path, mode & S_IALLUGO); } /** * tomoyo_path_rmdir - Target for security_path_rmdir(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_rmdir(const struct path *parent, struct dentry *dentry) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_perm(TOMOYO_TYPE_RMDIR, &path, NULL); } /** * tomoyo_path_symlink - Target for security_path_symlink(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * @old_name: Symlink's content. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_symlink(const struct path *parent, struct dentry *dentry, const char *old_name) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_perm(TOMOYO_TYPE_SYMLINK, &path, old_name); } /** * tomoyo_path_mknod - Target for security_path_mknod(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * @mode: DAC permission mode. * @dev: Device attributes. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_mknod(const struct path *parent, struct dentry *dentry, umode_t mode, unsigned int dev) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; int type = TOMOYO_TYPE_CREATE; const unsigned int perm = mode & S_IALLUGO; switch (mode & S_IFMT) { case S_IFCHR: type = TOMOYO_TYPE_MKCHAR; break; case S_IFBLK: type = TOMOYO_TYPE_MKBLOCK; break; default: goto no_dev; } return tomoyo_mkdev_perm(type, &path, perm, dev); no_dev: switch (mode & S_IFMT) { case S_IFIFO: type = TOMOYO_TYPE_MKFIFO; break; case S_IFSOCK: type = TOMOYO_TYPE_MKSOCK; break; } return tomoyo_path_number_perm(type, &path, perm); } /** * tomoyo_path_link - Target for security_path_link(). * * @old_dentry: Pointer to "struct dentry". * @new_dir: Pointer to "struct path". * @new_dentry: Pointer to "struct dentry". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct path path1 = { .mnt = new_dir->mnt, .dentry = old_dentry }; struct path path2 = { .mnt = new_dir->mnt, .dentry = new_dentry }; return tomoyo_path2_perm(TOMOYO_TYPE_LINK, &path1, &path2); } /** * tomoyo_path_rename - Target for security_path_rename(). * * @old_parent: Pointer to "struct path". * @old_dentry: Pointer to "struct dentry". * @new_parent: Pointer to "struct path". * @new_dentry: Pointer to "struct dentry". * @flags: Rename options. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_rename(const struct path *old_parent, struct dentry *old_dentry, const struct path *new_parent, struct dentry *new_dentry, const unsigned int flags) { struct path path1 = { .mnt = old_parent->mnt, .dentry = old_dentry }; struct path path2 = { .mnt = new_parent->mnt, .dentry = new_dentry }; if (flags & RENAME_EXCHANGE) { const int err = tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path2, &path1); if (err) return err; } return tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path1, &path2); } /** * tomoyo_file_fcntl - Target for security_file_fcntl(). * * @file: Pointer to "struct file". * @cmd: Command for fcntl(). * @arg: Argument for @cmd. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { if (!(cmd == F_SETFL && ((arg ^ file->f_flags) & O_APPEND))) return 0; return tomoyo_check_open_permission(tomoyo_domain(), &file->f_path, O_WRONLY | (arg & O_APPEND)); } /** * tomoyo_file_open - Target for security_file_open(). * * @f: Pointer to "struct file". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_open(struct file *f) { /* Don't check read permission here if called from execve(). */ if (current->in_execve) return 0; return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path, f->f_flags); } /** * tomoyo_file_ioctl - Target for security_file_ioctl(). * * @file: Pointer to "struct file". * @cmd: Command for ioctl(). * @arg: Argument for @cmd. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return tomoyo_path_number_perm(TOMOYO_TYPE_IOCTL, &file->f_path, cmd); } /** * tomoyo_path_chmod - Target for security_path_chmod(). * * @path: Pointer to "struct path". * @mode: DAC permission mode. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_chmod(const struct path *path, umode_t mode) { return tomoyo_path_number_perm(TOMOYO_TYPE_CHMOD, path, mode & S_IALLUGO); } /** * tomoyo_path_chown - Target for security_path_chown(). * * @path: Pointer to "struct path". * @uid: Owner ID. * @gid: Group ID. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { int error = 0; if (uid_valid(uid)) error = tomoyo_path_number_perm(TOMOYO_TYPE_CHOWN, path, from_kuid(&init_user_ns, uid)); if (!error && gid_valid(gid)) error = tomoyo_path_number_perm(TOMOYO_TYPE_CHGRP, path, from_kgid(&init_user_ns, gid)); return error; } /** * tomoyo_path_chroot - Target for security_path_chroot(). * * @path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_chroot(const struct path *path) { return tomoyo_path_perm(TOMOYO_TYPE_CHROOT, path, NULL); } /** * tomoyo_sb_mount - Target for security_sb_mount(). * * @dev_name: Name of device file. Maybe NULL. * @path: Pointer to "struct path". * @type: Name of filesystem type. Maybe NULL. * @flags: Mount options. * @data: Optional data. Maybe NULL. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { return tomoyo_mount_permission(dev_name, path, type, flags, data); } /** * tomoyo_sb_umount - Target for security_sb_umount(). * * @mnt: Pointer to "struct vfsmount". * @flags: Unmount options. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_sb_umount(struct vfsmount *mnt, int flags) { struct path path = { .mnt = mnt, .dentry = mnt->mnt_root }; return tomoyo_path_perm(TOMOYO_TYPE_UMOUNT, &path, NULL); } /** * tomoyo_sb_pivotroot - Target for security_sb_pivotroot(). * * @old_path: Pointer to "struct path". * @new_path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_sb_pivotroot(const struct path *old_path, const struct path *new_path) { return tomoyo_path2_perm(TOMOYO_TYPE_PIVOT_ROOT, new_path, old_path); } /** * tomoyo_socket_listen - Check permission for listen(). * * @sock: Pointer to "struct socket". * @backlog: Backlog parameter. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_listen(struct socket *sock, int backlog) { return tomoyo_socket_listen_permission(sock); } /** * tomoyo_socket_connect - Check permission for connect(). * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_connect(struct socket *sock, struct sockaddr *addr, int addr_len) { return tomoyo_socket_connect_permission(sock, addr, addr_len); } /** * tomoyo_socket_bind - Check permission for bind(). * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { return tomoyo_socket_bind_permission(sock, addr, addr_len); } /** * tomoyo_socket_sendmsg - Check permission for sendmsg(). * * @sock: Pointer to "struct socket". * @msg: Pointer to "struct msghdr". * @size: Size of message. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return tomoyo_socket_sendmsg_permission(sock, msg, size); } struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = { .lbs_task = sizeof(struct tomoyo_task), }; /** * tomoyo_task_alloc - Target for security_task_alloc(). * * @task: Pointer to "struct task_struct". * @clone_flags: clone() flags. * * Returns 0. */ static int tomoyo_task_alloc(struct task_struct *task, unsigned long clone_flags) { struct tomoyo_task *old = tomoyo_task(current); struct tomoyo_task *new = tomoyo_task(task); new->domain_info = old->domain_info; atomic_inc(&new->domain_info->users); new->old_domain_info = NULL; return 0; } /** * tomoyo_task_free - Target for security_task_free(). * * @task: Pointer to "struct task_struct". */ static void tomoyo_task_free(struct task_struct *task) { struct tomoyo_task *s = tomoyo_task(task); if (s->domain_info) { atomic_dec(&s->domain_info->users); s->domain_info = NULL; } if (s->old_domain_info) { atomic_dec(&s->old_domain_info->users); s->old_domain_info = NULL; } } static const struct lsm_id tomoyo_lsmid = { .name = "tomoyo", .id = LSM_ID_TOMOYO, }; /* * tomoyo_security_ops is a "struct security_operations" which is used for * registering TOMOYO. */ static struct security_hook_list tomoyo_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare), LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds), LSM_HOOK_INIT(task_alloc, tomoyo_task_alloc), LSM_HOOK_INIT(task_free, tomoyo_task_free), #ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER LSM_HOOK_INIT(bprm_creds_for_exec, tomoyo_bprm_creds_for_exec), #endif LSM_HOOK_INIT(bprm_check_security, tomoyo_bprm_check_security), LSM_HOOK_INIT(file_fcntl, tomoyo_file_fcntl), LSM_HOOK_INIT(file_open, tomoyo_file_open), LSM_HOOK_INIT(file_truncate, tomoyo_file_truncate), LSM_HOOK_INIT(path_truncate, tomoyo_path_truncate), LSM_HOOK_INIT(path_unlink, tomoyo_path_unlink), LSM_HOOK_INIT(path_mkdir, tomoyo_path_mkdir), LSM_HOOK_INIT(path_rmdir, tomoyo_path_rmdir), LSM_HOOK_INIT(path_symlink, tomoyo_path_symlink), LSM_HOOK_INIT(path_mknod, tomoyo_path_mknod), LSM_HOOK_INIT(path_link, tomoyo_path_link), LSM_HOOK_INIT(path_rename, tomoyo_path_rename), LSM_HOOK_INIT(inode_getattr, tomoyo_inode_getattr), LSM_HOOK_INIT(file_ioctl, tomoyo_file_ioctl), LSM_HOOK_INIT(file_ioctl_compat, tomoyo_file_ioctl), LSM_HOOK_INIT(path_chmod, tomoyo_path_chmod), LSM_HOOK_INIT(path_chown, tomoyo_path_chown), LSM_HOOK_INIT(path_chroot, tomoyo_path_chroot), LSM_HOOK_INIT(sb_mount, tomoyo_sb_mount), LSM_HOOK_INIT(sb_umount, tomoyo_sb_umount), LSM_HOOK_INIT(sb_pivotroot, tomoyo_sb_pivotroot), LSM_HOOK_INIT(socket_bind, tomoyo_socket_bind), LSM_HOOK_INIT(socket_connect, tomoyo_socket_connect), LSM_HOOK_INIT(socket_listen, tomoyo_socket_listen), LSM_HOOK_INIT(socket_sendmsg, tomoyo_socket_sendmsg), }; /* Lock for GC. */ DEFINE_SRCU(tomoyo_ss); int tomoyo_enabled __ro_after_init = 1; /** * tomoyo_init - Register TOMOYO Linux as a LSM module. * * Returns 0. */ static int __init tomoyo_init(void) { struct tomoyo_task *s = tomoyo_task(current); /* register ourselves with the security framework */ security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks), &tomoyo_lsmid); pr_info("TOMOYO Linux initialized\n"); s->domain_info = &tomoyo_kernel_domain; atomic_inc(&tomoyo_kernel_domain.users); s->old_domain_info = NULL; tomoyo_mm_init(); return 0; } DEFINE_LSM(tomoyo) = { .name = "tomoyo", .enabled = &tomoyo_enabled, .flags = LSM_FLAG_LEGACY_MAJOR, .blobs = &tomoyo_blob_sizes, .init = tomoyo_init, }; |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux I2C core * * Copyright (C) 1995-99 Simon G. Vogl * With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> * Mux support by Rodolfo Giometti <giometti@enneenne.com> and * Michael Lawnick <michael.lawnick.ext@nsn.com> * * Copyright (C) 2013-2017 Wolfram Sang <wsa@kernel.org> */ #define pr_fmt(fmt) "i2c-core: " fmt #include <dt-bindings/i2c/i2c.h> #include <linux/acpi.h> #include <linux/clk/clk-conf.h> #include <linux/completion.h> #include <linux/debugfs.h> #include <linux/delay.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/gpio/consumer.h> #include <linux/i2c.h> #include <linux/i2c-smbus.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/irqflags.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/of_device.h> #include <linux/of.h> #include <linux/of_irq.h> #include <linux/pinctrl/consumer.h> #include <linux/pinctrl/devinfo.h> #include <linux/pm_domain.h> #include <linux/pm_runtime.h> #include <linux/pm_wakeirq.h> #include <linux/property.h> #include <linux/rwsem.h> #include <linux/slab.h> #include "i2c-core.h" #define CREATE_TRACE_POINTS #include <trace/events/i2c.h> #define I2C_ADDR_OFFSET_TEN_BIT 0xa000 #define I2C_ADDR_OFFSET_SLAVE 0x1000 #define I2C_ADDR_7BITS_MAX 0x77 #define I2C_ADDR_7BITS_COUNT (I2C_ADDR_7BITS_MAX + 1) #define I2C_ADDR_DEVICE_ID 0x7c /* * core_lock protects i2c_adapter_idr, and guarantees that device detection, * deletion of detected devices are serialized */ static DEFINE_MUTEX(core_lock); static DEFINE_IDR(i2c_adapter_idr); static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver); static DEFINE_STATIC_KEY_FALSE(i2c_trace_msg_key); static bool is_registered; static struct dentry *i2c_debugfs_root; int i2c_transfer_trace_reg(void) { static_branch_inc(&i2c_trace_msg_key); return 0; } void i2c_transfer_trace_unreg(void) { static_branch_dec(&i2c_trace_msg_key); } const char *i2c_freq_mode_string(u32 bus_freq_hz) { switch (bus_freq_hz) { case I2C_MAX_STANDARD_MODE_FREQ: return "Standard Mode (100 kHz)"; case I2C_MAX_FAST_MODE_FREQ: return "Fast Mode (400 kHz)"; case I2C_MAX_FAST_MODE_PLUS_FREQ: return "Fast Mode Plus (1.0 MHz)"; case I2C_MAX_TURBO_MODE_FREQ: return "Turbo Mode (1.4 MHz)"; case I2C_MAX_HIGH_SPEED_MODE_FREQ: return "High Speed Mode (3.4 MHz)"; case I2C_MAX_ULTRA_FAST_MODE_FREQ: return "Ultra Fast Mode (5.0 MHz)"; default: return "Unknown Mode"; } } EXPORT_SYMBOL_GPL(i2c_freq_mode_string); const struct i2c_device_id *i2c_match_id(const struct i2c_device_id *id, const struct i2c_client *client) { if (!(id && client)) return NULL; while (id->name[0]) { if (strcmp(client->name, id->name) == 0) return id; id++; } return NULL; } EXPORT_SYMBOL_GPL(i2c_match_id); const void *i2c_get_match_data(const struct i2c_client *client) { struct i2c_driver *driver = to_i2c_driver(client->dev.driver); const struct i2c_device_id *match; const void *data; data = device_get_match_data(&client->dev); if (!data) { match = i2c_match_id(driver->id_table, client); if (!match) return NULL; data = (const void *)match->driver_data; } return data; } EXPORT_SYMBOL(i2c_get_match_data); static int i2c_device_match(struct device *dev, struct device_driver *drv) { struct i2c_client *client = i2c_verify_client(dev); struct i2c_driver *driver; /* Attempt an OF style match */ if (i2c_of_match_device(drv->of_match_table, client)) return 1; /* Then ACPI style match */ if (acpi_driver_match_device(dev, drv)) return 1; driver = to_i2c_driver(drv); /* Finally an I2C match */ if (i2c_match_id(driver->id_table, client)) return 1; return 0; } static int i2c_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct i2c_client *client = to_i2c_client(dev); int rc; rc = of_device_uevent_modalias(dev, env); if (rc != -ENODEV) return rc; rc = acpi_device_uevent_modalias(dev, env); if (rc != -ENODEV) return rc; return add_uevent_var(env, "MODALIAS=%s%s", I2C_MODULE_PREFIX, client->name); } /* i2c bus recovery routines */ static int get_scl_gpio_value(struct i2c_adapter *adap) { return gpiod_get_value_cansleep(adap->bus_recovery_info->scl_gpiod); } static void set_scl_gpio_value(struct i2c_adapter *adap, int val) { gpiod_set_value_cansleep(adap->bus_recovery_info->scl_gpiod, val); } static int get_sda_gpio_value(struct i2c_adapter *adap) { return gpiod_get_value_cansleep(adap->bus_recovery_info->sda_gpiod); } static void set_sda_gpio_value(struct i2c_adapter *adap, int val) { gpiod_set_value_cansleep(adap->bus_recovery_info->sda_gpiod, val); } static int i2c_generic_bus_free(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; int ret = -EOPNOTSUPP; if (bri->get_bus_free) ret = bri->get_bus_free(adap); else if (bri->get_sda) ret = bri->get_sda(adap); if (ret < 0) return ret; return ret ? 0 : -EBUSY; } /* * We are generating clock pulses. ndelay() determines durating of clk pulses. * We will generate clock with rate 100 KHz and so duration of both clock levels * is: delay in ns = (10^6 / 100) / 2 */ #define RECOVERY_NDELAY 5000 #define RECOVERY_CLK_CNT 9 int i2c_generic_scl_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; int i = 0, scl = 1, ret = 0; if (bri->prepare_recovery) bri->prepare_recovery(adap); if (bri->pinctrl) pinctrl_select_state(bri->pinctrl, bri->pins_gpio); /* * If we can set SDA, we will always create a STOP to ensure additional * pulses will do no harm. This is achieved by letting SDA follow SCL * half a cycle later. Check the 'incomplete_write_byte' fault injector * for details. Note that we must honour tsu:sto, 4us, but lets use 5us * here for simplicity. */ bri->set_scl(adap, scl); ndelay(RECOVERY_NDELAY); if (bri->set_sda) bri->set_sda(adap, scl); ndelay(RECOVERY_NDELAY / 2); /* * By this time SCL is high, as we need to give 9 falling-rising edges */ while (i++ < RECOVERY_CLK_CNT * 2) { if (scl) { /* SCL shouldn't be low here */ if (!bri->get_scl(adap)) { dev_err(&adap->dev, "SCL is stuck low, exit recovery\n"); ret = -EBUSY; break; } } scl = !scl; bri->set_scl(adap, scl); /* Creating STOP again, see above */ if (scl) { /* Honour minimum tsu:sto */ ndelay(RECOVERY_NDELAY); } else { /* Honour minimum tf and thd:dat */ ndelay(RECOVERY_NDELAY / 2); } if (bri->set_sda) bri->set_sda(adap, scl); ndelay(RECOVERY_NDELAY / 2); if (scl) { ret = i2c_generic_bus_free(adap); if (ret == 0) break; } } /* If we can't check bus status, assume recovery worked */ if (ret == -EOPNOTSUPP) ret = 0; if (bri->unprepare_recovery) bri->unprepare_recovery(adap); if (bri->pinctrl) pinctrl_select_state(bri->pinctrl, bri->pins_default); return ret; } EXPORT_SYMBOL_GPL(i2c_generic_scl_recovery); int i2c_recover_bus(struct i2c_adapter *adap) { if (!adap->bus_recovery_info) return -EBUSY; dev_dbg(&adap->dev, "Trying i2c bus recovery\n"); return adap->bus_recovery_info->recover_bus(adap); } EXPORT_SYMBOL_GPL(i2c_recover_bus); static void i2c_gpio_init_pinctrl_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; struct device *dev = &adap->dev; struct pinctrl *p = bri->pinctrl ?: dev_pinctrl(dev->parent); bri->pinctrl = p; /* * we can't change states without pinctrl, so remove the states if * populated */ if (!p) { bri->pins_default = NULL; bri->pins_gpio = NULL; return; } if (!bri->pins_default) { bri->pins_default = pinctrl_lookup_state(p, PINCTRL_STATE_DEFAULT); if (IS_ERR(bri->pins_default)) { dev_dbg(dev, PINCTRL_STATE_DEFAULT " state not found for GPIO recovery\n"); bri->pins_default = NULL; } } if (!bri->pins_gpio) { bri->pins_gpio = pinctrl_lookup_state(p, "gpio"); if (IS_ERR(bri->pins_gpio)) bri->pins_gpio = pinctrl_lookup_state(p, "recovery"); if (IS_ERR(bri->pins_gpio)) { dev_dbg(dev, "no gpio or recovery state found for GPIO recovery\n"); bri->pins_gpio = NULL; } } /* for pinctrl state changes, we need all the information */ if (bri->pins_default && bri->pins_gpio) { dev_info(dev, "using pinctrl states for GPIO recovery"); } else { bri->pinctrl = NULL; bri->pins_default = NULL; bri->pins_gpio = NULL; } } static int i2c_gpio_init_generic_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; struct device *dev = &adap->dev; struct gpio_desc *gpiod; int ret = 0; /* * don't touch the recovery information if the driver is not using * generic SCL recovery */ if (bri->recover_bus && bri->recover_bus != i2c_generic_scl_recovery) return 0; /* * pins might be taken as GPIO, so we should inform pinctrl about * this and move the state to GPIO */ if (bri->pinctrl) pinctrl_select_state(bri->pinctrl, bri->pins_gpio); /* * if there is incomplete or no recovery information, see if generic * GPIO recovery is available */ if (!bri->scl_gpiod) { gpiod = devm_gpiod_get(dev, "scl", GPIOD_OUT_HIGH_OPEN_DRAIN); if (PTR_ERR(gpiod) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; goto cleanup_pinctrl_state; } if (!IS_ERR(gpiod)) { bri->scl_gpiod = gpiod; bri->recover_bus = i2c_generic_scl_recovery; dev_info(dev, "using generic GPIOs for recovery\n"); } } /* SDA GPIOD line is optional, so we care about DEFER only */ if (!bri->sda_gpiod) { /* * We have SCL. Pull SCL low and wait a bit so that SDA glitches * have no effect. */ gpiod_direction_output(bri->scl_gpiod, 0); udelay(10); gpiod = devm_gpiod_get(dev, "sda", GPIOD_IN); /* Wait a bit in case of a SDA glitch, and then release SCL. */ udelay(10); gpiod_direction_output(bri->scl_gpiod, 1); if (PTR_ERR(gpiod) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; goto cleanup_pinctrl_state; } if (!IS_ERR(gpiod)) bri->sda_gpiod = gpiod; } cleanup_pinctrl_state: /* change the state of the pins back to their default state */ if (bri->pinctrl) pinctrl_select_state(bri->pinctrl, bri->pins_default); return ret; } static int i2c_gpio_init_recovery(struct i2c_adapter *adap) { i2c_gpio_init_pinctrl_recovery(adap); return i2c_gpio_init_generic_recovery(adap); } static int i2c_init_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; bool is_error_level = true; char *err_str; if (!bri) return 0; if (i2c_gpio_init_recovery(adap) == -EPROBE_DEFER) return -EPROBE_DEFER; if (!bri->recover_bus) { err_str = "no suitable method provided"; is_error_level = false; goto err; } if (bri->scl_gpiod && bri->recover_bus == i2c_generic_scl_recovery) { bri->get_scl = get_scl_gpio_value; bri->set_scl = set_scl_gpio_value; if (bri->sda_gpiod) { bri->get_sda = get_sda_gpio_value; /* FIXME: add proper flag instead of '0' once available */ if (gpiod_get_direction(bri->sda_gpiod) == 0) bri->set_sda = set_sda_gpio_value; } } else if (bri->recover_bus == i2c_generic_scl_recovery) { /* Generic SCL recovery */ if (!bri->set_scl || !bri->get_scl) { err_str = "no {get|set}_scl() found"; goto err; } if (!bri->set_sda && !bri->get_sda) { err_str = "either get_sda() or set_sda() needed"; goto err; } } return 0; err: if (is_error_level) dev_err(&adap->dev, "Not using recovery: %s\n", err_str); else dev_dbg(&adap->dev, "Not using recovery: %s\n", err_str); adap->bus_recovery_info = NULL; return -EINVAL; } static int i2c_smbus_host_notify_to_irq(const struct i2c_client *client) { struct i2c_adapter *adap = client->adapter; unsigned int irq; if (!adap->host_notify_domain) return -ENXIO; if (client->flags & I2C_CLIENT_TEN) return -EINVAL; irq = irq_create_mapping(adap->host_notify_domain, client->addr); return irq > 0 ? irq : -ENXIO; } static int i2c_device_probe(struct device *dev) { struct i2c_client *client = i2c_verify_client(dev); struct i2c_driver *driver; bool do_power_on; int status; if (!client) return 0; client->irq = client->init_irq; if (!client->irq) { int irq = -ENOENT; if (client->flags & I2C_CLIENT_HOST_NOTIFY) { dev_dbg(dev, "Using Host Notify IRQ\n"); /* Keep adapter active when Host Notify is required */ pm_runtime_get_sync(&client->adapter->dev); irq = i2c_smbus_host_notify_to_irq(client); } else if (dev->of_node) { irq = of_irq_get_byname(dev->of_node, "irq"); if (irq == -EINVAL || irq == -ENODATA) irq = of_irq_get(dev->of_node, 0); } else if (ACPI_COMPANION(dev)) { bool wake_capable; irq = i2c_acpi_get_irq(client, &wake_capable); if (irq > 0 && wake_capable) client->flags |= I2C_CLIENT_WAKE; } if (irq == -EPROBE_DEFER) { status = irq; goto put_sync_adapter; } if (irq < 0) irq = 0; client->irq = irq; } driver = to_i2c_driver(dev->driver); /* * An I2C ID table is not mandatory, if and only if, a suitable OF * or ACPI ID table is supplied for the probing device. */ if (!driver->id_table && !acpi_driver_match_device(dev, dev->driver) && !i2c_of_match_device(dev->driver->of_match_table, client)) { status = -ENODEV; goto put_sync_adapter; } if (client->flags & I2C_CLIENT_WAKE) { int wakeirq; wakeirq = of_irq_get_byname(dev->of_node, "wakeup"); if (wakeirq == -EPROBE_DEFER) { status = wakeirq; goto put_sync_adapter; } device_init_wakeup(&client->dev, true); if (wakeirq > 0 && wakeirq != client->irq) status = dev_pm_set_dedicated_wake_irq(dev, wakeirq); else if (client->irq > 0) status = dev_pm_set_wake_irq(dev, client->irq); else status = 0; if (status) dev_warn(&client->dev, "failed to set up wakeup irq\n"); } dev_dbg(dev, "probe\n"); status = of_clk_set_defaults(dev->of_node, false); if (status < 0) goto err_clear_wakeup_irq; do_power_on = !i2c_acpi_waive_d0_probe(dev); status = dev_pm_domain_attach(&client->dev, do_power_on); if (status) goto err_clear_wakeup_irq; client->devres_group_id = devres_open_group(&client->dev, NULL, GFP_KERNEL); if (!client->devres_group_id) { status = -ENOMEM; goto err_detach_pm_domain; } if (driver->probe) status = driver->probe(client); else status = -EINVAL; /* * Note that we are not closing the devres group opened above so * even resources that were attached to the device after probe is * run are released when i2c_device_remove() is executed. This is * needed as some drivers would allocate additional resources, * for example when updating firmware. */ if (status) goto err_release_driver_resources; return 0; err_release_driver_resources: devres_release_group(&client->dev, client->devres_group_id); err_detach_pm_domain: dev_pm_domain_detach(&client->dev, do_power_on); err_clear_wakeup_irq: dev_pm_clear_wake_irq(&client->dev); device_init_wakeup(&client->dev, false); put_sync_adapter: if (client->flags & I2C_CLIENT_HOST_NOTIFY) pm_runtime_put_sync(&client->adapter->dev); return status; } static void i2c_device_remove(struct device *dev) { struct i2c_client *client = to_i2c_client(dev); struct i2c_driver *driver; driver = to_i2c_driver(dev->driver); if (driver->remove) { dev_dbg(dev, "remove\n"); driver->remove(client); } devres_release_group(&client->dev, client->devres_group_id); dev_pm_domain_detach(&client->dev, true); dev_pm_clear_wake_irq(&client->dev); device_init_wakeup(&client->dev, false); client->irq = 0; if (client->flags & I2C_CLIENT_HOST_NOTIFY) pm_runtime_put(&client->adapter->dev); } static void i2c_device_shutdown(struct device *dev) { struct i2c_client *client = i2c_verify_client(dev); struct i2c_driver *driver; if (!client || !dev->driver) return; driver = to_i2c_driver(dev->driver); if (driver->shutdown) driver->shutdown(client); else if (client->irq > 0) disable_irq(client->irq); } static void i2c_client_dev_release(struct device *dev) { kfree(to_i2c_client(dev)); } static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", dev->type == &i2c_client_type ? to_i2c_client(dev)->name : to_i2c_adapter(dev)->name); } static DEVICE_ATTR_RO(name); static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { struct i2c_client *client = to_i2c_client(dev); int len; len = of_device_modalias(dev, buf, PAGE_SIZE); if (len != -ENODEV) return len; len = acpi_device_modalias(dev, buf, PAGE_SIZE - 1); if (len != -ENODEV) return len; return sprintf(buf, "%s%s\n", I2C_MODULE_PREFIX, client->name); } static DEVICE_ATTR_RO(modalias); static struct attribute *i2c_dev_attrs[] = { &dev_attr_name.attr, /* modalias helps coldplug: modprobe $(cat .../modalias) */ &dev_attr_modalias.attr, NULL }; ATTRIBUTE_GROUPS(i2c_dev); const struct bus_type i2c_bus_type = { .name = "i2c", .match = i2c_device_match, .probe = i2c_device_probe, .remove = i2c_device_remove, .shutdown = i2c_device_shutdown, }; EXPORT_SYMBOL_GPL(i2c_bus_type); struct device_type i2c_client_type = { .groups = i2c_dev_groups, .uevent = i2c_device_uevent, .release = i2c_client_dev_release, }; EXPORT_SYMBOL_GPL(i2c_client_type); /** * i2c_verify_client - return parameter as i2c_client, or NULL * @dev: device, probably from some driver model iterator * * When traversing the driver model tree, perhaps using driver model * iterators like @device_for_each_child(), you can't assume very much * about the nodes you find. Use this function to avoid oopses caused * by wrongly treating some non-I2C device as an i2c_client. */ struct i2c_client *i2c_verify_client(struct device *dev) { return (dev->type == &i2c_client_type) ? to_i2c_client(dev) : NULL; } EXPORT_SYMBOL(i2c_verify_client); /* Return a unique address which takes the flags of the client into account */ static unsigned short i2c_encode_flags_to_addr(struct i2c_client *client) { unsigned short addr = client->addr; /* For some client flags, add an arbitrary offset to avoid collisions */ if (client->flags & I2C_CLIENT_TEN) addr |= I2C_ADDR_OFFSET_TEN_BIT; if (client->flags & I2C_CLIENT_SLAVE) addr |= I2C_ADDR_OFFSET_SLAVE; return addr; } /* This is a permissive address validity check, I2C address map constraints * are purposely not enforced, except for the general call address. */ static int i2c_check_addr_validity(unsigned int addr, unsigned short flags) { if (flags & I2C_CLIENT_TEN) { /* 10-bit address, all values are valid */ if (addr > 0x3ff) return -EINVAL; } else { /* 7-bit address, reject the general call address */ if (addr == 0x00 || addr > 0x7f) return -EINVAL; } return 0; } /* And this is a strict address validity check, used when probing. If a * device uses a reserved address, then it shouldn't be probed. 7-bit * addressing is assumed, 10-bit address devices are rare and should be * explicitly enumerated. */ int i2c_check_7bit_addr_validity_strict(unsigned short addr) { /* * Reserved addresses per I2C specification: * 0x00 General call address / START byte * 0x01 CBUS address * 0x02 Reserved for different bus format * 0x03 Reserved for future purposes * 0x04-0x07 Hs-mode master code * 0x78-0x7b 10-bit slave addressing * 0x7c-0x7f Reserved for future purposes */ if (addr < 0x08 || addr > 0x77) return -EINVAL; return 0; } static int __i2c_check_addr_busy(struct device *dev, void *addrp) { struct i2c_client *client = i2c_verify_client(dev); int addr = *(int *)addrp; if (client && i2c_encode_flags_to_addr(client) == addr) return -EBUSY; return 0; } /* walk up mux tree */ static int i2c_check_mux_parents(struct i2c_adapter *adapter, int addr) { struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter); int result; result = device_for_each_child(&adapter->dev, &addr, __i2c_check_addr_busy); if (!result && parent) result = i2c_check_mux_parents(parent, addr); return result; } /* recurse down mux tree */ static int i2c_check_mux_children(struct device *dev, void *addrp) { int result; if (dev->type == &i2c_adapter_type) result = device_for_each_child(dev, addrp, i2c_check_mux_children); else result = __i2c_check_addr_busy(dev, addrp); return result; } static int i2c_check_addr_busy(struct i2c_adapter *adapter, int addr) { struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter); int result = 0; if (parent) result = i2c_check_mux_parents(parent, addr); if (!result) result = device_for_each_child(&adapter->dev, &addr, i2c_check_mux_children); return result; } /** * i2c_adapter_lock_bus - Get exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER locks the root i2c adapter, I2C_LOCK_SEGMENT * locks only this branch in the adapter tree */ static void i2c_adapter_lock_bus(struct i2c_adapter *adapter, unsigned int flags) { rt_mutex_lock_nested(&adapter->bus_lock, i2c_adapter_depth(adapter)); } /** * i2c_adapter_trylock_bus - Try to get exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER trylocks the root i2c adapter, I2C_LOCK_SEGMENT * trylocks only this branch in the adapter tree */ static int i2c_adapter_trylock_bus(struct i2c_adapter *adapter, unsigned int flags) { return rt_mutex_trylock(&adapter->bus_lock); } /** * i2c_adapter_unlock_bus - Release exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER unlocks the root i2c adapter, I2C_LOCK_SEGMENT * unlocks only this branch in the adapter tree */ static void i2c_adapter_unlock_bus(struct i2c_adapter *adapter, unsigned int flags) { rt_mutex_unlock(&adapter->bus_lock); } static void i2c_dev_set_name(struct i2c_adapter *adap, struct i2c_client *client, struct i2c_board_info const *info) { struct acpi_device *adev = ACPI_COMPANION(&client->dev); if (info && info->dev_name) { dev_set_name(&client->dev, "i2c-%s", info->dev_name); return; } if (adev) { dev_set_name(&client->dev, "i2c-%s", acpi_dev_name(adev)); return; } dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap), i2c_encode_flags_to_addr(client)); } int i2c_dev_irq_from_resources(const struct resource *resources, unsigned int num_resources) { struct irq_data *irqd; int i; for (i = 0; i < num_resources; i++) { const struct resource *r = &resources[i]; if (resource_type(r) != IORESOURCE_IRQ) continue; if (r->flags & IORESOURCE_BITS) { irqd = irq_get_irq_data(r->start); if (!irqd) break; irqd_set_trigger_type(irqd, r->flags & IORESOURCE_BITS); } return r->start; } return 0; } /** * i2c_new_client_device - instantiate an i2c device * @adap: the adapter managing the device * @info: describes one I2C device; bus_num is ignored * Context: can sleep * * Create an i2c device. Binding is handled through driver model * probe()/remove() methods. A driver may be bound to this device when we * return from this function, or any later moment (e.g. maybe hotplugging will * load the driver module). This call is not appropriate for use by mainboard * initialization logic, which usually runs during an arch_initcall() long * before any i2c_adapter could exist. * * This returns the new i2c client, which may be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. */ struct i2c_client * i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info) { struct i2c_client *client; bool need_put = false; int status; client = kzalloc(sizeof *client, GFP_KERNEL); if (!client) return ERR_PTR(-ENOMEM); client->adapter = adap; client->dev.platform_data = info->platform_data; client->flags = info->flags; client->addr = info->addr; client->init_irq = info->irq; if (!client->init_irq) client->init_irq = i2c_dev_irq_from_resources(info->resources, info->num_resources); strscpy(client->name, info->type, sizeof(client->name)); status = i2c_check_addr_validity(client->addr, client->flags); if (status) { dev_err(&adap->dev, "Invalid %d-bit I2C address 0x%02hx\n", client->flags & I2C_CLIENT_TEN ? 10 : 7, client->addr); goto out_err_silent; } /* Check for address business */ status = i2c_check_addr_busy(adap, i2c_encode_flags_to_addr(client)); if (status) goto out_err; client->dev.parent = &client->adapter->dev; client->dev.bus = &i2c_bus_type; client->dev.type = &i2c_client_type; client->dev.of_node = of_node_get(info->of_node); client->dev.fwnode = info->fwnode; device_enable_async_suspend(&client->dev); if (info->swnode) { status = device_add_software_node(&client->dev, info->swnode); if (status) { dev_err(&adap->dev, "Failed to add software node to client %s: %d\n", client->name, status); goto out_err_put_of_node; } } i2c_dev_set_name(adap, client, info); status = device_register(&client->dev); if (status) goto out_remove_swnode; dev_dbg(&adap->dev, "client [%s] registered with bus id %s\n", client->name, dev_name(&client->dev)); return client; out_remove_swnode: device_remove_software_node(&client->dev); need_put = true; out_err_put_of_node: of_node_put(info->of_node); out_err: dev_err(&adap->dev, "Failed to register i2c client %s at 0x%02x (%d)\n", client->name, client->addr, status); out_err_silent: if (need_put) put_device(&client->dev); else kfree(client); return ERR_PTR(status); } EXPORT_SYMBOL_GPL(i2c_new_client_device); /** * i2c_unregister_device - reverse effect of i2c_new_*_device() * @client: value returned from i2c_new_*_device() * Context: can sleep */ void i2c_unregister_device(struct i2c_client *client) { if (IS_ERR_OR_NULL(client)) return; if (client->dev.of_node) { of_node_clear_flag(client->dev.of_node, OF_POPULATED); of_node_put(client->dev.of_node); } if (ACPI_COMPANION(&client->dev)) acpi_device_clear_enumerated(ACPI_COMPANION(&client->dev)); device_remove_software_node(&client->dev); device_unregister(&client->dev); } EXPORT_SYMBOL_GPL(i2c_unregister_device); /** * i2c_find_device_by_fwnode() - find an i2c_client for the fwnode * @fwnode: &struct fwnode_handle corresponding to the &struct i2c_client * * Look up and return the &struct i2c_client corresponding to the @fwnode. * If no client can be found, or @fwnode is NULL, this returns NULL. * * The user must call put_device(&client->dev) once done with the i2c client. */ struct i2c_client *i2c_find_device_by_fwnode(struct fwnode_handle *fwnode) { struct i2c_client *client; struct device *dev; if (!fwnode) return NULL; dev = bus_find_device_by_fwnode(&i2c_bus_type, fwnode); if (!dev) return NULL; client = i2c_verify_client(dev); if (!client) put_device(dev); return client; } EXPORT_SYMBOL(i2c_find_device_by_fwnode); static const struct i2c_device_id dummy_id[] = { { "dummy", 0 }, { }, }; static int dummy_probe(struct i2c_client *client) { return 0; } static struct i2c_driver dummy_driver = { .driver.name = "dummy", .probe = dummy_probe, .id_table = dummy_id, }; /** * i2c_new_dummy_device - return a new i2c device bound to a dummy driver * @adapter: the adapter managing the device * @address: seven bit address to be used * Context: can sleep * * This returns an I2C client bound to the "dummy" driver, intended for use * with devices that consume multiple addresses. Examples of such chips * include various EEPROMS (like 24c04 and 24c08 models). * * These dummy devices have two main uses. First, most I2C and SMBus calls * except i2c_transfer() need a client handle; the dummy will be that handle. * And second, this prevents the specified address from being bound to a * different driver. * * This returns the new i2c client, which should be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. */ struct i2c_client *i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address) { struct i2c_board_info info = { I2C_BOARD_INFO("dummy", address), }; return i2c_new_client_device(adapter, &info); } EXPORT_SYMBOL_GPL(i2c_new_dummy_device); static void devm_i2c_release_dummy(void *client) { i2c_unregister_device(client); } /** * devm_i2c_new_dummy_device - return a new i2c device bound to a dummy driver * @dev: device the managed resource is bound to * @adapter: the adapter managing the device * @address: seven bit address to be used * Context: can sleep * * This is the device-managed version of @i2c_new_dummy_device. It returns the * new i2c client or an ERR_PTR in case of an error. */ struct i2c_client *devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adapter, u16 address) { struct i2c_client *client; int ret; client = i2c_new_dummy_device(adapter, address); if (IS_ERR(client)) return client; ret = devm_add_action_or_reset(dev, devm_i2c_release_dummy, client); if (ret) return ERR_PTR(ret); return client; } EXPORT_SYMBOL_GPL(devm_i2c_new_dummy_device); /** * i2c_new_ancillary_device - Helper to get the instantiated secondary address * and create the associated device * @client: Handle to the primary client * @name: Handle to specify which secondary address to get * @default_addr: Used as a fallback if no secondary address was specified * Context: can sleep * * I2C clients can be composed of multiple I2C slaves bound together in a single * component. The I2C client driver then binds to the master I2C slave and needs * to create I2C dummy clients to communicate with all the other slaves. * * This function creates and returns an I2C dummy client whose I2C address is * retrieved from the platform firmware based on the given slave name. If no * address is specified by the firmware default_addr is used. * * On DT-based platforms the address is retrieved from the "reg" property entry * cell whose "reg-names" value matches the slave name. * * This returns the new i2c client, which should be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. */ struct i2c_client *i2c_new_ancillary_device(struct i2c_client *client, const char *name, u16 default_addr) { struct device_node *np = client->dev.of_node; u32 addr = default_addr; int i; if (np) { i = of_property_match_string(np, "reg-names", name); if (i >= 0) of_property_read_u32_index(np, "reg", i, &addr); } dev_dbg(&client->adapter->dev, "Address for %s : 0x%x\n", name, addr); return i2c_new_dummy_device(client->adapter, addr); } EXPORT_SYMBOL_GPL(i2c_new_ancillary_device); /* ------------------------------------------------------------------------- */ /* I2C bus adapters -- one roots each I2C or SMBUS segment */ static void i2c_adapter_dev_release(struct device *dev) { struct i2c_adapter *adap = to_i2c_adapter(dev); complete(&adap->dev_released); } unsigned int i2c_adapter_depth(struct i2c_adapter *adapter) { unsigned int depth = 0; struct device *parent; for (parent = adapter->dev.parent; parent; parent = parent->parent) if (parent->type == &i2c_adapter_type) depth++; WARN_ONCE(depth >= MAX_LOCKDEP_SUBCLASSES, "adapter depth exceeds lockdep subclass limit\n"); return depth; } EXPORT_SYMBOL_GPL(i2c_adapter_depth); /* * Let users instantiate I2C devices through sysfs. This can be used when * platform initialization code doesn't contain the proper data for * whatever reason. Also useful for drivers that do device detection and * detection fails, either because the device uses an unexpected address, * or this is a compatible device with different ID register values. * * Parameter checking may look overzealous, but we really don't want * the user to provide incorrect parameters. */ static ssize_t new_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct i2c_adapter *adap = to_i2c_adapter(dev); struct i2c_board_info info; struct i2c_client *client; char *blank, end; int res; memset(&info, 0, sizeof(struct i2c_board_info)); blank = strchr(buf, ' '); if (!blank) { dev_err(dev, "%s: Missing parameters\n", "new_device"); return -EINVAL; } if (blank - buf > I2C_NAME_SIZE - 1) { dev_err(dev, "%s: Invalid device name\n", "new_device"); return -EINVAL; } memcpy(info.type, buf, blank - buf); /* Parse remaining parameters, reject extra parameters */ res = sscanf(++blank, "%hi%c", &info.addr, &end); if (res < 1) { dev_err(dev, "%s: Can't parse I2C address\n", "new_device"); return -EINVAL; } if (res > 1 && end != '\n') { dev_err(dev, "%s: Extra parameters\n", "new_device"); return -EINVAL; } if ((info.addr & I2C_ADDR_OFFSET_TEN_BIT) == I2C_ADDR_OFFSET_TEN_BIT) { info.addr &= ~I2C_ADDR_OFFSET_TEN_BIT; info.flags |= I2C_CLIENT_TEN; } if (info.addr & I2C_ADDR_OFFSET_SLAVE) { info.addr &= ~I2C_ADDR_OFFSET_SLAVE; info.flags |= I2C_CLIENT_SLAVE; } client = i2c_new_client_device(adap, &info); if (IS_ERR(client)) return PTR_ERR(client); /* Keep track of the added device */ mutex_lock(&adap->userspace_clients_lock); list_add_tail(&client->detected, &adap->userspace_clients); mutex_unlock(&adap->userspace_clients_lock); dev_info(dev, "%s: Instantiated device %s at 0x%02hx\n", "new_device", info.type, info.addr); return count; } static DEVICE_ATTR_WO(new_device); /* * And of course let the users delete the devices they instantiated, if * they got it wrong. This interface can only be used to delete devices * instantiated by i2c_sysfs_new_device above. This guarantees that we * don't delete devices to which some kernel code still has references. * * Parameter checking may look overzealous, but we really don't want * the user to delete the wrong device. */ static ssize_t delete_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct i2c_adapter *adap = to_i2c_adapter(dev); struct i2c_client *client, *next; unsigned short addr; char end; int res; /* Parse parameters, reject extra parameters */ res = sscanf(buf, "%hi%c", &addr, &end); if (res < 1) { dev_err(dev, "%s: Can't parse I2C address\n", "delete_device"); return -EINVAL; } if (res > 1 && end != '\n') { dev_err(dev, "%s: Extra parameters\n", "delete_device"); return -EINVAL; } /* Make sure the device was added through sysfs */ res = -ENOENT; mutex_lock_nested(&adap->userspace_clients_lock, i2c_adapter_depth(adap)); list_for_each_entry_safe(client, next, &adap->userspace_clients, detected) { if (i2c_encode_flags_to_addr(client) == addr) { dev_info(dev, "%s: Deleting device %s at 0x%02hx\n", "delete_device", client->name, client->addr); list_del(&client->detected); i2c_unregister_device(client); res = count; break; } } mutex_unlock(&adap->userspace_clients_lock); if (res < 0) dev_err(dev, "%s: Can't find device in list\n", "delete_device"); return res; } static DEVICE_ATTR_IGNORE_LOCKDEP(delete_device, S_IWUSR, NULL, delete_device_store); static struct attribute *i2c_adapter_attrs[] = { &dev_attr_name.attr, &dev_attr_new_device.attr, &dev_attr_delete_device.attr, NULL }; ATTRIBUTE_GROUPS(i2c_adapter); struct device_type i2c_adapter_type = { .groups = i2c_adapter_groups, .release = i2c_adapter_dev_release, }; EXPORT_SYMBOL_GPL(i2c_adapter_type); /** * i2c_verify_adapter - return parameter as i2c_adapter or NULL * @dev: device, probably from some driver model iterator * * When traversing the driver model tree, perhaps using driver model * iterators like @device_for_each_child(), you can't assume very much * about the nodes you find. Use this function to avoid oopses caused * by wrongly treating some non-I2C device as an i2c_adapter. */ struct i2c_adapter *i2c_verify_adapter(struct device *dev) { return (dev->type == &i2c_adapter_type) ? to_i2c_adapter(dev) : NULL; } EXPORT_SYMBOL(i2c_verify_adapter); #ifdef CONFIG_I2C_COMPAT static struct class_compat *i2c_adapter_compat_class; #endif static void i2c_scan_static_board_info(struct i2c_adapter *adapter) { struct i2c_devinfo *devinfo; down_read(&__i2c_board_lock); list_for_each_entry(devinfo, &__i2c_board_list, list) { if (devinfo->busnum == adapter->nr && IS_ERR(i2c_new_client_device(adapter, &devinfo->board_info))) dev_err(&adapter->dev, "Can't create device at 0x%02x\n", devinfo->board_info.addr); } up_read(&__i2c_board_lock); } static int i2c_do_add_adapter(struct i2c_driver *driver, struct i2c_adapter *adap) { /* Detect supported devices on that bus, and instantiate them */ i2c_detect(adap, driver); return 0; } static int __process_new_adapter(struct device_driver *d, void *data) { return i2c_do_add_adapter(to_i2c_driver(d), data); } static const struct i2c_lock_operations i2c_adapter_lock_ops = { .lock_bus = i2c_adapter_lock_bus, .trylock_bus = i2c_adapter_trylock_bus, .unlock_bus = i2c_adapter_unlock_bus, }; static void i2c_host_notify_irq_teardown(struct i2c_adapter *adap) { struct irq_domain *domain = adap->host_notify_domain; irq_hw_number_t hwirq; if (!domain) return; for (hwirq = 0 ; hwirq < I2C_ADDR_7BITS_COUNT ; hwirq++) irq_dispose_mapping(irq_find_mapping(domain, hwirq)); irq_domain_remove(domain); adap->host_notify_domain = NULL; } static int i2c_host_notify_irq_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw_irq_num) { irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); return 0; } static const struct irq_domain_ops i2c_host_notify_irq_ops = { .map = i2c_host_notify_irq_map, }; static int i2c_setup_host_notify_irq_domain(struct i2c_adapter *adap) { struct irq_domain *domain; if (!i2c_check_functionality(adap, I2C_FUNC_SMBUS_HOST_NOTIFY)) return 0; domain = irq_domain_create_linear(adap->dev.parent->fwnode, I2C_ADDR_7BITS_COUNT, &i2c_host_notify_irq_ops, adap); if (!domain) return -ENOMEM; adap->host_notify_domain = domain; return 0; } /** * i2c_handle_smbus_host_notify - Forward a Host Notify event to the correct * I2C client. * @adap: the adapter * @addr: the I2C address of the notifying device * Context: can't sleep * * Helper function to be called from an I2C bus driver's interrupt * handler. It will schedule the Host Notify IRQ. */ int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr) { int irq; if (!adap) return -EINVAL; irq = irq_find_mapping(adap->host_notify_domain, addr); if (irq <= 0) return -ENXIO; generic_handle_irq_safe(irq); return 0; } EXPORT_SYMBOL_GPL(i2c_handle_smbus_host_notify); static int i2c_register_adapter(struct i2c_adapter *adap) { int res = -EINVAL; /* Can't register until after driver model init */ if (WARN_ON(!is_registered)) { res = -EAGAIN; goto out_list; } /* Sanity checks */ if (WARN(!adap->name[0], "i2c adapter has no name")) goto out_list; if (!adap->algo) { pr_err("adapter '%s': no algo supplied!\n", adap->name); goto out_list; } if (!adap->lock_ops) adap->lock_ops = &i2c_adapter_lock_ops; adap->locked_flags = 0; rt_mutex_init(&adap->bus_lock); rt_mutex_init(&adap->mux_lock); mutex_init(&adap->userspace_clients_lock); INIT_LIST_HEAD(&adap->userspace_clients); /* Set default timeout to 1 second if not already set */ if (adap->timeout == 0) adap->timeout = HZ; /* register soft irqs for Host Notify */ res = i2c_setup_host_notify_irq_domain(adap); if (res) { pr_err("adapter '%s': can't create Host Notify IRQs (%d)\n", adap->name, res); goto out_list; } dev_set_name(&adap->dev, "i2c-%d", adap->nr); adap->dev.bus = &i2c_bus_type; adap->dev.type = &i2c_adapter_type; res = device_register(&adap->dev); if (res) { pr_err("adapter '%s': can't register device (%d)\n", adap->name, res); goto out_list; } adap->debugfs = debugfs_create_dir(dev_name(&adap->dev), i2c_debugfs_root); res = i2c_setup_smbus_alert(adap); if (res) goto out_reg; device_enable_async_suspend(&adap->dev); pm_runtime_no_callbacks(&adap->dev); pm_suspend_ignore_children(&adap->dev, true); pm_runtime_enable(&adap->dev); res = i2c_init_recovery(adap); if (res == -EPROBE_DEFER) goto out_reg; dev_dbg(&adap->dev, "adapter [%s] registered\n", adap->name); #ifdef CONFIG_I2C_COMPAT res = class_compat_create_link(i2c_adapter_compat_class, &adap->dev, adap->dev.parent); if (res) dev_warn(&adap->dev, "Failed to create compatibility class link\n"); #endif /* create pre-declared device nodes */ of_i2c_register_devices(adap); i2c_acpi_install_space_handler(adap); i2c_acpi_register_devices(adap); if (adap->nr < __i2c_first_dynamic_bus_num) i2c_scan_static_board_info(adap); /* Notify drivers */ mutex_lock(&core_lock); bus_for_each_drv(&i2c_bus_type, NULL, adap, __process_new_adapter); mutex_unlock(&core_lock); return 0; out_reg: debugfs_remove_recursive(adap->debugfs); init_completion(&adap->dev_released); device_unregister(&adap->dev); wait_for_completion(&adap->dev_released); out_list: mutex_lock(&core_lock); idr_remove(&i2c_adapter_idr, adap->nr); mutex_unlock(&core_lock); return res; } /** * __i2c_add_numbered_adapter - i2c_add_numbered_adapter where nr is never -1 * @adap: the adapter to register (with adap->nr initialized) * Context: can sleep * * See i2c_add_numbered_adapter() for details. */ static int __i2c_add_numbered_adapter(struct i2c_adapter *adap) { int id; mutex_lock(&core_lock); id = idr_alloc(&i2c_adapter_idr, adap, adap->nr, adap->nr + 1, GFP_KERNEL); mutex_unlock(&core_lock); if (WARN(id < 0, "couldn't get idr")) return id == -ENOSPC ? -EBUSY : id; return i2c_register_adapter(adap); } /** * i2c_add_adapter - declare i2c adapter, use dynamic bus number * @adapter: the adapter to add * Context: can sleep * * This routine is used to declare an I2C adapter when its bus number * doesn't matter or when its bus number is specified by an dt alias. * Examples of bases when the bus number doesn't matter: I2C adapters * dynamically added by USB links or PCI plugin cards. * * When this returns zero, a new bus number was allocated and stored * in adap->nr, and the specified adapter became available for clients. * Otherwise, a negative errno value is returned. */ int i2c_add_adapter(struct i2c_adapter *adapter) { struct device *dev = &adapter->dev; int id; if (dev->of_node) { id = of_alias_get_id(dev->of_node, "i2c"); if (id >= 0) { adapter->nr = id; return __i2c_add_numbered_adapter(adapter); } } mutex_lock(&core_lock); id = idr_alloc(&i2c_adapter_idr, adapter, __i2c_first_dynamic_bus_num, 0, GFP_KERNEL); mutex_unlock(&core_lock); if (WARN(id < 0, "couldn't get idr")) return id; adapter->nr = id; return i2c_register_adapter(adapter); } EXPORT_SYMBOL(i2c_add_adapter); /** * i2c_add_numbered_adapter - declare i2c adapter, use static bus number * @adap: the adapter to register (with adap->nr initialized) * Context: can sleep * * This routine is used to declare an I2C adapter when its bus number * matters. For example, use it for I2C adapters from system-on-chip CPUs, * or otherwise built in to the system's mainboard, and where i2c_board_info * is used to properly configure I2C devices. * * If the requested bus number is set to -1, then this function will behave * identically to i2c_add_adapter, and will dynamically assign a bus number. * * If no devices have pre-been declared for this bus, then be sure to * register the adapter before any dynamically allocated ones. Otherwise * the required bus ID may not be available. * * When this returns zero, the specified adapter became available for * clients using the bus number provided in adap->nr. Also, the table * of I2C devices pre-declared using i2c_register_board_info() is scanned, * and the appropriate driver model device nodes are created. Otherwise, a * negative errno value is returned. */ int i2c_add_numbered_adapter(struct i2c_adapter *adap) { if (adap->nr == -1) /* -1 means dynamically assign bus id */ return i2c_add_adapter(adap); return __i2c_add_numbered_adapter(adap); } EXPORT_SYMBOL_GPL(i2c_add_numbered_adapter); static void i2c_do_del_adapter(struct i2c_driver *driver, struct i2c_adapter *adapter) { struct i2c_client *client, *_n; /* Remove the devices we created ourselves as the result of hardware * probing (using a driver's detect method) */ list_for_each_entry_safe(client, _n, &driver->clients, detected) { if (client->adapter == adapter) { dev_dbg(&adapter->dev, "Removing %s at 0x%x\n", client->name, client->addr); list_del(&client->detected); i2c_unregister_device(client); } } } static int __unregister_client(struct device *dev, void *dummy) { struct i2c_client *client = i2c_verify_client(dev); if (client && strcmp(client->name, "dummy")) i2c_unregister_device(client); return 0; } static int __unregister_dummy(struct device *dev, void *dummy) { struct i2c_client *client = i2c_verify_client(dev); i2c_unregister_device(client); return 0; } static int __process_removed_adapter(struct device_driver *d, void *data) { i2c_do_del_adapter(to_i2c_driver(d), data); return 0; } /** * i2c_del_adapter - unregister I2C adapter * @adap: the adapter being unregistered * Context: can sleep * * This unregisters an I2C adapter which was previously registered * by @i2c_add_adapter or @i2c_add_numbered_adapter. */ void i2c_del_adapter(struct i2c_adapter *adap) { struct i2c_adapter *found; struct i2c_client *client, *next; /* First make sure that this adapter was ever added */ mutex_lock(&core_lock); found = idr_find(&i2c_adapter_idr, adap->nr); mutex_unlock(&core_lock); if (found != adap) { pr_debug("attempting to delete unregistered adapter [%s]\n", adap->name); return; } i2c_acpi_remove_space_handler(adap); /* Tell drivers about this removal */ mutex_lock(&core_lock); bus_for_each_drv(&i2c_bus_type, NULL, adap, __process_removed_adapter); mutex_unlock(&core_lock); /* Remove devices instantiated from sysfs */ mutex_lock_nested(&adap->userspace_clients_lock, i2c_adapter_depth(adap)); list_for_each_entry_safe(client, next, &adap->userspace_clients, detected) { dev_dbg(&adap->dev, "Removing %s at 0x%x\n", client->name, client->addr); list_del(&client->detected); i2c_unregister_device(client); } mutex_unlock(&adap->userspace_clients_lock); /* Detach any active clients. This can't fail, thus we do not * check the returned value. This is a two-pass process, because * we can't remove the dummy devices during the first pass: they * could have been instantiated by real devices wishing to clean * them up properly, so we give them a chance to do that first. */ device_for_each_child(&adap->dev, NULL, __unregister_client); device_for_each_child(&adap->dev, NULL, __unregister_dummy); #ifdef CONFIG_I2C_COMPAT class_compat_remove_link(i2c_adapter_compat_class, &adap->dev, adap->dev.parent); #endif /* device name is gone after device_unregister */ dev_dbg(&adap->dev, "adapter [%s] unregistered\n", adap->name); pm_runtime_disable(&adap->dev); i2c_host_notify_irq_teardown(adap); debugfs_remove_recursive(adap->debugfs); /* wait until all references to the device are gone * * FIXME: This is old code and should ideally be replaced by an * alternative which results in decoupling the lifetime of the struct * device from the i2c_adapter, like spi or netdev do. Any solution * should be thoroughly tested with DEBUG_KOBJECT_RELEASE enabled! */ init_completion(&adap->dev_released); device_unregister(&adap->dev); wait_for_completion(&adap->dev_released); /* free bus id */ mutex_lock(&core_lock); idr_remove(&i2c_adapter_idr, adap->nr); mutex_unlock(&core_lock); /* Clear the device structure in case this adapter is ever going to be added again */ memset(&adap->dev, 0, sizeof(adap->dev)); } EXPORT_SYMBOL(i2c_del_adapter); static void devm_i2c_del_adapter(void *adapter) { i2c_del_adapter(adapter); } /** * devm_i2c_add_adapter - device-managed variant of i2c_add_adapter() * @dev: managing device for adding this I2C adapter * @adapter: the adapter to add * Context: can sleep * * Add adapter with dynamic bus number, same with i2c_add_adapter() * but the adapter will be auto deleted on driver detach. */ int devm_i2c_add_adapter(struct device *dev, struct i2c_adapter *adapter) { int ret; ret = i2c_add_adapter(adapter); if (ret) return ret; return devm_add_action_or_reset(dev, devm_i2c_del_adapter, adapter); } EXPORT_SYMBOL_GPL(devm_i2c_add_adapter); static int i2c_dev_or_parent_fwnode_match(struct device *dev, const void *data) { if (dev_fwnode(dev) == data) return 1; if (dev->parent && dev_fwnode(dev->parent) == data) return 1; return 0; } /** * i2c_find_adapter_by_fwnode() - find an i2c_adapter for the fwnode * @fwnode: &struct fwnode_handle corresponding to the &struct i2c_adapter * * Look up and return the &struct i2c_adapter corresponding to the @fwnode. * If no adapter can be found, or @fwnode is NULL, this returns NULL. * * The user must call put_device(&adapter->dev) once done with the i2c adapter. */ struct i2c_adapter *i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode) { struct i2c_adapter *adapter; struct device *dev; if (!fwnode) return NULL; dev = bus_find_device(&i2c_bus_type, NULL, fwnode, i2c_dev_or_parent_fwnode_match); if (!dev) return NULL; adapter = i2c_verify_adapter(dev); if (!adapter) put_device(dev); return adapter; } EXPORT_SYMBOL(i2c_find_adapter_by_fwnode); /** * i2c_get_adapter_by_fwnode() - find an i2c_adapter for the fwnode * @fwnode: &struct fwnode_handle corresponding to the &struct i2c_adapter * * Look up and return the &struct i2c_adapter corresponding to the @fwnode, * and increment the adapter module's use count. If no adapter can be found, * or @fwnode is NULL, this returns NULL. * * The user must call i2c_put_adapter(adapter) once done with the i2c adapter. * Note that this is different from i2c_find_adapter_by_node(). */ struct i2c_adapter *i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode) { struct i2c_adapter *adapter; adapter = i2c_find_adapter_by_fwnode(fwnode); if (!adapter) return NULL; if (!try_module_get(adapter->owner)) { put_device(&adapter->dev); adapter = NULL; } return adapter; } EXPORT_SYMBOL(i2c_get_adapter_by_fwnode); static void i2c_parse_timing(struct device *dev, char *prop_name, u32 *cur_val_p, u32 def_val, bool use_def) { int ret; ret = device_property_read_u32(dev, prop_name, cur_val_p); if (ret && use_def) *cur_val_p = def_val; dev_dbg(dev, "%s: %u\n", prop_name, *cur_val_p); } /** * i2c_parse_fw_timings - get I2C related timing parameters from firmware * @dev: The device to scan for I2C timing properties * @t: the i2c_timings struct to be filled with values * @use_defaults: bool to use sane defaults derived from the I2C specification * when properties are not found, otherwise don't update * * Scan the device for the generic I2C properties describing timing parameters * for the signal and fill the given struct with the results. If a property was * not found and use_defaults was true, then maximum timings are assumed which * are derived from the I2C specification. If use_defaults is not used, the * results will be as before, so drivers can apply their own defaults before * calling this helper. The latter is mainly intended for avoiding regressions * of existing drivers which want to switch to this function. New drivers * almost always should use the defaults. */ void i2c_parse_fw_timings(struct device *dev, struct i2c_timings *t, bool use_defaults) { bool u = use_defaults; u32 d; i2c_parse_timing(dev, "clock-frequency", &t->bus_freq_hz, I2C_MAX_STANDARD_MODE_FREQ, u); d = t->bus_freq_hz <= I2C_MAX_STANDARD_MODE_FREQ ? 1000 : t->bus_freq_hz <= I2C_MAX_FAST_MODE_FREQ ? 300 : 120; i2c_parse_timing(dev, "i2c-scl-rising-time-ns", &t->scl_rise_ns, d, u); d = t->bus_freq_hz <= I2C_MAX_FAST_MODE_FREQ ? 300 : 120; i2c_parse_timing(dev, "i2c-scl-falling-time-ns", &t->scl_fall_ns, d, u); i2c_parse_timing(dev, "i2c-scl-internal-delay-ns", &t->scl_int_delay_ns, 0, u); i2c_parse_timing(dev, "i2c-sda-falling-time-ns", &t->sda_fall_ns, t->scl_fall_ns, u); i2c_parse_timing(dev, "i2c-sda-hold-time-ns", &t->sda_hold_ns, 0, u); i2c_parse_timing(dev, "i2c-digital-filter-width-ns", &t->digital_filter_width_ns, 0, u); i2c_parse_timing(dev, "i2c-analog-filter-cutoff-frequency", &t->analog_filter_cutoff_freq_hz, 0, u); } EXPORT_SYMBOL_GPL(i2c_parse_fw_timings); /* ------------------------------------------------------------------------- */ int i2c_for_each_dev(void *data, int (*fn)(struct device *dev, void *data)) { int res; mutex_lock(&core_lock); res = bus_for_each_dev(&i2c_bus_type, NULL, data, fn); mutex_unlock(&core_lock); return res; } EXPORT_SYMBOL_GPL(i2c_for_each_dev); static int __process_new_driver(struct device *dev, void *data) { if (dev->type != &i2c_adapter_type) return 0; return i2c_do_add_adapter(data, to_i2c_adapter(dev)); } /* * An i2c_driver is used with one or more i2c_client (device) nodes to access * i2c slave chips, on a bus instance associated with some i2c_adapter. */ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) { int res; /* Can't register until after driver model init */ if (WARN_ON(!is_registered)) return -EAGAIN; /* add the driver to the list of i2c drivers in the driver core */ driver->driver.owner = owner; driver->driver.bus = &i2c_bus_type; INIT_LIST_HEAD(&driver->clients); /* When registration returns, the driver core * will have called probe() for all matching-but-unbound devices. */ res = driver_register(&driver->driver); if (res) return res; pr_debug("driver [%s] registered\n", driver->driver.name); /* Walk the adapters that are already present */ i2c_for_each_dev(driver, __process_new_driver); return 0; } EXPORT_SYMBOL(i2c_register_driver); static int __process_removed_driver(struct device *dev, void *data) { if (dev->type == &i2c_adapter_type) i2c_do_del_adapter(data, to_i2c_adapter(dev)); return 0; } /** * i2c_del_driver - unregister I2C driver * @driver: the driver being unregistered * Context: can sleep */ void i2c_del_driver(struct i2c_driver *driver) { i2c_for_each_dev(driver, __process_removed_driver); driver_unregister(&driver->driver); pr_debug("driver [%s] unregistered\n", driver->driver.name); } EXPORT_SYMBOL(i2c_del_driver); /* ------------------------------------------------------------------------- */ struct i2c_cmd_arg { unsigned cmd; void *arg; }; static int i2c_cmd(struct device *dev, void *_arg) { struct i2c_client *client = i2c_verify_client(dev); struct i2c_cmd_arg *arg = _arg; struct i2c_driver *driver; if (!client || !client->dev.driver) return 0; driver = to_i2c_driver(client->dev.driver); if (driver->command) driver->command(client, arg->cmd, arg->arg); return 0; } void i2c_clients_command(struct i2c_adapter *adap, unsigned int cmd, void *arg) { struct i2c_cmd_arg cmd_arg; cmd_arg.cmd = cmd; cmd_arg.arg = arg; device_for_each_child(&adap->dev, &cmd_arg, i2c_cmd); } EXPORT_SYMBOL(i2c_clients_command); static int __init i2c_init(void) { int retval; retval = of_alias_get_highest_id("i2c"); down_write(&__i2c_board_lock); if (retval >= __i2c_first_dynamic_bus_num) __i2c_first_dynamic_bus_num = retval + 1; up_write(&__i2c_board_lock); retval = bus_register(&i2c_bus_type); if (retval) return retval; is_registered = true; i2c_debugfs_root = debugfs_create_dir("i2c", NULL); #ifdef CONFIG_I2C_COMPAT i2c_adapter_compat_class = class_compat_register("i2c-adapter"); if (!i2c_adapter_compat_class) { retval = -ENOMEM; goto bus_err; } #endif retval = i2c_add_driver(&dummy_driver); if (retval) goto class_err; if (IS_ENABLED(CONFIG_OF_DYNAMIC)) WARN_ON(of_reconfig_notifier_register(&i2c_of_notifier)); if (IS_ENABLED(CONFIG_ACPI)) WARN_ON(acpi_reconfig_notifier_register(&i2c_acpi_notifier)); return 0; class_err: #ifdef CONFIG_I2C_COMPAT class_compat_unregister(i2c_adapter_compat_class); bus_err: #endif is_registered = false; bus_unregister(&i2c_bus_type); return retval; } static void __exit i2c_exit(void) { if (IS_ENABLED(CONFIG_ACPI)) WARN_ON(acpi_reconfig_notifier_unregister(&i2c_acpi_notifier)); if (IS_ENABLED(CONFIG_OF_DYNAMIC)) WARN_ON(of_reconfig_notifier_unregister(&i2c_of_notifier)); i2c_del_driver(&dummy_driver); #ifdef CONFIG_I2C_COMPAT class_compat_unregister(i2c_adapter_compat_class); #endif debugfs_remove_recursive(i2c_debugfs_root); bus_unregister(&i2c_bus_type); tracepoint_synchronize_unregister(); } /* We must initialize early, because some subsystems register i2c drivers * in subsys_initcall() code, but are linked (and initialized) before i2c. */ postcore_initcall(i2c_init); module_exit(i2c_exit); /* ---------------------------------------------------- * the functional interface to the i2c busses. * ---------------------------------------------------- */ /* Check if val is exceeding the quirk IFF quirk is non 0 */ #define i2c_quirk_exceeded(val, quirk) ((quirk) && ((val) > (quirk))) static int i2c_quirk_error(struct i2c_adapter *adap, struct i2c_msg *msg, char *err_msg) { dev_err_ratelimited(&adap->dev, "adapter quirk: %s (addr 0x%04x, size %u, %s)\n", err_msg, msg->addr, msg->len, msg->flags & I2C_M_RD ? "read" : "write"); return -EOPNOTSUPP; } static int i2c_check_for_quirks(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) { const struct i2c_adapter_quirks *q = adap->quirks; int max_num = q->max_num_msgs, i; bool do_len_check = true; if (q->flags & I2C_AQ_COMB) { max_num = 2; /* special checks for combined messages */ if (num == 2) { if (q->flags & I2C_AQ_COMB_WRITE_FIRST && msgs[0].flags & I2C_M_RD) return i2c_quirk_error(adap, &msgs[0], "1st comb msg must be write"); if (q->flags & I2C_AQ_COMB_READ_SECOND && !(msgs[1].flags & I2C_M_RD)) return i2c_quirk_error(adap, &msgs[1], "2nd comb msg must be read"); if (q->flags & I2C_AQ_COMB_SAME_ADDR && msgs[0].addr != msgs[1].addr) return i2c_quirk_error(adap, &msgs[0], "comb msg only to same addr"); if (i2c_quirk_exceeded(msgs[0].len, q->max_comb_1st_msg_len)) return i2c_quirk_error(adap, &msgs[0], "msg too long"); if (i2c_quirk_exceeded(msgs[1].len, q->max_comb_2nd_msg_len)) return i2c_quirk_error(adap, &msgs[1], "msg too long"); do_len_check = false; } } if (i2c_quirk_exceeded(num, max_num)) return i2c_quirk_error(adap, &msgs[0], "too many messages"); for (i = 0; i < num; i++) { u16 len = msgs[i].len; if (msgs[i].flags & I2C_M_RD) { if (do_len_check && i2c_quirk_exceeded(len, q->max_read_len)) return i2c_quirk_error(adap, &msgs[i], "msg too long"); if (q->flags & I2C_AQ_NO_ZERO_LEN_READ && len == 0) return i2c_quirk_error(adap, &msgs[i], "no zero length"); } else { if (do_len_check && i2c_quirk_exceeded(len, q->max_write_len)) return i2c_quirk_error(adap, &msgs[i], "msg too long"); if (q->flags & I2C_AQ_NO_ZERO_LEN_WRITE && len == 0) return i2c_quirk_error(adap, &msgs[i], "no zero length"); } } return 0; } /** * __i2c_transfer - unlocked flavor of i2c_transfer * @adap: Handle to I2C bus * @msgs: One or more messages to execute before STOP is issued to * terminate the operation; each message begins with a START. * @num: Number of messages to be executed. * * Returns negative errno, else the number of messages executed. * * Adapter lock must be held when calling this function. No debug logging * takes place. adap->algo->master_xfer existence isn't checked. */ int __i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) { unsigned long orig_jiffies; int ret, try; if (WARN_ON(!msgs || num < 1)) return -EINVAL; ret = __i2c_check_suspended(adap); if (ret) return ret; if (adap->quirks && i2c_check_for_quirks(adap, msgs, num)) return -EOPNOTSUPP; /* * i2c_trace_msg_key gets enabled when tracepoint i2c_transfer gets * enabled. This is an efficient way of keeping the for-loop from * being executed when not needed. */ if (static_branch_unlikely(&i2c_trace_msg_key)) { int i; for (i = 0; i < num; i++) if (msgs[i].flags & I2C_M_RD) trace_i2c_read(adap, &msgs[i], i); else trace_i2c_write(adap, &msgs[i], i); } /* Retry automatically on arbitration loss */ orig_jiffies = jiffies; for (ret = 0, try = 0; try <= adap->retries; try++) { if (i2c_in_atomic_xfer_mode() && adap->algo->master_xfer_atomic) ret = adap->algo->master_xfer_atomic(adap, msgs, num); else ret = adap->algo->master_xfer(adap, msgs, num); if (ret != -EAGAIN) break; if (time_after(jiffies, orig_jiffies + adap->timeout)) break; } if (static_branch_unlikely(&i2c_trace_msg_key)) { int i; for (i = 0; i < ret; i++) if (msgs[i].flags & I2C_M_RD) trace_i2c_reply(adap, &msgs[i], i); trace_i2c_result(adap, num, ret); } return ret; } EXPORT_SYMBOL(__i2c_transfer); /** * i2c_transfer - execute a single or combined I2C message * @adap: Handle to I2C bus * @msgs: One or more messages to execute before STOP is issued to * terminate the operation; each message begins with a START. * @num: Number of messages to be executed. * * Returns negative errno, else the number of messages executed. * * Note that there is no requirement that each message be sent to * the same slave address, although that is the most common model. */ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) { int ret; if (!adap->algo->master_xfer) { dev_dbg(&adap->dev, "I2C level transfers not supported\n"); return -EOPNOTSUPP; } /* REVISIT the fault reporting model here is weak: * * - When we get an error after receiving N bytes from a slave, * there is no way to report "N". * * - When we get a NAK after transmitting N bytes to a slave, * there is no way to report "N" ... or to let the master * continue executing the rest of this combined message, if * that's the appropriate response. * * - When for example "num" is two and we successfully complete * the first message but get an error part way through the * second, it's unclear whether that should be reported as * one (discarding status on the second message) or errno * (discarding status on the first one). */ ret = __i2c_lock_bus_helper(adap); if (ret) return ret; ret = __i2c_transfer(adap, msgs, num); i2c_unlock_bus(adap, I2C_LOCK_SEGMENT); return ret; } EXPORT_SYMBOL(i2c_transfer); /** * i2c_transfer_buffer_flags - issue a single I2C message transferring data * to/from a buffer * @client: Handle to slave device * @buf: Where the data is stored * @count: How many bytes to transfer, must be less than 64k since msg.len is u16 * @flags: The flags to be used for the message, e.g. I2C_M_RD for reads * * Returns negative errno, or else the number of bytes transferred. */ int i2c_transfer_buffer_flags(const struct i2c_client *client, char *buf, int count, u16 flags) { int ret; struct i2c_msg msg = { .addr = client->addr, .flags = flags | (client->flags & I2C_M_TEN), .len = count, .buf = buf, }; ret = i2c_transfer(client->adapter, &msg, 1); /* * If everything went ok (i.e. 1 msg transferred), return #bytes * transferred, else error code. */ return (ret == 1) ? count : ret; } EXPORT_SYMBOL(i2c_transfer_buffer_flags); /** * i2c_get_device_id - get manufacturer, part id and die revision of a device * @client: The device to query * @id: The queried information * * Returns negative errno on error, zero on success. */ int i2c_get_device_id(const struct i2c_client *client, struct i2c_device_identity *id) { struct i2c_adapter *adap = client->adapter; union i2c_smbus_data raw_id; int ret; if (!i2c_check_functionality(adap, I2C_FUNC_SMBUS_READ_I2C_BLOCK)) return -EOPNOTSUPP; raw_id.block[0] = 3; ret = i2c_smbus_xfer(adap, I2C_ADDR_DEVICE_ID, 0, I2C_SMBUS_READ, client->addr << 1, I2C_SMBUS_I2C_BLOCK_DATA, &raw_id); if (ret) return ret; id->manufacturer_id = (raw_id.block[1] << 4) | (raw_id.block[2] >> 4); id->part_id = ((raw_id.block[2] & 0xf) << 5) | (raw_id.block[3] >> 3); id->die_revision = raw_id.block[3] & 0x7; return 0; } EXPORT_SYMBOL_GPL(i2c_get_device_id); /** * i2c_client_get_device_id - get the driver match table entry of a device * @client: the device to query. The device must be bound to a driver * * Returns a pointer to the matching entry if found, NULL otherwise. */ const struct i2c_device_id *i2c_client_get_device_id(const struct i2c_client *client) { const struct i2c_driver *drv = to_i2c_driver(client->dev.driver); return i2c_match_id(drv->id_table, client); } EXPORT_SYMBOL_GPL(i2c_client_get_device_id); /* ---------------------------------------------------- * the i2c address scanning function * Will not work for 10-bit addresses! * ---------------------------------------------------- */ /* * Legacy default probe function, mostly relevant for SMBus. The default * probe method is a quick write, but it is known to corrupt the 24RF08 * EEPROMs due to a state machine bug, and could also irreversibly * write-protect some EEPROMs, so for address ranges 0x30-0x37 and 0x50-0x5f, * we use a short byte read instead. Also, some bus drivers don't implement * quick write, so we fallback to a byte read in that case too. * On x86, there is another special case for FSC hardware monitoring chips, * which want regular byte reads (address 0x73.) Fortunately, these are the * only known chips using this I2C address on PC hardware. * Returns 1 if probe succeeded, 0 if not. */ static int i2c_default_probe(struct i2c_adapter *adap, unsigned short addr) { int err; union i2c_smbus_data dummy; #ifdef CONFIG_X86 if (addr == 0x73 && (adap->class & I2C_CLASS_HWMON) && i2c_check_functionality(adap, I2C_FUNC_SMBUS_READ_BYTE_DATA)) err = i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_READ, 0, I2C_SMBUS_BYTE_DATA, &dummy); else #endif if (!((addr & ~0x07) == 0x30 || (addr & ~0x0f) == 0x50) && i2c_check_functionality(adap, I2C_FUNC_SMBUS_QUICK)) err = i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_WRITE, 0, I2C_SMBUS_QUICK, NULL); else if (i2c_check_functionality(adap, I2C_FUNC_SMBUS_READ_BYTE)) err = i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_READ, 0, I2C_SMBUS_BYTE, &dummy); else { dev_warn(&adap->dev, "No suitable probing method supported for address 0x%02X\n", addr); err = -EOPNOTSUPP; } return err >= 0; } static int i2c_detect_address(struct i2c_client *temp_client, struct i2c_driver *driver) { struct i2c_board_info info; struct i2c_adapter *adapter = temp_client->adapter; int addr = temp_client->addr; int err; /* Make sure the address is valid */ err = i2c_check_7bit_addr_validity_strict(addr); if (err) { dev_warn(&adapter->dev, "Invalid probe address 0x%02x\n", addr); return err; } /* Skip if already in use (7 bit, no need to encode flags) */ if (i2c_check_addr_busy(adapter, addr)) return 0; /* Make sure there is something at this address */ if (!i2c_default_probe(adapter, addr)) return 0; /* Finally call the custom detection function */ memset(&info, 0, sizeof(struct i2c_board_info)); info.addr = addr; err = driver->detect(temp_client, &info); if (err) { /* -ENODEV is returned if the detection fails. We catch it here as this isn't an error. */ return err == -ENODEV ? 0 : err; } /* Consistency check */ if (info.type[0] == '\0') { dev_err(&adapter->dev, "%s detection function provided no name for 0x%x\n", driver->driver.name, addr); } else { struct i2c_client *client; /* Detection succeeded, instantiate the device */ if (adapter->class & I2C_CLASS_DEPRECATED) dev_warn(&adapter->dev, "This adapter will soon drop class based instantiation of devices. " "Please make sure client 0x%02x gets instantiated by other means. " "Check 'Documentation/i2c/instantiating-devices.rst' for details.\n", info.addr); dev_dbg(&adapter->dev, "Creating %s at 0x%02x\n", info.type, info.addr); client = i2c_new_client_device(adapter, &info); if (!IS_ERR(client)) list_add_tail(&client->detected, &driver->clients); else dev_err(&adapter->dev, "Failed creating %s at 0x%02x\n", info.type, info.addr); } return 0; } static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver) { const unsigned short *address_list; struct i2c_client *temp_client; int i, err = 0; address_list = driver->address_list; if (!driver->detect || !address_list) return 0; /* Warn that the adapter lost class based instantiation */ if (adapter->class == I2C_CLASS_DEPRECATED) { dev_dbg(&adapter->dev, "This adapter dropped support for I2C classes and won't auto-detect %s devices anymore. " "If you need it, check 'Documentation/i2c/instantiating-devices.rst' for alternatives.\n", driver->driver.name); return 0; } /* Stop here if the classes do not match */ if (!(adapter->class & driver->class)) return 0; /* Set up a temporary client to help detect callback */ temp_client = kzalloc(sizeof(struct i2c_client), GFP_KERNEL); if (!temp_client) return -ENOMEM; temp_client->adapter = adapter; for (i = 0; address_list[i] != I2C_CLIENT_END; i += 1) { dev_dbg(&adapter->dev, "found normal entry for adapter %d, addr 0x%02x\n", i2c_adapter_id(adapter), address_list[i]); temp_client->addr = address_list[i]; err = i2c_detect_address(temp_client, driver); if (unlikely(err)) break; } kfree(temp_client); return err; } int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short addr) { return i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_READ, 0, I2C_SMBUS_QUICK, NULL) >= 0; } EXPORT_SYMBOL_GPL(i2c_probe_func_quick_read); struct i2c_client * i2c_new_scanned_device(struct i2c_adapter *adap, struct i2c_board_info *info, unsigned short const *addr_list, int (*probe)(struct i2c_adapter *adap, unsigned short addr)) { int i; if (!probe) probe = i2c_default_probe; for (i = 0; addr_list[i] != I2C_CLIENT_END; i++) { /* Check address validity */ if (i2c_check_7bit_addr_validity_strict(addr_list[i]) < 0) { dev_warn(&adap->dev, "Invalid 7-bit address 0x%02x\n", addr_list[i]); continue; } /* Check address availability (7 bit, no need to encode flags) */ if (i2c_check_addr_busy(adap, addr_list[i])) { dev_dbg(&adap->dev, "Address 0x%02x already in use, not probing\n", addr_list[i]); continue; } /* Test address responsiveness */ if (probe(adap, addr_list[i])) break; } if (addr_list[i] == I2C_CLIENT_END) { dev_dbg(&adap->dev, "Probing failed, no device found\n"); return ERR_PTR(-ENODEV); } info->addr = addr_list[i]; return i2c_new_client_device(adap, info); } EXPORT_SYMBOL_GPL(i2c_new_scanned_device); struct i2c_adapter *i2c_get_adapter(int nr) { struct i2c_adapter *adapter; mutex_lock(&core_lock); adapter = idr_find(&i2c_adapter_idr, nr); if (!adapter) goto exit; if (try_module_get(adapter->owner)) get_device(&adapter->dev); else adapter = NULL; exit: mutex_unlock(&core_lock); return adapter; } EXPORT_SYMBOL(i2c_get_adapter); void i2c_put_adapter(struct i2c_adapter *adap) { if (!adap) return; module_put(adap->owner); /* Should be last, otherwise we risk use-after-free with 'adap' */ put_device(&adap->dev); } EXPORT_SYMBOL(i2c_put_adapter); /** * i2c_get_dma_safe_msg_buf() - get a DMA safe buffer for the given i2c_msg * @msg: the message to be checked * @threshold: the minimum number of bytes for which using DMA makes sense. * Should at least be 1. * * Return: NULL if a DMA safe buffer was not obtained. Use msg->buf with PIO. * Or a valid pointer to be used with DMA. After use, release it by * calling i2c_put_dma_safe_msg_buf(). * * This function must only be called from process context! */ u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold) { /* also skip 0-length msgs for bogus thresholds of 0 */ if (!threshold) pr_debug("DMA buffer for addr=0x%02x with length 0 is bogus\n", msg->addr); if (msg->len < threshold || msg->len == 0) return NULL; if (msg->flags & I2C_M_DMA_SAFE) return msg->buf; pr_debug("using bounce buffer for addr=0x%02x, len=%d\n", msg->addr, msg->len); if (msg->flags & I2C_M_RD) return kzalloc(msg->len, GFP_KERNEL); else return kmemdup(msg->buf, msg->len, GFP_KERNEL); } EXPORT_SYMBOL_GPL(i2c_get_dma_safe_msg_buf); /** * i2c_put_dma_safe_msg_buf - release DMA safe buffer and sync with i2c_msg * @buf: the buffer obtained from i2c_get_dma_safe_msg_buf(). May be NULL. * @msg: the message which the buffer corresponds to * @xferred: bool saying if the message was transferred */ void i2c_put_dma_safe_msg_buf(u8 *buf, struct i2c_msg *msg, bool xferred) { if (!buf || buf == msg->buf) return; if (xferred && msg->flags & I2C_M_RD) memcpy(msg->buf, buf, msg->len); kfree(buf); } EXPORT_SYMBOL_GPL(i2c_put_dma_safe_msg_buf); MODULE_AUTHOR("Simon G. Vogl <simon@tk.uni-linz.ac.at>"); MODULE_DESCRIPTION("I2C-Bus main module"); MODULE_LICENSE("GPL"); |
3 3 3 3 2 2 4 4 4 3 2 1 2 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 | // SPDX-License-Identifier: GPL-2.0 /* * cfg80211 wext compat for managed mode. * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2009, 2020-2023 Intel Corporation */ #include <linux/export.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <net/cfg80211.h> #include <net/cfg80211-wext.h> #include "wext-compat.h" #include "nl80211.h" int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct cfg80211_cached_keys *ck = NULL; const u8 *prev_bssid = NULL; int err, i; ASSERT_RTNL(); lockdep_assert_wiphy(wdev->wiphy); if (!netif_running(wdev->netdev)) return 0; wdev->wext.connect.ie = wdev->wext.ie; wdev->wext.connect.ie_len = wdev->wext.ie_len; /* Use default background scan period */ wdev->wext.connect.bg_scan_period = -1; if (wdev->wext.keys) { wdev->wext.keys->def = wdev->wext.default_key; if (wdev->wext.default_key != -1) wdev->wext.connect.privacy = true; } if (!wdev->wext.connect.ssid_len) return 0; if (wdev->wext.keys && wdev->wext.keys->def != -1) { ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } if (wdev->wext.prev_bssid_valid) prev_bssid = wdev->wext.prev_bssid; err = cfg80211_connect(rdev, wdev->netdev, &wdev->wext.connect, ck, prev_bssid); if (err) kfree_sensitive(ck); return err; } int cfg80211_mgd_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *wextfreq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_channel *chan = NULL; int err, freq; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; if (freq) { chan = ieee80211_get_channel(wdev->wiphy, freq); if (!chan) return -EINVAL; if (chan->flags & IEEE80211_CHAN_DISABLED) return -EINVAL; } if (wdev->conn) { bool event = true; if (wdev->wext.connect.channel == chan) return 0; /* if SSID set, we'll try right again, avoid event */ if (wdev->wext.connect.ssid_len) event = false; err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, event); if (err) return err; } wdev->wext.connect.channel = chan; return cfg80211_mgd_wext_connect(rdev, wdev); } int cfg80211_mgd_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_channel *chan = NULL; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (wdev->valid_links) return -EOPNOTSUPP; if (wdev->links[0].client.current_bss) chan = wdev->links[0].client.current_bss->pub.channel; else if (wdev->wext.connect.channel) chan = wdev->wext.connect.channel; if (chan) { freq->m = chan->center_freq; freq->e = 6; return 0; } /* no channel if not joining */ return -EINVAL; } int cfg80211_mgd_wext_siwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); size_t len = data->length; int err; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (!data->flags) len = 0; /* iwconfig uses nul termination in SSID.. */ if (len > 0 && ssid[len - 1] == '\0') len--; if (wdev->conn) { bool event = true; if (wdev->wext.connect.ssid && len && len == wdev->wext.connect.ssid_len && memcmp(wdev->wext.connect.ssid, ssid, len) == 0) return 0; /* if SSID set now, we'll try to connect, avoid event */ if (len) event = false; err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, event); if (err) return err; } wdev->wext.prev_bssid_valid = false; wdev->wext.connect.ssid = wdev->wext.ssid; memcpy(wdev->wext.ssid, ssid, len); wdev->wext.connect.ssid_len = len; wdev->wext.connect.crypto.control_port = false; wdev->wext.connect.crypto.control_port_ethertype = cpu_to_be16(ETH_P_PAE); return cfg80211_mgd_wext_connect(rdev, wdev); } int cfg80211_mgd_wext_giwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; int ret = 0; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (wdev->valid_links) return -EINVAL; data->flags = 0; if (wdev->links[0].client.current_bss) { const struct element *ssid_elem; rcu_read_lock(); ssid_elem = ieee80211_bss_get_elem( &wdev->links[0].client.current_bss->pub, WLAN_EID_SSID); if (ssid_elem) { data->flags = 1; data->length = ssid_elem->datalen; if (data->length > IW_ESSID_MAX_SIZE) ret = -EINVAL; else memcpy(ssid, ssid_elem->data, data->length); } rcu_read_unlock(); } else if (wdev->wext.connect.ssid && wdev->wext.connect.ssid_len) { data->flags = 1; data->length = wdev->wext.connect.ssid_len; memcpy(ssid, wdev->wext.connect.ssid, data->length); } return ret; } int cfg80211_mgd_wext_siwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *bssid = ap_addr->sa_data; int err; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (ap_addr->sa_family != ARPHRD_ETHER) return -EINVAL; /* automatic mode */ if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid)) bssid = NULL; if (wdev->conn) { /* both automatic */ if (!bssid && !wdev->wext.connect.bssid) return 0; /* fixed already - and no change */ if (wdev->wext.connect.bssid && bssid && ether_addr_equal(bssid, wdev->wext.connect.bssid)) return 0; err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, false); if (err) return err; } if (bssid) { memcpy(wdev->wext.bssid, bssid, ETH_ALEN); wdev->wext.connect.bssid = wdev->wext.bssid; } else wdev->wext.connect.bssid = NULL; return cfg80211_mgd_wext_connect(rdev, wdev); } int cfg80211_mgd_wext_giwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; ap_addr->sa_family = ARPHRD_ETHER; if (wdev->valid_links) return -EOPNOTSUPP; if (wdev->links[0].client.current_bss) memcpy(ap_addr->sa_data, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN); else eth_zero_addr(ap_addr->sa_data); return 0; } int cfg80211_wext_siwgenie(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *ie = extra; int ie_len = data->length, err; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!ie_len) ie = NULL; wiphy_lock(wdev->wiphy); /* no change */ err = 0; if (wdev->wext.ie_len == ie_len && memcmp(wdev->wext.ie, ie, ie_len) == 0) goto out; if (ie_len) { ie = kmemdup(extra, ie_len, GFP_KERNEL); if (!ie) { err = -ENOMEM; goto out; } } else ie = NULL; kfree(wdev->wext.ie); wdev->wext.ie = ie; wdev->wext.ie_len = ie_len; if (wdev->conn) { err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, false); if (err) goto out; } /* userspace better not think we'll reconnect */ err = 0; out: wiphy_unlock(wdev->wiphy); return err; } int cfg80211_wext_siwmlme(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_mlme *mlme = (struct iw_mlme *)extra; struct cfg80211_registered_device *rdev; int err; if (!wdev) return -EOPNOTSUPP; rdev = wiphy_to_rdev(wdev->wiphy); if (wdev->iftype != NL80211_IFTYPE_STATION) return -EINVAL; if (mlme->addr.sa_family != ARPHRD_ETHER) return -EINVAL; wiphy_lock(&rdev->wiphy); switch (mlme->cmd) { case IW_MLME_DEAUTH: case IW_MLME_DISASSOC: err = cfg80211_disconnect(rdev, dev, mlme->reason_code, true); break; default: err = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return err; } |
515 769 1864 1863 1864 1864 5634 950 691 672 672 11 5633 5635 672 672 5634 5635 5635 983 5635 985 5634 5635 5634 2069 2069 2069 5635 5635 5635 5635 405 405 5635 1875 1865 1864 1865 5635 11 5635 5635 5636 3243 3242 5635 5636 5635 5635 54 54 54 11 11 11 11 230 230 230 229 229 229 230 5632 5631 320 5631 320 5631 54 5544 5632 5 5 5 5 5 5 531 529 529 530 200 531 64 64 63 531 266 266 155 3576 3577 3099 3577 3576 274 3575 3376 3377 3377 1500 1500 1500 1415 179 155 1412 3577 39 3576 5634 5618 5617 5619 1674 5616 5634 3577 3577 1174 1174 1174 1174 1174 1174 5634 5633 1592 5634 5616 5 5 3739 3738 3739 3738 3 3739 1402 3570 326 326 326 326 311 176 189 57 49 49 49 49 8 49 313 5635 5634 5632 5632 5632 61 60 60 15 60 61 5632 116 5632 5632 3575 3565 3430 3576 918 3564 3563 3576 3575 3575 3576 3576 3575 3574 308 307 3570 189 189 189 940 939 940 939 3574 3574 3573 3575 3575 3574 3575 3572 3569 3575 3570 184 3569 3575 1755 1755 1151 1750 1750 1629 1750 76 76 76 1749 1749 1755 1600 217 371 57 12 371 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 | // SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/transaction.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 * * Copyright 1998 Red Hat corp --- All Rights Reserved * * Generic filesystem transaction handling code; part of the ext2fs * journaling system. * * This file manages transactions (compound commits managed by the * journaling code) and handles (individual atomic operations by the * filesystem). */ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hrtimer.h> #include <linux/backing-dev.h> #include <linux/bug.h> #include <linux/module.h> #include <linux/sched/mm.h> #include <trace/events/jbd2.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); static void __jbd2_journal_unfile_buffer(struct journal_head *jh); static struct kmem_cache *transaction_cache; int __init jbd2_journal_init_transaction_cache(void) { J_ASSERT(!transaction_cache); transaction_cache = kmem_cache_create("jbd2_transaction_s", sizeof(transaction_t), 0, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, NULL); if (!transaction_cache) { pr_emerg("JBD2: failed to create transaction cache\n"); return -ENOMEM; } return 0; } void jbd2_journal_destroy_transaction_cache(void) { kmem_cache_destroy(transaction_cache); transaction_cache = NULL; } void jbd2_journal_free_transaction(transaction_t *transaction) { if (unlikely(ZERO_OR_NULL_PTR(transaction))) return; kmem_cache_free(transaction_cache, transaction); } /* * Base amount of descriptor blocks we reserve for each transaction. */ static int jbd2_descriptor_blocks_per_trans(journal_t *journal) { int tag_space = journal->j_blocksize - sizeof(journal_header_t); int tags_per_block; /* Subtract UUID */ tag_space -= 16; if (jbd2_journal_has_csum_v2or3(journal)) tag_space -= sizeof(struct jbd2_journal_block_tail); /* Commit code leaves a slack space of 16 bytes at the end of block */ tags_per_block = (tag_space - 16) / journal_tag_bytes(journal); /* * Revoke descriptors are accounted separately so we need to reserve * space for commit block and normal transaction descriptor blocks. */ return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers, tags_per_block); } /* * jbd2_get_transaction: obtain a new transaction_t object. * * Simply initialise a new transaction. Initialize it in * RUNNING state and add it to the current journal (which should not * have an existing running transaction: we only make a new transaction * once we have started to commit the old one). * * Preconditions: * The journal MUST be locked. We don't perform atomic mallocs on the * new transaction and we can't block without protecting against other * processes trying to touch the journal while it is in transition. * */ static void jbd2_get_transaction(journal_t *journal, transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; transaction->t_start_time = ktime_get(); transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; atomic_set(&transaction->t_updates, 0); atomic_set(&transaction->t_outstanding_credits, jbd2_descriptor_blocks_per_trans(journal) + atomic_read(&journal->j_reserved_credits)); atomic_set(&transaction->t_outstanding_revokes, 0); atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); add_timer(&journal->j_commit_timer); J_ASSERT(journal->j_running_transaction == NULL); journal->j_running_transaction = transaction; transaction->t_max_wait = 0; transaction->t_start = jiffies; transaction->t_requested = 0; } /* * Handle management. * * A handle_t is an object which represents a single atomic update to a * filesystem, and which tracks all of the modifications which form part * of that one update. */ /* * Update transaction's maximum wait time, if debugging is enabled. * * t_max_wait is carefully updated here with use of atomic compare exchange. * Note that there could be multiplre threads trying to do this simultaneously * hence using cmpxchg to avoid any use of locks in this case. * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug. */ static inline void update_t_max_wait(transaction_t *transaction, unsigned long ts) { unsigned long oldts, newts; if (time_after(transaction->t_start, ts)) { newts = jbd2_time_diff(ts, transaction->t_start); oldts = READ_ONCE(transaction->t_max_wait); while (oldts < newts) oldts = cmpxchg(&transaction->t_max_wait, oldts, newts); } } /* * Wait until running transaction passes to T_FLUSH state and new transaction * can thus be started. Also starts the commit if needed. The function expects * running transaction to exist and releases j_state_lock. */ static void wait_transaction_locked(journal_t *journal) __releases(journal->j_state_lock) { DEFINE_WAIT(wait); int need_to_start; tid_t tid = journal->j_running_transaction->t_tid; prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); jbd2_might_wait_for_commit(journal); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); } /* * Wait until running transaction transitions from T_SWITCH to T_FLUSH * state and new transaction can thus be started. The function releases * j_state_lock. */ static void wait_transaction_switching(journal_t *journal) __releases(journal->j_state_lock) { DEFINE_WAIT(wait); if (WARN_ON(!journal->j_running_transaction || journal->j_running_transaction->t_state != T_SWITCH)) { read_unlock(&journal->j_state_lock); return; } prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); read_unlock(&journal->j_state_lock); /* * We don't call jbd2_might_wait_for_commit() here as there's no * waiting for outstanding handles happening anymore in T_SWITCH state * and handling of reserved handles actually relies on that for * correctness. */ schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); } static void sub_reserved_credits(journal_t *journal, int blocks) { atomic_sub(blocks, &journal->j_reserved_credits); wake_up(&journal->j_wait_reserved); } /* * Wait until we can add credits for handle to the running transaction. Called * with j_state_lock held for reading. Returns 0 if handle joined the running * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and * caller must retry. * * Note: because j_state_lock may be dropped depending on the return * value, we need to fake out sparse so ti doesn't complain about a * locking imbalance. Callers of add_transaction_credits will need to * make a similar accomodation. */ static int add_transaction_credits(journal_t *journal, int blocks, int rsv_blocks) __must_hold(&journal->j_state_lock) { transaction_t *t = journal->j_running_transaction; int needed; int total = blocks + rsv_blocks; /* * If the current transaction is locked down for commit, wait * for the lock to be released. */ if (t->t_state != T_RUNNING) { WARN_ON_ONCE(t->t_state >= T_FLUSH); wait_transaction_locked(journal); __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } /* * If there is not enough space left in the log to write all * potential buffers requested by this operation, we need to * stall pending a log checkpoint to free some more log space. */ needed = atomic_add_return(total, &t->t_outstanding_credits); if (needed > journal->j_max_transaction_buffers) { /* * If the current transaction is already too large, * then start to commit it: we can then go back and * attach this handle to a new transaction. */ atomic_sub(total, &t->t_outstanding_credits); /* * Is the number of reserved credits in the current transaction too * big to fit this handle? Wait until reserved credits are freed. */ if (atomic_read(&journal->j_reserved_credits) + total > journal->j_max_transaction_buffers) { read_unlock(&journal->j_state_lock); jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + total <= journal->j_max_transaction_buffers); __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } wait_transaction_locked(journal); __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } /* * The commit code assumes that it can get enough log space * without forcing a checkpoint. This is *critical* for * correctness: a checkpoint of a buffer which is also * associated with a committing transaction creates a deadlock, * so commit simply cannot force through checkpoints. * * We must therefore ensure the necessary space in the journal * *before* starting to dirty potentially checkpointed buffers * in the new transaction. */ if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) { atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); jbd2_might_wait_for_commit(journal); write_lock(&journal->j_state_lock); if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) __jbd2_log_wait_for_space(journal); write_unlock(&journal->j_state_lock); __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } /* No reservation? We are done... */ if (!rsv_blocks) return 0; needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits); /* We allow at most half of a transaction to be reserved */ if (needed > journal->j_max_transaction_buffers / 2) { sub_reserved_credits(journal, rsv_blocks); atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + rsv_blocks <= journal->j_max_transaction_buffers / 2); __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } return 0; } /* * start_this_handle: Given a handle, deal with any locking or stalling * needed to make sure that there is enough journal space for the handle * to begin. Attach the handle to a transaction and set up the * transaction's buffer credits. */ static int start_this_handle(journal_t *journal, handle_t *handle, gfp_t gfp_mask) { transaction_t *transaction, *new_transaction = NULL; int blocks = handle->h_total_credits; int rsv_blocks = 0; unsigned long ts = jiffies; if (handle->h_rsv_handle) rsv_blocks = handle->h_rsv_handle->h_total_credits; /* * Limit the number of reserved credits to 1/2 of maximum transaction * size and limit the number of total credits to not exceed maximum * transaction size per operation. */ if ((rsv_blocks > journal->j_max_transaction_buffers / 2) || (rsv_blocks + blocks > journal->j_max_transaction_buffers)) { printk(KERN_ERR "JBD2: %s wants too many credits " "credits:%d rsv_credits:%d max:%d\n", current->comm, blocks, rsv_blocks, journal->j_max_transaction_buffers); WARN_ON(1); return -ENOSPC; } alloc_transaction: /* * This check is racy but it is just an optimization of allocating new * transaction early if there are high chances we'll need it. If we * guess wrong, we'll retry or free unused transaction. */ if (!data_race(journal->j_running_transaction)) { /* * If __GFP_FS is not present, then we may be being called from * inside the fs writeback layer, so we MUST NOT fail. */ if ((gfp_mask & __GFP_FS) == 0) gfp_mask |= __GFP_NOFAIL; new_transaction = kmem_cache_zalloc(transaction_cache, gfp_mask); if (!new_transaction) return -ENOMEM; } jbd2_debug(3, "New handle %p going live.\n", handle); /* * We need to hold j_state_lock until t_updates has been incremented, * for proper journal barrier handling */ repeat: read_lock(&journal->j_state_lock); BUG_ON(journal->j_flags & JBD2_UNMOUNT); if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { read_unlock(&journal->j_state_lock); jbd2_journal_free_transaction(new_transaction); return -EROFS; } /* * Wait on the journal's transaction barrier if necessary. Specifically * we allow reserved handles to proceed because otherwise commit could * deadlock on page writeback not being able to complete. */ if (!handle->h_reserved && journal->j_barrier_count) { read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_transaction_locked, journal->j_barrier_count == 0); goto repeat; } if (!journal->j_running_transaction) { read_unlock(&journal->j_state_lock); if (!new_transaction) goto alloc_transaction; write_lock(&journal->j_state_lock); if (!journal->j_running_transaction && (handle->h_reserved || !journal->j_barrier_count)) { jbd2_get_transaction(journal, new_transaction); new_transaction = NULL; } write_unlock(&journal->j_state_lock); goto repeat; } transaction = journal->j_running_transaction; if (!handle->h_reserved) { /* We may have dropped j_state_lock - restart in that case */ if (add_transaction_credits(journal, blocks, rsv_blocks)) { /* * add_transaction_credits releases * j_state_lock on a non-zero return */ __release(&journal->j_state_lock); goto repeat; } } else { /* * We have handle reserved so we are allowed to join T_LOCKED * transaction and we don't have to check for transaction size * and journal space. But we still have to wait while running * transaction is being switched to a committing one as it * won't wait for any handles anymore. */ if (transaction->t_state == T_SWITCH) { wait_transaction_switching(journal); goto repeat; } sub_reserved_credits(journal, blocks); handle->h_reserved = 0; } /* OK, account for the buffers that this operation expects to * use and add the handle to the running transaction. */ update_t_max_wait(transaction, ts); handle->h_transaction = transaction; handle->h_requested_credits = blocks; handle->h_revoke_credits_requested = handle->h_revoke_credits; handle->h_start_jiffies = jiffies; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n", handle, blocks, atomic_read(&transaction->t_outstanding_credits), jbd2_log_space_left(journal)); read_unlock(&journal->j_state_lock); current->journal_info = handle; rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); /* * Ensure that no allocations done while the transaction is open are * going to recurse back to the fs layer. */ handle->saved_alloc_context = memalloc_nofs_save(); return 0; } /* Allocate a new handle. This should probably be in a slab... */ static handle_t *new_handle(int nblocks) { handle_t *handle = jbd2_alloc_handle(GFP_NOFS); if (!handle) return NULL; handle->h_total_credits = nblocks; handle->h_ref = 1; return handle; } handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, int revoke_records, gfp_t gfp_mask, unsigned int type, unsigned int line_no) { handle_t *handle = journal_current_handle(); int err; if (!journal) return ERR_PTR(-EROFS); if (handle) { J_ASSERT(handle->h_transaction->t_journal == journal); handle->h_ref++; return handle; } nblocks += DIV_ROUND_UP(revoke_records, journal->j_revoke_records_per_block); handle = new_handle(nblocks); if (!handle) return ERR_PTR(-ENOMEM); if (rsv_blocks) { handle_t *rsv_handle; rsv_handle = new_handle(rsv_blocks); if (!rsv_handle) { jbd2_free_handle(handle); return ERR_PTR(-ENOMEM); } rsv_handle->h_reserved = 1; rsv_handle->h_journal = journal; handle->h_rsv_handle = rsv_handle; } handle->h_revoke_credits = revoke_records; err = start_this_handle(journal, handle, gfp_mask); if (err < 0) { if (handle->h_rsv_handle) jbd2_free_handle(handle->h_rsv_handle); jbd2_free_handle(handle); return ERR_PTR(err); } handle->h_type = type; handle->h_line_no = line_no; trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, line_no, nblocks); return handle; } EXPORT_SYMBOL(jbd2__journal_start); /** * jbd2_journal_start() - Obtain a new handle. * @journal: Journal to start transaction on. * @nblocks: number of block buffer we might modify * * We make sure that the transaction can guarantee at least nblocks of * modified buffers in the log. We block until the log can guarantee * that much space. Additionally, if rsv_blocks > 0, we also create another * handle with rsv_blocks reserved blocks in the journal. This handle is * stored in h_rsv_handle. It is not attached to any particular transaction * and thus doesn't block transaction commit. If the caller uses this reserved * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() * on the parent handle will dispose the reserved one. Reserved handle has to * be converted to a normal handle using jbd2_journal_start_reserved() before * it can be used. * * Return a pointer to a newly allocated handle, or an ERR_PTR() value * on failure. */ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0); } EXPORT_SYMBOL(jbd2_journal_start); static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t) { journal_t *journal = handle->h_journal; WARN_ON(!handle->h_reserved); sub_reserved_credits(journal, handle->h_total_credits); if (t) atomic_sub(handle->h_total_credits, &t->t_outstanding_credits); } void jbd2_journal_free_reserved(handle_t *handle) { journal_t *journal = handle->h_journal; /* Get j_state_lock to pin running transaction if it exists */ read_lock(&journal->j_state_lock); __jbd2_journal_unreserve_handle(handle, journal->j_running_transaction); read_unlock(&journal->j_state_lock); jbd2_free_handle(handle); } EXPORT_SYMBOL(jbd2_journal_free_reserved); /** * jbd2_journal_start_reserved() - start reserved handle * @handle: handle to start * @type: for handle statistics * @line_no: for handle statistics * * Start handle that has been previously reserved with jbd2_journal_reserve(). * This attaches @handle to the running transaction (or creates one if there's * not transaction running). Unlike jbd2_journal_start() this function cannot * block on journal commit, checkpointing, or similar stuff. It can block on * memory allocation or frozen journal though. * * Return 0 on success, non-zero on error - handle is freed in that case. */ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, unsigned int line_no) { journal_t *journal = handle->h_journal; int ret = -EIO; if (WARN_ON(!handle->h_reserved)) { /* Someone passed in normal handle? Just stop it. */ jbd2_journal_stop(handle); return ret; } /* * Usefulness of mixing of reserved and unreserved handles is * questionable. So far nobody seems to need it so just error out. */ if (WARN_ON(current->journal_info)) { jbd2_journal_free_reserved(handle); return ret; } handle->h_journal = NULL; /* * GFP_NOFS is here because callers are likely from writeback or * similarly constrained call sites */ ret = start_this_handle(journal, handle, GFP_NOFS); if (ret < 0) { handle->h_journal = journal; jbd2_journal_free_reserved(handle); return ret; } handle->h_type = type; handle->h_line_no = line_no; trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, line_no, handle->h_total_credits); return 0; } EXPORT_SYMBOL(jbd2_journal_start_reserved); /** * jbd2_journal_extend() - extend buffer credits. * @handle: handle to 'extend' * @nblocks: nr blocks to try to extend by. * @revoke_records: number of revoke records to try to extend by. * * Some transactions, such as large extends and truncates, can be done * atomically all at once or in several stages. The operation requests * a credit for a number of buffer modifications in advance, but can * extend its credit if it needs more. * * jbd2_journal_extend tries to give the running handle more buffer credits. * It does not guarantee that allocation - this is a best-effort only. * The calling process MUST be able to deal cleanly with a failure to * extend here. * * Return 0 on success, non-zero on failure. * * return code < 0 implies an error * return code > 0 implies normal transaction-full status. */ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) { transaction_t *transaction = handle->h_transaction; journal_t *journal; int result; int wanted; if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; result = 1; read_lock(&journal->j_state_lock); /* Don't extend a locked-down transaction! */ if (transaction->t_state != T_RUNNING) { jbd2_debug(3, "denied handle %p %d blocks: " "transaction not running\n", handle, nblocks); goto error_out; } nblocks += DIV_ROUND_UP( handle->h_revoke_credits_requested + revoke_records, journal->j_revoke_records_per_block) - DIV_ROUND_UP( handle->h_revoke_credits_requested, journal->j_revoke_records_per_block); wanted = atomic_add_return(nblocks, &transaction->t_outstanding_credits); if (wanted > journal->j_max_transaction_buffers) { jbd2_debug(3, "denied handle %p %d blocks: " "transaction too large\n", handle, nblocks); atomic_sub(nblocks, &transaction->t_outstanding_credits); goto error_out; } trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, transaction->t_tid, handle->h_type, handle->h_line_no, handle->h_total_credits, nblocks); handle->h_total_credits += nblocks; handle->h_requested_credits += nblocks; handle->h_revoke_credits += revoke_records; handle->h_revoke_credits_requested += revoke_records; result = 0; jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks); error_out: read_unlock(&journal->j_state_lock); return result; } static void stop_this_handle(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; int revokes; J_ASSERT(journal_current_handle() == handle); J_ASSERT(atomic_read(&transaction->t_updates) > 0); current->journal_info = NULL; /* * Subtract necessary revoke descriptor blocks from handle credits. We * take care to account only for revoke descriptor blocks the * transaction will really need as large sequences of transactions with * small numbers of revokes are relatively common. */ revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits; if (revokes) { int t_revokes, revoke_descriptors; int rr_per_blk = journal->j_revoke_records_per_block; WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk) > handle->h_total_credits); t_revokes = atomic_add_return(revokes, &transaction->t_outstanding_revokes); revoke_descriptors = DIV_ROUND_UP(t_revokes, rr_per_blk) - DIV_ROUND_UP(t_revokes - revokes, rr_per_blk); handle->h_total_credits -= revoke_descriptors; } atomic_sub(handle->h_total_credits, &transaction->t_outstanding_credits); if (handle->h_rsv_handle) __jbd2_journal_unreserve_handle(handle->h_rsv_handle, transaction); if (atomic_dec_and_test(&transaction->t_updates)) wake_up(&journal->j_wait_updates); rwsem_release(&journal->j_trans_commit_map, _THIS_IP_); /* * Scope of the GFP_NOFS context is over here and so we can restore the * original alloc context. */ memalloc_nofs_restore(handle->saved_alloc_context); } /** * jbd2__journal_restart() - restart a handle . * @handle: handle to restart * @nblocks: nr credits requested * @revoke_records: number of revoke record credits requested * @gfp_mask: memory allocation flags (for start_this_handle) * * Restart a handle for a multi-transaction filesystem * operation. * * If the jbd2_journal_extend() call above fails to grant new buffer credits * to a running handle, a call to jbd2_journal_restart will commit the * handle's transaction so far and reattach the handle to a new * transaction capable of guaranteeing the requested number of * credits. We preserve reserved handle if there's any attached to the * passed in handle. */ int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records, gfp_t gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal; tid_t tid; int need_to_start; int ret; /* If we've had an abort of any type, don't even think about * actually doing the restart! */ if (is_handle_aborted(handle)) return 0; journal = transaction->t_journal; tid = transaction->t_tid; /* * First unlink the handle from its current transaction, and start the * commit on that. */ jbd2_debug(2, "restarting handle %p\n", handle); stop_this_handle(handle); handle->h_transaction = NULL; /* * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can * get rid of pointless j_state_lock traffic like this. */ read_lock(&journal->j_state_lock); need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); handle->h_total_credits = nblocks + DIV_ROUND_UP(revoke_records, journal->j_revoke_records_per_block); handle->h_revoke_credits = revoke_records; ret = start_this_handle(journal, handle, gfp_mask); trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev, ret ? 0 : handle->h_transaction->t_tid, handle->h_type, handle->h_line_no, handle->h_total_credits); return ret; } EXPORT_SYMBOL(jbd2__journal_restart); int jbd2_journal_restart(handle_t *handle, int nblocks) { return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS); } EXPORT_SYMBOL(jbd2_journal_restart); /* * Waits for any outstanding t_updates to finish. * This is called with write j_state_lock held. */ void jbd2_journal_wait_updates(journal_t *journal) { DEFINE_WAIT(wait); while (1) { /* * Note that the running transaction can get freed under us if * this transaction is getting committed in * jbd2_journal_commit_transaction() -> * jbd2_journal_free_transaction(). This can only happen when we * release j_state_lock -> schedule() -> acquire j_state_lock. * Hence we should everytime retrieve new j_running_transaction * value (after j_state_lock release acquire cycle), else it may * lead to use-after-free of old freed transaction. */ transaction_t *transaction = journal->j_running_transaction; if (!transaction) break; prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); if (!atomic_read(&transaction->t_updates)) { finish_wait(&journal->j_wait_updates, &wait); break; } write_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_updates, &wait); write_lock(&journal->j_state_lock); } } /** * jbd2_journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. * * This locks out any further updates from being started, and blocks * until all existing updates have completed, returning only once the * journal is in a quiescent state with no updates running. * * The journal lock should not be held on entry. */ void jbd2_journal_lock_updates(journal_t *journal) { jbd2_might_wait_for_commit(journal); write_lock(&journal->j_state_lock); ++journal->j_barrier_count; /* Wait until there are no reserved handles */ if (atomic_read(&journal->j_reserved_credits)) { write_unlock(&journal->j_state_lock); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) == 0); write_lock(&journal->j_state_lock); } /* Wait until there are no running t_updates */ jbd2_journal_wait_updates(journal); write_unlock(&journal->j_state_lock); /* * We have now established a barrier against other normal updates, but * we also need to barrier against other jbd2_journal_lock_updates() calls * to make sure that we serialise special journal-locked operations * too. */ mutex_lock(&journal->j_barrier); } /** * jbd2_journal_unlock_updates () - release barrier * @journal: Journal to release the barrier on. * * Release a transaction barrier obtained with jbd2_journal_lock_updates(). * * Should be called without the journal lock held. */ void jbd2_journal_unlock_updates (journal_t *journal) { J_ASSERT(journal->j_barrier_count != 0); mutex_unlock(&journal->j_barrier); write_lock(&journal->j_state_lock); --journal->j_barrier_count; write_unlock(&journal->j_state_lock); wake_up_all(&journal->j_wait_transaction_locked); } static void warn_dirty_buffer(struct buffer_head *bh) { printk(KERN_WARNING "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). " "There's a risk of filesystem corruption in case of system " "crash.\n", bh->b_bdev, (unsigned long long)bh->b_blocknr); } /* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */ static void jbd2_freeze_jh_data(struct journal_head *jh) { char *source; struct buffer_head *bh = jh2bh(jh); J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n"); source = kmap_local_folio(bh->b_folio, bh_offset(bh)); /* Fire data frozen trigger just before we copy the data */ jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers); memcpy(jh->b_frozen_data, source, bh->b_size); kunmap_local(source); /* * Now that the frozen data is saved off, we need to store any matching * triggers. */ jh->b_frozen_triggers = jh->b_triggers; } /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior * transaction which we are still committing to disk, then we need to * make sure that we do not overwrite the old copy: we do copy-out to * preserve the copy going to disk. We also account the buffer against * the handle's metadata buffer credits (unless the buffer is already * part of the transaction, that is). * */ static int do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) { struct buffer_head *bh; transaction_t *transaction = handle->h_transaction; journal_t *journal; int error; char *frozen_buffer = NULL; unsigned long start_lock, time_lock; journal = transaction->t_journal; jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: bh = jh2bh(jh); /* @@@ Need to check for errors here at some point. */ start_lock = jiffies; lock_buffer(bh); spin_lock(&jh->b_state_lock); /* If it takes too long to lock the buffer, trace it */ time_lock = jbd2_time_diff(start_lock, jiffies); if (time_lock > HZ/10) trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev, jiffies_to_msecs(time_lock)); /* We now hold the buffer lock so it is safe to query the buffer * state. Is the buffer dirty? * * If so, there are two possibilities. The buffer may be * non-journaled, and undergoing a quite legitimate writeback. * Otherwise, it is journaled, and we don't expect dirty buffers * in that state (the buffers should be marked JBD_Dirty * instead.) So either the IO is being done under our own * control and this is a bug, or it's a third party IO such as * dump(8) (which may leave the buffer scheduled for read --- * ie. locked but not dirty) or tune2fs (which may actually have * the buffer dirtied, ugh.) */ if (buffer_dirty(bh) && jh->b_transaction) { warn_dirty_buffer(bh); /* * We need to clean the dirty flag and we must do it under the * buffer lock to be sure we don't race with running write-out. */ JBUFFER_TRACE(jh, "Journalling dirty buffer"); clear_buffer_dirty(bh); /* * The buffer is going to be added to BJ_Reserved list now and * nothing guarantees jbd2_journal_dirty_metadata() will be * ever called for it. So we need to set jbddirty bit here to * make sure the buffer is dirtied and written out when the * journaling machinery is done with it. */ set_buffer_jbddirty(bh); } error = -EROFS; if (is_handle_aborted(handle)) { spin_unlock(&jh->b_state_lock); unlock_buffer(bh); goto out; } error = 0; /* * The buffer is already part of this transaction if b_transaction or * b_next_transaction points to it */ if (jh->b_transaction == transaction || jh->b_next_transaction == transaction) { unlock_buffer(bh); goto done; } /* * this is the first time this transaction is touching this buffer, * reset the modified flag */ jh->b_modified = 0; /* * If the buffer is not journaled right now, we need to make sure it * doesn't get written to disk before the caller actually commits the * new data */ if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); JBUFFER_TRACE(jh, "file as BJ_Reserved"); /* * Make sure all stores to jh (b_modified, b_frozen_data) are * visible before attaching it to the running transaction. * Paired with barrier in jbd2_write_access_granted() */ smp_wmb(); spin_lock(&journal->j_list_lock); if (test_clear_buffer_dirty(bh)) { /* * Execute buffer dirty clearing and jh->b_transaction * assignment under journal->j_list_lock locked to * prevent bh being removed from checkpoint list if * the buffer is in an intermediate state (not dirty * and jh->b_transaction is NULL). */ JBUFFER_TRACE(jh, "Journalling dirty buffer"); set_buffer_jbddirty(bh); } __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); spin_unlock(&journal->j_list_lock); unlock_buffer(bh); goto done; } unlock_buffer(bh); /* * If there is already a copy-out version of this buffer, then we don't * need to make another one */ if (jh->b_frozen_data) { JBUFFER_TRACE(jh, "has frozen data"); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); goto attach_next; } JBUFFER_TRACE(jh, "owned by older transaction"); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction); /* * There is one case we have to be very careful about. If the * committing transaction is currently writing this buffer out to disk * and has NOT made a copy-out, then we cannot modify the buffer * contents at all right now. The essence of copy-out is that it is * the extra copy, not the primary copy, which gets journaled. If the * primary copy is already going to disk then we cannot do copy-out * here. */ if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); spin_unlock(&jh->b_state_lock); wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); goto repeat; } /* * Only do the copy if the currently-owning transaction still needs it. * If buffer isn't on BJ_Metadata list, the committing transaction is * past that stage (here we use the fact that BH_Shadow is set under * bh_state lock together with refiling to BJ_Shadow list and at this * point we know the buffer doesn't have BH_Shadow set). * * Subtle point, though: if this is a get_undo_access, then we will be * relying on the frozen_data to contain the new value of the * committed_data record after the transaction, so we HAVE to force the * frozen_data copy in that case. */ if (jh->b_jlist == BJ_Metadata || force_copy) { JBUFFER_TRACE(jh, "generate frozen data"); if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); spin_unlock(&jh->b_state_lock); frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS | __GFP_NOFAIL); goto repeat; } jh->b_frozen_data = frozen_buffer; frozen_buffer = NULL; jbd2_freeze_jh_data(jh); } attach_next: /* * Make sure all stores to jh (b_modified, b_frozen_data) are visible * before attaching it to the running transaction. Paired with barrier * in jbd2_write_access_granted() */ smp_wmb(); jh->b_next_transaction = transaction; done: spin_unlock(&jh->b_state_lock); /* * If we are about to journal a buffer, then any revoke pending on it is * no longer valid */ jbd2_journal_cancel_revoke(handle, jh); out: if (unlikely(frozen_buffer)) /* It's usually NULL */ jbd2_free(frozen_buffer, bh->b_size); JBUFFER_TRACE(jh, "exit"); return error; } /* Fast check whether buffer is already attached to the required transaction */ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh, bool undo) { struct journal_head *jh; bool ret = false; /* Dirty buffers require special handling... */ if (buffer_dirty(bh)) return false; /* * RCU protects us from dereferencing freed pages. So the checks we do * are guaranteed not to oops. However the jh slab object can get freed * & reallocated while we work with it. So we have to be careful. When * we see jh attached to the running transaction, we know it must stay * so until the transaction is committed. Thus jh won't be freed and * will be attached to the same bh while we run. However it can * happen jh gets freed, reallocated, and attached to the transaction * just after we get pointer to it from bh. So we have to be careful * and recheck jh still belongs to our bh before we return success. */ rcu_read_lock(); if (!buffer_jbd(bh)) goto out; /* This should be bh2jh() but that doesn't work with inline functions */ jh = READ_ONCE(bh->b_private); if (!jh) goto out; /* For undo access buffer must have data copied */ if (undo && !jh->b_committed_data) goto out; if (READ_ONCE(jh->b_transaction) != handle->h_transaction && READ_ONCE(jh->b_next_transaction) != handle->h_transaction) goto out; /* * There are two reasons for the barrier here: * 1) Make sure to fetch b_bh after we did previous checks so that we * detect when jh went through free, realloc, attach to transaction * while we were checking. Paired with implicit barrier in that path. * 2) So that access to bh done after jbd2_write_access_granted() * doesn't get reordered and see inconsistent state of concurrent * do_get_write_access(). */ smp_mb(); if (unlikely(jh->b_bh != bh)) goto out; ret = true; out: rcu_read_unlock(); return ret; } /** * jbd2_journal_get_write_access() - notify intent to modify a buffer * for metadata (not data) update. * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes * * Returns: error code or 0 on success. * * In full data journalling mode the buffer may be of type BJ_AsyncData, * because we're ``write()ing`` a buffer which is also part of a shared mapping. */ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { struct journal_head *jh; journal_t *journal; int rc; if (is_handle_aborted(handle)) return -EROFS; journal = handle->h_transaction->t_journal; if (jbd2_check_fs_dev_write_error(journal)) { /* * If the fs dev has writeback errors, it may have failed * to async write out metadata buffers in the background. * In this case, we could read old data from disk and write * it out again, which may lead to on-disk filesystem * inconsistency. Aborting journal can avoid it happen. */ jbd2_journal_abort(journal, -EIO); return -EIO; } if (jbd2_write_access_granted(handle, bh, false)) return 0; jh = jbd2_journal_add_journal_head(bh); /* We do not want to get caught playing with fields which the * log thread also manipulates. Make sure that the buffer * completes any outstanding IO before proceeding. */ rc = do_get_write_access(handle, jh, 0); jbd2_journal_put_journal_head(jh); return rc; } /* * When the user wants to journal a newly created buffer_head * (ie. getblk() returned a new buffer and we are going to populate it * manually rather than reading off disk), then we need to keep the * buffer_head locked until it has been completely filled with new * data. In this case, we should be able to make the assertion that * the bh is not already part of an existing transaction. * * The buffer should already be locked by the caller by this point. * There is no lock ranking violation: it was a newly created, * unlocked buffer beforehand. */ /** * jbd2_journal_get_create_access () - notify intent to use newly created bh * @handle: transaction to new buffer to * @bh: new buffer. * * Call this if you create a new bh. */ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal; struct journal_head *jh = jbd2_journal_add_journal_head(bh); int err; jbd2_debug(5, "journal_head %p\n", jh); err = -EROFS; if (is_handle_aborted(handle)) goto out; journal = transaction->t_journal; err = 0; JBUFFER_TRACE(jh, "entry"); /* * The buffer may already belong to this transaction due to pre-zeroing * in the filesystem's new_block code. It may also be on the previous, * committing transaction's lists, but it HAS to be in Forget state in * that case: the transaction must have deleted the buffer for it to be * reused here. */ spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, (jh->b_transaction == transaction || jh->b_transaction == NULL || (jh->b_transaction == journal->j_committing_transaction && jh->b_jlist == BJ_Forget))); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { /* * Previous jbd2_journal_forget() could have left the buffer * with jbddirty bit set because it was being committed. When * the commit finished, we've filed the buffer for * checkpointing and marked it dirty. Now we are reallocating * the buffer so the transaction freeing it must have * committed and so it's safe to clear the dirty bit. */ clear_buffer_dirty(jh2bh(jh)); /* first access by this transaction */ jh->b_modified = 0; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction == journal->j_committing_transaction) { /* first access by this transaction */ jh->b_modified = 0; JBUFFER_TRACE(jh, "set next transaction"); spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); } spin_unlock(&jh->b_state_lock); /* * akpm: I added this. ext3_alloc_branch can pick up new indirect * blocks which contain freed but then revoked metadata. We need * to cancel the revoke in case we end up freeing it yet again * and the reallocating as data - this would cause a second revoke, * which hits an assertion error. */ JBUFFER_TRACE(jh, "cancelling revoke"); jbd2_journal_cancel_revoke(handle, jh); out: jbd2_journal_put_journal_head(jh); return err; } /** * jbd2_journal_get_undo_access() - Notify intent to modify metadata with * non-rewindable consequences * @handle: transaction * @bh: buffer to undo * * Sometimes there is a need to distinguish between metadata which has * been committed to disk and that which has not. The ext3fs code uses * this for freeing and allocating space, we have to make sure that we * do not reuse freed space until the deallocation has been committed, * since if we overwrote that space we would make the delete * un-rewindable in case of a crash. * * To deal with that, jbd2_journal_get_undo_access requests write access to a * buffer for parts of non-rewindable operations such as delete * operations on the bitmaps. The journaling code must keep a copy of * the buffer's contents prior to the undo_access call until such time * as we know that the buffer has definitely been committed to disk. * * We never need to know which transaction the committed data is part * of, buffers touched here are guaranteed to be dirtied later and so * will be committed to a new transaction in due course, at which point * we can discard the old committed data pointer. * * Returns error number or 0 on success. */ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) { int err; struct journal_head *jh; char *committed_data = NULL; if (is_handle_aborted(handle)) return -EROFS; if (jbd2_write_access_granted(handle, bh, true)) return 0; jh = jbd2_journal_add_journal_head(bh); JBUFFER_TRACE(jh, "entry"); /* * Do this first --- it can drop the journal lock, so we want to * make sure that obtaining the committed_data is done * atomically wrt. completion of any outstanding commits. */ err = do_get_write_access(handle, jh, 1); if (err) goto out; repeat: if (!jh->b_committed_data) committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS|__GFP_NOFAIL); spin_lock(&jh->b_state_lock); if (!jh->b_committed_data) { /* Copy out the current buffer contents into the * preserved, committed copy. */ JBUFFER_TRACE(jh, "generate b_committed data"); if (!committed_data) { spin_unlock(&jh->b_state_lock); goto repeat; } jh->b_committed_data = committed_data; committed_data = NULL; memcpy(jh->b_committed_data, bh->b_data, bh->b_size); } spin_unlock(&jh->b_state_lock); out: jbd2_journal_put_journal_head(jh); if (unlikely(committed_data)) jbd2_free(committed_data, bh->b_size); return err; } /** * jbd2_journal_set_triggers() - Add triggers for commit writeout * @bh: buffer to trigger on * @type: struct jbd2_buffer_trigger_type containing the trigger(s). * * Set any triggers on this journal_head. This is always safe, because * triggers for a committing buffer will be saved off, and triggers for * a running transaction will match the buffer in that transaction. * * Call with NULL to clear the triggers. */ void jbd2_journal_set_triggers(struct buffer_head *bh, struct jbd2_buffer_trigger_type *type) { struct journal_head *jh = jbd2_journal_grab_journal_head(bh); if (WARN_ON_ONCE(!jh)) return; jh->b_triggers = type; jbd2_journal_put_journal_head(jh); } void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers) { struct buffer_head *bh = jh2bh(jh); if (!triggers || !triggers->t_frozen) return; triggers->t_frozen(triggers, bh, mapped_data, bh->b_size); } void jbd2_buffer_abort_trigger(struct journal_head *jh, struct jbd2_buffer_trigger_type *triggers) { if (!triggers || !triggers->t_abort) return; triggers->t_abort(triggers, jh2bh(jh)); } /** * jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark * * mark dirty metadata which needs to be journaled as part of the current * transaction. * * The buffer must have previously had jbd2_journal_get_write_access() * called so that it has a valid journal_head attached to the buffer * head. * * The buffer is placed on the transaction's metadata list and is marked * as belonging to the transaction. * * Returns error number or 0 on success. * * Special care needs to be taken if the buffer already belongs to the * current committing transaction (in which case we should have frozen * data present for that commit). In that case, we don't relink the * buffer: that only gets done when the old transaction finally * completes its commit. */ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal; struct journal_head *jh; int ret = 0; if (!buffer_jbd(bh)) return -EUCLEAN; /* * We don't grab jh reference here since the buffer must be part * of the running transaction. */ jh = bh2jh(bh); jbd2_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); /* * This and the following assertions are unreliable since we may see jh * in inconsistent state unless we grab bh_state lock. But this is * crucial to catch bugs so let's do a reliable check until the * lockless handling is fully proven. */ if (data_race(jh->b_transaction != transaction && jh->b_next_transaction != transaction)) { spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_next_transaction == transaction); spin_unlock(&jh->b_state_lock); } if (jh->b_modified == 1) { /* If it's in our transaction it must be in BJ_Metadata list. */ if (data_race(jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata)) { spin_lock(&jh->b_state_lock); if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) pr_err("JBD2: assertion failure: h_type=%u " "h_line_no=%u block_no=%llu jlist=%u\n", handle->h_type, handle->h_line_no, (unsigned long long) bh->b_blocknr, jh->b_jlist); J_ASSERT_JH(jh, jh->b_transaction != transaction || jh->b_jlist == BJ_Metadata); spin_unlock(&jh->b_state_lock); } goto out; } journal = transaction->t_journal; spin_lock(&jh->b_state_lock); if (is_handle_aborted(handle)) { /* * Check journal aborting with @jh->b_state_lock locked, * since 'jh->b_transaction' could be replaced with * 'jh->b_next_transaction' during old transaction * committing if journal aborted, which may fail * assertion on 'jh->b_frozen_data == NULL'. */ ret = -EROFS; goto out_unlock_bh; } if (jh->b_modified == 0) { /* * This buffer's got modified and becoming part * of the transaction. This needs to be done * once a transaction -bzzz */ if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) { ret = -ENOSPC; goto out_unlock_bh; } jh->b_modified = 1; handle->h_total_credits--; } /* * fastpath, to avoid expensive locking. If this buffer is already * on the running transaction's metadata list there is nothing to do. * Nobody can take it off again because there is a handle open. * I _think_ we're OK here with SMP barriers - a mistaken decision will * result in this test being false, so we go in and take the locks. */ if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { JBUFFER_TRACE(jh, "fastpath"); if (unlikely(jh->b_transaction != journal->j_running_transaction)) { printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " "journal->j_running_transaction (%p, %u)\n", journal->j_devname, (unsigned long long) bh->b_blocknr, jh->b_transaction, jh->b_transaction ? jh->b_transaction->t_tid : 0, journal->j_running_transaction, journal->j_running_transaction ? journal->j_running_transaction->t_tid : 0); ret = -EINVAL; } goto out_unlock_bh; } set_buffer_jbddirty(bh); /* * Metadata already on the current transaction list doesn't * need to be filed. Metadata on another transaction's list must * be committing, and will be refiled once the commit completes: * leave it alone for now. */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); if (unlikely(((jh->b_transaction != journal->j_committing_transaction)) || (jh->b_next_transaction != transaction))) { printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: " "bad jh for block %llu: " "transaction (%p, %u), " "jh->b_transaction (%p, %u), " "jh->b_next_transaction (%p, %u), jlist %u\n", journal->j_devname, (unsigned long long) bh->b_blocknr, transaction, transaction->t_tid, jh->b_transaction, jh->b_transaction ? jh->b_transaction->t_tid : 0, jh->b_next_transaction, jh->b_next_transaction ? jh->b_next_transaction->t_tid : 0, jh->b_jlist); WARN_ON(1); ret = -EINVAL; } /* And this case is illegal: we can't reuse another * transaction's data buffer, ever. */ goto out_unlock_bh; } /* That test should have eliminated the following case: */ J_ASSERT_JH(jh, jh->b_frozen_data == NULL); JBUFFER_TRACE(jh, "file as BJ_Metadata"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); spin_unlock(&journal->j_list_lock); out_unlock_bh: spin_unlock(&jh->b_state_lock); out: JBUFFER_TRACE(jh, "exit"); return ret; } /** * jbd2_journal_forget() - bforget() for potentially-journaled buffers. * @handle: transaction handle * @bh: bh to 'forget' * * We can only do the bforget if there are no commits pending against the * buffer. If the buffer is dirty in the current running transaction we * can safely unlink it. * * bh may not be a journalled buffer at all - it may be a non-JBD * buffer which came off the hashtable. Check for this. * * Decrements bh->b_count by one. * * Allow this call even if the handle has aborted --- it may be part of * the caller's cleanup after an abort. */ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal; struct journal_head *jh; int drop_reserve = 0; int err = 0; int was_modified = 0; if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; BUFFER_TRACE(bh, "entry"); jh = jbd2_journal_grab_journal_head(bh); if (!jh) { __bforget(bh); return 0; } spin_lock(&jh->b_state_lock); /* Critical error: attempting to delete a bitmap buffer, maybe? * Don't do any jbd operations, and return an error. */ if (!J_EXPECT_JH(jh, !jh->b_committed_data, "inconsistent data on disk")) { err = -EIO; goto drop; } /* keep track of whether or not this transaction modified us */ was_modified = jh->b_modified; /* * The buffer's going from the transaction, we must drop * all references -bzzz */ jh->b_modified = 0; if (jh->b_transaction == transaction) { J_ASSERT_JH(jh, !jh->b_frozen_data); /* If we are forgetting a buffer which is already part * of this transaction, then we can just drop it from * the transaction immediately. */ clear_buffer_dirty(bh); clear_buffer_jbddirty(bh); JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); /* * we only want to drop a reference if this transaction * modified the buffer */ if (was_modified) drop_reserve = 1; /* * We are no longer going to journal this buffer. * However, the commit of this transaction is still * important to the buffer: the delete that we are now * processing might obsolete an old log entry, so by * committing, we can satisfy the buffer's checkpoint. * * So, if we have a checkpoint on the buffer, we should * now refile the buffer on our BJ_Forget list so that * we know to remove the checkpoint after we commit. */ spin_lock(&journal->j_list_lock); if (jh->b_cp_transaction) { __jbd2_journal_temp_unlink_buffer(jh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); } else { __jbd2_journal_unfile_buffer(jh); jbd2_journal_put_journal_head(jh); } spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { J_ASSERT_JH(jh, (jh->b_transaction == journal->j_committing_transaction)); /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ JBUFFER_TRACE(jh, "belongs to older transaction"); /* ... but we CAN drop it from the new transaction through * marking the buffer as freed and set j_next_transaction to * the new transaction, so that not only the commit code * knows it should clear dirty bits when it is done with the * buffer, but also the buffer can be checkpointed only * after the new transaction commits. */ set_buffer_freed(bh); if (!jh->b_next_transaction) { spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); } else { J_ASSERT(jh->b_next_transaction == transaction); /* * only drop a reference if this transaction modified * the buffer */ if (was_modified) drop_reserve = 1; } } else { /* * Finally, if the buffer is not belongs to any * transaction, we can just drop it now if it has no * checkpoint. */ spin_lock(&journal->j_list_lock); if (!jh->b_cp_transaction) { JBUFFER_TRACE(jh, "belongs to none transaction"); spin_unlock(&journal->j_list_lock); goto drop; } /* * Otherwise, if the buffer has been written to disk, * it is safe to remove the checkpoint and drop it. */ if (jbd2_journal_try_remove_checkpoint(jh) >= 0) { spin_unlock(&journal->j_list_lock); goto drop; } /* * The buffer is still not written to disk, we should * attach this buffer to current transaction so that the * buffer can be checkpointed only after the current * transaction commits. */ clear_buffer_dirty(bh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); spin_unlock(&journal->j_list_lock); } drop: __brelse(bh); spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); if (drop_reserve) { /* no need to reserve log space for this block -bzzz */ handle->h_total_credits++; } return err; } /** * jbd2_journal_stop() - complete a transaction * @handle: transaction to complete. * * All done for a particular handle. * * There is not much action needed here. We just return any remaining * buffer credits to the transaction and remove the handle. The only * complication is that we need to start a commit operation if the * filesystem is marked for synchronous update. * * jbd2_journal_stop itself will not usually return an error, but it may * do so in unusual circumstances. In particular, expect it to * return -EIO if a jbd2_journal_abort has been executed since the * transaction began. */ int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal; int err = 0, wait_for_commit = 0; tid_t tid; pid_t pid; if (--handle->h_ref > 0) { jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, handle->h_ref); if (is_handle_aborted(handle)) return -EIO; return 0; } if (!transaction) { /* * Handle is already detached from the transaction so there is * nothing to do other than free the handle. */ memalloc_nofs_restore(handle->saved_alloc_context); goto free_and_exit; } journal = transaction->t_journal; tid = transaction->t_tid; if (is_handle_aborted(handle)) err = -EIO; jbd2_debug(4, "Handle %p going down\n", handle); trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, tid, handle->h_type, handle->h_line_no, jiffies - handle->h_start_jiffies, handle->h_sync, handle->h_requested_credits, (handle->h_requested_credits - handle->h_total_credits)); /* * Implement synchronous transaction batching. If the handle * was synchronous, don't force a commit immediately. Let's * yield and let another thread piggyback onto this * transaction. Keep doing that while new threads continue to * arrive. It doesn't cost much - we're about to run a commit * and sleep on IO anyway. Speeds up many-threaded, many-dir * operations by 30x or more... * * We try and optimize the sleep time against what the * underlying disk can do, instead of having a static sleep * time. This is useful for the case where our storage is so * fast that it is more optimal to go ahead and force a flush * and wait for the transaction to be committed than it is to * wait for an arbitrary amount of time for new writers to * join the transaction. We achieve this by measuring how * long it takes to commit a transaction, and compare it with * how long this transaction has been running, and if run time * < commit time then we sleep for the delta and commit. This * greatly helps super fast disks that would see slowdowns as * more threads started doing fsyncs. * * But don't do this if this process was the most recent one * to perform a synchronous write. We do this to detect the * case where a single process is doing a stream of sync * writes. No point in waiting for joiners in that case. * * Setting max_batch_time to 0 disables this completely. */ pid = current->pid; if (handle->h_sync && journal->j_last_sync_writer != pid && journal->j_max_batch_time) { u64 commit_time, trans_time; journal->j_last_sync_writer = pid; read_lock(&journal->j_state_lock); commit_time = journal->j_average_commit_time; read_unlock(&journal->j_state_lock); trans_time = ktime_to_ns(ktime_sub(ktime_get(), transaction->t_start_time)); commit_time = max_t(u64, commit_time, 1000*journal->j_min_batch_time); commit_time = min_t(u64, commit_time, 1000*journal->j_max_batch_time); if (trans_time < commit_time) { ktime_t expires = ktime_add_ns(ktime_get(), commit_time); set_current_state(TASK_UNINTERRUPTIBLE); schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); } } if (handle->h_sync) transaction->t_synchronous_commit = 1; /* * If the handle is marked SYNC, we need to set another commit * going! We also want to force a commit if the transaction is too * old now. */ if (handle->h_sync || time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write * anything to disk. */ jbd2_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ jbd2_log_start_commit(journal, tid); /* * Special case: JBD2_SYNC synchronous updates require us * to wait for the commit to complete. */ if (handle->h_sync && !(current->flags & PF_MEMALLOC)) wait_for_commit = 1; } /* * Once stop_this_handle() drops t_updates, the transaction could start * committing on us and eventually disappear. So we must not * dereference transaction pointer again after calling * stop_this_handle(). */ stop_this_handle(handle); if (wait_for_commit) err = jbd2_log_wait_commit(journal, tid); free_and_exit: if (handle->h_rsv_handle) jbd2_free_handle(handle->h_rsv_handle); jbd2_free_handle(handle); return err; } /* * * List management code snippets: various functions for manipulating the * transaction buffer lists. * */ /* * Append a buffer to a transaction list, given the transaction's list head * pointer. * * j_list_lock is held. * * jh->b_state_lock is held. */ static inline void __blist_add_buffer(struct journal_head **list, struct journal_head *jh) { if (!*list) { jh->b_tnext = jh->b_tprev = jh; *list = jh; } else { /* Insert at the tail of the list to preserve order */ struct journal_head *first = *list, *last = first->b_tprev; jh->b_tprev = last; jh->b_tnext = first; last->b_tnext = first->b_tprev = jh; } } /* * Remove a buffer from a transaction list, given the transaction's list * head pointer. * * Called with j_list_lock held, and the journal may not be locked. * * jh->b_state_lock is held. */ static inline void __blist_del_buffer(struct journal_head **list, struct journal_head *jh) { if (*list == jh) { *list = jh->b_tnext; if (*list == jh) *list = NULL; } jh->b_tprev->b_tnext = jh->b_tnext; jh->b_tnext->b_tprev = jh->b_tprev; } /* * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or * t_reserved_list. If the caller is holding onto a copy of one of these * pointers, it could go bad. Generally the caller needs to re-read the * pointer from the transaction_t. * * Called under j_list_lock. */ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) { struct journal_head **list = NULL; transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); lockdep_assert_held(&jh->b_state_lock); transaction = jh->b_transaction; if (transaction) assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); if (jh->b_jlist != BJ_None) J_ASSERT_JH(jh, transaction != NULL); switch (jh->b_jlist) { case BJ_None: return; case BJ_Metadata: transaction->t_nr_buffers--; J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); list = &transaction->t_buffers; break; case BJ_Forget: list = &transaction->t_forget; break; case BJ_Shadow: list = &transaction->t_shadow_list; break; case BJ_Reserved: list = &transaction->t_reserved_list; break; } __blist_del_buffer(list, jh); jh->b_jlist = BJ_None; if (transaction && is_journal_aborted(transaction->t_journal)) clear_buffer_jbddirty(bh); else if (test_clear_buffer_jbddirty(bh)) mark_buffer_dirty(bh); /* Expose it to the VM */ } /* * Remove buffer from all transactions. The caller is responsible for dropping * the jh reference that belonged to the transaction. * * Called with bh_state lock and j_list_lock */ static void __jbd2_journal_unfile_buffer(struct journal_head *jh) { J_ASSERT_JH(jh, jh->b_transaction != NULL); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; } void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); /* Get reference so that buffer cannot be freed before we unlock it */ get_bh(bh); spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); __jbd2_journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); __brelse(bh); } /** * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @folio: Folio to detach data from. * * For all the buffers on this page, * if they are fully written out ordered data, move them onto BUF_CLEAN * so try_to_free_buffers() can reap them. * * This function returns non-zero if we wish try_to_free_buffers() * to be called. We do this if the page is releasable by try_to_free_buffers(). * We also do it if the page has locked or dirty buffers and the caller wants * us to perform sync or async writeout. * * This complicates JBD locking somewhat. We aren't protected by the * BKL here. We wish to remove the buffer from its committing or * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer. * * This may *change* the value of transaction_t->t_datalist, so anyone * who looks at t_datalist needs to lock against this function. * * Even worse, someone may be doing a jbd2_journal_dirty_data on this * buffer. So we need to lock against that. jbd2_journal_dirty_data() * will come out of the lock with the buffer dirty, which makes it * ineligible for release here. * * Who else is affected by this? hmm... Really the only contender * is do_get_write_access() - it could be looking at the buffer while * journal_try_to_free_buffer() is changing its state. But that * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? * * Return false on failure, true on success */ bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio) { struct buffer_head *head; struct buffer_head *bh; bool ret = false; J_ASSERT(folio_test_locked(folio)); head = folio_buffers(folio); bh = head; do { struct journal_head *jh; /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of * jbd2_journal_put_journal_head(). */ jh = jbd2_journal_grab_journal_head(bh); if (!jh) continue; spin_lock(&jh->b_state_lock); if (!jh->b_transaction && !jh->b_next_transaction) { spin_lock(&journal->j_list_lock); /* Remove written-back checkpointed metadata buffer */ if (jh->b_cp_transaction != NULL) jbd2_journal_try_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); } spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); ret = try_to_free_buffers(folio); busy: return ret; } /* * This buffer is no longer needed. If it is on an older transaction's * checkpoint list we need to record it on this transaction's forget list * to pin this buffer (and hence its checkpointing transaction) down until * this transaction commits. If the buffer isn't on a checkpoint list, we * release it. * Returns non-zero if JBD no longer has an interest in the buffer. * * Called under j_list_lock. * * Called under jh->b_state_lock. */ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) { int may_free = 1; struct buffer_head *bh = jh2bh(jh); if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); __jbd2_journal_temp_unlink_buffer(jh); /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in * __journal_file_buffer */ clear_buffer_dirty(bh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); __jbd2_journal_unfile_buffer(jh); jbd2_journal_put_journal_head(jh); } return may_free; } /* * jbd2_journal_invalidate_folio * * This code is tricky. It has a number of cases to deal with. * * There are two invariants which this code relies on: * * i_size must be updated on disk before we start calling invalidate_folio * on the data. * * This is done in ext3 by defining an ext3_setattr method which * updates i_size before truncate gets going. By maintaining this * invariant, we can be sure that it is safe to throw away any buffers * attached to the current transaction: once the transaction commits, * we know that the data will not be needed. * * Note however that we can *not* throw away data belonging to the * previous, committing transaction! * * Any disk blocks which *are* part of the previous, committing * transaction (and which therefore cannot be discarded immediately) are * not going to be reused in the new running transaction * * The bitmap committed_data images guarantee this: any block which is * allocated in one transaction and removed in the next will be marked * as in-use in the committed_data bitmap, so cannot be reused until * the next transaction to delete the block commits. This means that * leaving committing buffers dirty is quite safe: the disk blocks * cannot be reallocated to a different file and so buffer aliasing is * not possible. * * * The above applies mainly to ordered data mode. In writeback mode we * don't make guarantees about the order in which data hits disk --- in * particular we don't guarantee that new dirty data is flushed before * transaction commit --- so it is always safe just to discard data * immediately in that mode. --sct */ /* * The journal_unmap_buffer helper function returns zero if the buffer * concerned remains pinned as an anonymous buffer belonging to an older * transaction. * * We're outside-transaction here. Either or both of j_running_transaction * and j_committing_transaction may be NULL. */ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, int partial_page) { transaction_t *transaction; struct journal_head *jh; int may_free = 1; BUFFER_TRACE(bh, "entry"); /* * It is safe to proceed here without the j_list_lock because the * buffers cannot be stolen by try_to_free_buffers as long as we are * holding the page lock. --sct */ jh = jbd2_journal_grab_journal_head(bh); if (!jh) goto zap_buffer_unlocked; /* OK, we have data buffer in journaled mode */ write_lock(&journal->j_state_lock); spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); /* * We cannot remove the buffer from checkpoint lists until the * transaction adding inode to orphan list (let's call it T) * is committed. Otherwise if the transaction changing the * buffer would be cleaned from the journal before T is * committed, a crash will cause that the correct contents of * the buffer will be lost. On the other hand we have to * clear the buffer dirty bit at latest at the moment when the * transaction marking the buffer as freed in the filesystem * structures is committed because from that moment on the * block can be reallocated and used by a different page. * Since the block hasn't been freed yet but the inode has * already been added to orphan list, it is safe for us to add * the buffer to BJ_Forget list of the newest transaction. * * Also we have to clear buffer_mapped flag of a truncated buffer * because the buffer_head may be attached to the page straddling * i_size (can happen only when blocksize < pagesize) and thus the * buffer_head can be reused when the file is extended again. So we end * up keeping around invalidated buffers attached to transactions' * BJ_Forget list just to stop checkpointing code from cleaning up * the transaction this buffer was modified in. */ transaction = jh->b_transaction; if (transaction == NULL) { /* First case: not on any transaction. If it * has no checkpoint link, then we can zap it: * it's a writeback-mode buffer so we don't care * if it hits disk safely. */ if (!jh->b_cp_transaction) { JBUFFER_TRACE(jh, "not on any transaction: zap"); goto zap_buffer; } if (!buffer_dirty(bh)) { /* bdflush has written it. We can drop it now */ __jbd2_journal_remove_checkpoint(jh); goto zap_buffer; } /* OK, it must be in the journal but still not * written fully to disk: it's metadata or * journaled data... */ if (journal->j_running_transaction) { /* ... and once the current transaction has * committed, the buffer won't be needed any * longer. */ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); may_free = __dispose_buffer(jh, journal->j_running_transaction); goto zap_buffer; } else { /* There is no currently-running transaction. So the * orphan record which we wrote for this file must have * passed into commit. We must attach this buffer to * the committing transaction, if it exists. */ if (journal->j_committing_transaction) { JBUFFER_TRACE(jh, "give to committing trans"); may_free = __dispose_buffer(jh, journal->j_committing_transaction); goto zap_buffer; } else { /* The orphan record's transaction has * committed. We can cleanse this buffer */ clear_buffer_jbddirty(bh); __jbd2_journal_remove_checkpoint(jh); goto zap_buffer; } } } else if (transaction == journal->j_committing_transaction) { JBUFFER_TRACE(jh, "on committing transaction"); /* * The buffer is committing, we simply cannot touch * it. If the page is straddling i_size we have to wait * for commit and try again. */ if (partial_page) { spin_unlock(&journal->j_list_lock); spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); jbd2_journal_put_journal_head(jh); /* Already zapped buffer? Nothing to do... */ if (!bh->b_bdev) return 0; return -EBUSY; } /* * OK, buffer won't be reachable after truncate. We just clear * b_modified to not confuse transaction credit accounting, and * set j_next_transaction to the running transaction (if there * is one) and mark buffer as freed so that commit code knows * it should clear dirty bits when it is done with the buffer. */ set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) jh->b_next_transaction = journal->j_running_transaction; jh->b_modified = 0; spin_unlock(&journal->j_list_lock); spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); jbd2_journal_put_journal_head(jh); return 0; } else { /* Good, the buffer belongs to the running transaction. * We are writing our own transaction's data, not any * previous one's, so it is safe to throw it away * (remember that we expect the filesystem to have set * i_size already for this truncate so recovery will not * expose the disk blocks we are discarding here.) */ J_ASSERT_JH(jh, transaction == journal->j_running_transaction); JBUFFER_TRACE(jh, "on running transaction"); may_free = __dispose_buffer(jh, transaction); } zap_buffer: /* * This is tricky. Although the buffer is truncated, it may be reused * if blocksize < pagesize and it is attached to the page straddling * EOF. Since the buffer might have been added to BJ_Forget list of the * running transaction, journal_get_write_access() won't clear * b_modified and credit accounting gets confused. So clear b_modified * here. */ jh->b_modified = 0; spin_unlock(&journal->j_list_lock); spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); jbd2_journal_put_journal_head(jh); zap_buffer_unlocked: clear_buffer_dirty(bh); J_ASSERT_BH(bh, !buffer_jbddirty(bh)); clear_buffer_mapped(bh); clear_buffer_req(bh); clear_buffer_new(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); bh->b_bdev = NULL; return may_free; } /** * jbd2_journal_invalidate_folio() * @journal: journal to use for flush... * @folio: folio to flush * @offset: start of the range to invalidate * @length: length of the range to invalidate * * Reap page buffers containing data after in the specified range in page. * Can return -EBUSY if buffers are part of the committing transaction and * the page is straddling i_size. Caller then has to wait for current commit * and try again. */ int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio, size_t offset, size_t length) { struct buffer_head *head, *bh, *next; unsigned int stop = offset + length; unsigned int curr_off = 0; int partial_page = (offset || length < folio_size(folio)); int may_free = 1; int ret = 0; if (!folio_test_locked(folio)) BUG(); head = folio_buffers(folio); if (!head) return 0; BUG_ON(stop > folio_size(folio) || stop < length); /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be * cautious in our locking. */ bh = head; do { unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; if (next_off > stop) return 0; if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); ret = journal_unmap_buffer(journal, bh, partial_page); unlock_buffer(bh); if (ret < 0) return ret; may_free &= ret; } curr_off = next_off; bh = next; } while (bh != head); if (!partial_page) { if (may_free && try_to_free_buffers(folio)) J_ASSERT(!folio_buffers(folio)); } return 0; } /* * File a buffer on the given transaction list. */ void __jbd2_journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) { struct journal_head **list = NULL; int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); lockdep_assert_held(&jh->b_state_lock); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_transaction == NULL); if (jh->b_transaction && jh->b_jlist == jlist) return; if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { /* * For metadata buffers, we track dirty bit in buffer_jbddirty * instead of buffer_dirty. We should not see a dirty bit set * here because we clear it in do_get_write_access but e.g. * tune2fs can modify the sb and set the dirty bit at any time * so we try to gracefully handle that. */ if (buffer_dirty(bh)) warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; } if (jh->b_transaction) __jbd2_journal_temp_unlink_buffer(jh); else jbd2_journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { case BJ_None: J_ASSERT_JH(jh, !jh->b_committed_data); J_ASSERT_JH(jh, !jh->b_frozen_data); return; case BJ_Metadata: transaction->t_nr_buffers++; list = &transaction->t_buffers; break; case BJ_Forget: list = &transaction->t_forget; break; case BJ_Shadow: list = &transaction->t_shadow_list; break; case BJ_Reserved: list = &transaction->t_reserved_list; break; } __blist_add_buffer(list, jh); jh->b_jlist = jlist; if (was_dirty) set_buffer_jbddirty(bh); } void jbd2_journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) { spin_lock(&jh->b_state_lock); spin_lock(&transaction->t_journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, jlist); spin_unlock(&transaction->t_journal->j_list_lock); spin_unlock(&jh->b_state_lock); } /* * Remove a buffer from its current buffer list in preparation for * dropping it from its current transaction entirely. If the buffer has * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * * Called under j_list_lock * Called under jh->b_state_lock * * When this function returns true, there's no next transaction to refile to * and the caller has to drop jh reference through * jbd2_journal_put_journal_head(). */ bool __jbd2_journal_refile_buffer(struct journal_head *jh) { int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); lockdep_assert_held(&jh->b_state_lock); if (jh->b_transaction) assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); /* If the buffer is now unused, just drop it. */ if (jh->b_next_transaction == NULL) { __jbd2_journal_unfile_buffer(jh); return true; } /* * It has been modified by a later transaction: add it to the new * transaction's metadata list. */ was_dirty = test_clear_buffer_jbddirty(bh); __jbd2_journal_temp_unlink_buffer(jh); /* * b_transaction must be set, otherwise the new b_transaction won't * be holding jh reference */ J_ASSERT_JH(jh, jh->b_transaction != NULL); /* * We set b_transaction here because b_next_transaction will inherit * our jh reference and thus __jbd2_journal_file_buffer() must not * take a new one. */ WRITE_ONCE(jh->b_transaction, jh->b_next_transaction); WRITE_ONCE(jh->b_next_transaction, NULL); if (buffer_freed(bh)) jlist = BJ_Forget; else if (jh->b_modified) jlist = BJ_Metadata; else jlist = BJ_Reserved; __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) set_buffer_jbddirty(bh); return false; } /* * __jbd2_journal_refile_buffer() with necessary locking added. We take our * bh reference so that we can safely unlock bh. * * The jh and bh may be freed by this call. */ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) { bool drop; spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); drop = __jbd2_journal_refile_buffer(jh); spin_unlock(&jh->b_state_lock); spin_unlock(&journal->j_list_lock); if (drop) jbd2_journal_put_journal_head(jh); } /* * File inode in the inode list of the handle's transaction */ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, unsigned long flags, loff_t start_byte, loff_t end_byte) { transaction_t *transaction = handle->h_transaction; journal_t *journal; if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, transaction->t_tid); spin_lock(&journal->j_list_lock); jinode->i_flags |= flags; if (jinode->i_dirty_end) { jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte); jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte); } else { jinode->i_dirty_start = start_byte; jinode->i_dirty_end = end_byte; } /* Is inode already attached where we need it? */ if (jinode->i_transaction == transaction || jinode->i_next_transaction == transaction) goto done; /* * We only ever set this variable to 1 so the test is safe. Since * t_need_data_flush is likely to be set, we do the test to save some * cacheline bouncing */ if (!transaction->t_need_data_flush) transaction->t_need_data_flush = 1; /* On some different transaction's list - should be * the committing one */ if (jinode->i_transaction) { J_ASSERT(jinode->i_next_transaction == NULL); J_ASSERT(jinode->i_transaction == journal->j_committing_transaction); jinode->i_next_transaction = transaction; goto done; } /* Not on any transaction list... */ J_ASSERT(!jinode->i_next_transaction); jinode->i_transaction = transaction; list_add(&jinode->i_list, &transaction->t_inode_list); done: spin_unlock(&journal->j_list_lock); return 0; } int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *jinode, loff_t start_byte, loff_t length) { return jbd2_journal_file_inode(handle, jinode, JI_WRITE_DATA | JI_WAIT_DATA, start_byte, start_byte + length - 1); } int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode, loff_t start_byte, loff_t length) { return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, start_byte, start_byte + length - 1); } /* * File truncate and transaction commit interact with each other in a * non-trivial way. If a transaction writing data block A is * committing, we cannot discard the data by truncate until we have * written them. Otherwise if we crashed after the transaction with * write has committed but before the transaction with truncate has * committed, we could see stale data in block A. This function is a * helper to solve this problem. It starts writeout of the truncated * part in case it is in the committing transaction. * * Filesystem code must call this function when inode is journaled in * ordered mode before truncation happens and after the inode has been * placed on orphan list with the new inode size. The second condition * avoids the race that someone writes new data and we start * committing the transaction after this function has been called but * before a transaction for truncate is started (and furthermore it * allows us to optimize the case where the addition to orphan list * happens in the same transaction as write --- we don't have to write * any data in such case). */ int jbd2_journal_begin_ordered_truncate(journal_t *journal, struct jbd2_inode *jinode, loff_t new_size) { transaction_t *inode_trans, *commit_trans; int ret = 0; /* This is a quick check to avoid locking if not necessary */ if (!jinode->i_transaction) goto out; /* Locks are here just to force reading of recent values, it is * enough that the transaction was not committing before we started * a transaction adding the inode to orphan list */ read_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; read_unlock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); inode_trans = jinode->i_transaction; spin_unlock(&journal->j_list_lock); if (inode_trans == commit_trans) { ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, new_size, LLONG_MAX); if (ret) jbd2_journal_abort(journal, ret); } out: return ret; } |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright (C) 2016 - 2020 Christoph Hellwig */ #include <linux/init.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> #include <linux/device_cgroup.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/backing-dev.h> #include <linux/module.h> #include <linux/blkpg.h> #include <linux/magic.h> #include <linux/buffer_head.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/uio.h> #include <linux/namei.h> #include <linux/part_stat.h> #include <linux/uaccess.h> #include <linux/stat.h> #include "../fs/internal.h" #include "blk.h" /* Should we allow writing to mounted block devices? */ static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED); struct bdev_inode { struct block_device bdev; struct inode vfs_inode; }; static inline struct bdev_inode *BDEV_I(struct inode *inode) { return container_of(inode, struct bdev_inode, vfs_inode); } struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } EXPORT_SYMBOL(I_BDEV); static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; int ret; spin_lock(&inode->i_lock); while (inode->i_state & I_DIRTY) { spin_unlock(&inode->i_lock); ret = write_inode_now(inode, true); if (ret) pr_warn_ratelimited( "VFS: Dirty inode writeback failed for block device %pg (err=%d).\n", bdev, ret); spin_lock(&inode->i_lock); } spin_unlock(&inode->i_lock); } /* Kill _all_ buffers and pagecache , dirty or not.. */ static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; if (mapping_empty(mapping)) return; invalidate_bh_lrus(); truncate_inode_pages(mapping, 0); } /* Invalidate clean unused buffers and pagecache. */ void invalidate_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; if (mapping->nrpages) { invalidate_bh_lrus(); lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } } EXPORT_SYMBOL(invalidate_bdev); /* * Drop all buffers & page cache for given bdev range. This function bails * with error if bdev has other exclusive owner (such as filesystem). */ int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, loff_t lstart, loff_t lend) { /* * If we don't hold exclusive handle for the device, upgrade to it * while we discard the buffer cache to avoid discarding buffers * under live filesystem. */ if (!(mode & BLK_OPEN_EXCL)) { int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL); if (err) goto invalidate; } truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, truncate_bdev_range); return 0; invalidate: /* * Someone else has handle exclusively open. Try invalidating instead. * The 'end' argument is inclusive so the rounding is safe. */ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, lstart >> PAGE_SHIFT, lend >> PAGE_SHIFT); } static void set_init_blocksize(struct block_device *bdev) { unsigned int bsize = bdev_logical_block_size(bdev); loff_t size = i_size_read(bdev->bd_inode); while (bsize < PAGE_SIZE) { if (size & bsize) break; bsize <<= 1; } bdev->bd_inode->i_blkbits = blksize_bits(bsize); } int set_blocksize(struct block_device *bdev, int size) { /* Size must be a power of two, and between 512 and PAGE_SIZE */ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ if (size < bdev_logical_block_size(bdev)) return -EINVAL; /* Don't change the size if it is same as current */ if (bdev->bd_inode->i_blkbits != blksize_bits(size)) { sync_blockdev(bdev); bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); } return 0; } EXPORT_SYMBOL(set_blocksize); int sb_set_blocksize(struct super_block *sb, int size) { if (set_blocksize(sb->s_bdev, size)) return 0; /* If we get here, we know size is power of two * and it's value is between 512 and PAGE_SIZE */ sb->s_blocksize = size; sb->s_blocksize_bits = blksize_bits(size); return sb->s_blocksize; } EXPORT_SYMBOL(sb_set_blocksize); int sb_min_blocksize(struct super_block *sb, int size) { int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) size = minsize; return sb_set_blocksize(sb, size); } EXPORT_SYMBOL(sb_min_blocksize); int sync_blockdev_nowait(struct block_device *bdev) { if (!bdev) return 0; return filemap_flush(bdev->bd_inode->i_mapping); } EXPORT_SYMBOL_GPL(sync_blockdev_nowait); /* * Write out and wait upon all the dirty data associated with a block * device via its mapping. Does not take the superblock lock. */ int sync_blockdev(struct block_device *bdev) { if (!bdev) return 0; return filemap_write_and_wait(bdev->bd_inode->i_mapping); } EXPORT_SYMBOL(sync_blockdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) { return filemap_write_and_wait_range(bdev->bd_inode->i_mapping, lstart, lend); } EXPORT_SYMBOL(sync_blockdev_range); /** * bdev_freeze - lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. * The reference counter (bd_fsfreeze_count) guarantees that only the last * unfreeze process can unfreeze the frozen filesystem actually when multiple * freeze requests arrive simultaneously. It counts up in bdev_freeze() and * count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze * actually. * * Return: On success zero is returned, negative error code on failure. */ int bdev_freeze(struct block_device *bdev) { int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) { error = bdev->bd_holder_ops->freeze(bdev); lockdep_assert_not_held(&bdev->bd_holder_lock); } else { mutex_unlock(&bdev->bd_holder_lock); error = sync_blockdev(bdev); } if (error) atomic_dec(&bdev->bd_fsfreeze_count); mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } EXPORT_SYMBOL(bdev_freeze); /** * bdev_thaw - unlock filesystem * @bdev: blockdevice to unlock * * Unlocks the filesystem and marks it writeable again after bdev_freeze(). * * Return: On success zero is returned, negative error code on failure. */ int bdev_thaw(struct block_device *bdev) { int error = -EINVAL, nr_freeze; mutex_lock(&bdev->bd_fsfreeze_mutex); /* * If this returns < 0 it means that @bd_fsfreeze_count was * already 0 and no decrement was performed. */ nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count); if (nr_freeze < 0) goto out; error = 0; if (nr_freeze > 0) goto out; mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) { error = bdev->bd_holder_ops->thaw(bdev); lockdep_assert_not_held(&bdev->bd_holder_lock); } else { mutex_unlock(&bdev->bd_holder_lock); } if (error) atomic_inc(&bdev->bd_fsfreeze_count); out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } EXPORT_SYMBOL(bdev_thaw); /* * pseudo-fs */ static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock); static struct kmem_cache *bdev_cachep __ro_after_init; static struct inode *bdev_alloc_inode(struct super_block *sb) { struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL); if (!ei) return NULL; memset(&ei->bdev, 0, sizeof(ei->bdev)); return &ei->vfs_inode; } static void bdev_free_inode(struct inode *inode) { struct block_device *bdev = I_BDEV(inode); free_percpu(bdev->bd_stats); kfree(bdev->bd_meta_info); if (!bdev_is_partition(bdev)) { if (bdev->bd_disk && bdev->bd_disk->bdi) bdi_put(bdev->bd_disk->bdi); kfree(bdev->bd_disk); } if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(bdev->bd_dev)); kmem_cache_free(bdev_cachep, BDEV_I(inode)); } static void init_once(void *data) { struct bdev_inode *ei = data; inode_init_once(&ei->vfs_inode); } static void bdev_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); } static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .free_inode = bdev_free_inode, .drop_inode = generic_delete_inode, .evict_inode = bdev_evict_inode, }; static int bd_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC); if (!ctx) return -ENOMEM; fc->s_iflags |= SB_I_CGROUPWB; ctx->ops = &bdev_sops; return 0; } static struct file_system_type bd_type = { .name = "bdev", .init_fs_context = bd_init_fs_context, .kill_sb = kill_anon_super, }; struct super_block *blockdev_superblock __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { int err; static struct vfsmount *bd_mnt __ro_after_init; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) panic("Cannot register bdev pseudo-fs"); bd_mnt = kern_mount(&bd_type); if (IS_ERR(bd_mnt)) panic("Cannot create bdev pseudo-fs"); blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) { struct block_device *bdev; struct inode *inode; inode = new_inode(blockdev_superblock); if (!inode) return NULL; inode->i_mode = S_IFBLK; inode->i_rdev = 0; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); bdev = I_BDEV(inode); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); mutex_init(&bdev->bd_holder_lock); bdev->bd_partno = partno; bdev->bd_inode = inode; bdev->bd_queue = disk->queue; if (partno) bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio; else bdev->bd_has_submit_bio = false; bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); return NULL; } bdev->bd_disk = disk; return bdev; } void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) { spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); bdev->bd_nr_sectors = sectors; spin_unlock(&bdev->bd_size_lock); } void bdev_add(struct block_device *bdev, dev_t dev) { if (bdev_stable_writes(bdev)) mapping_set_stable_writes(bdev->bd_inode->i_mapping); bdev->bd_dev = dev; bdev->bd_inode->i_rdev = dev; bdev->bd_inode->i_ino = dev; insert_inode_hash(bdev->bd_inode); } long nr_blockdev_pages(void) { struct inode *inode; long ret = 0; spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) ret += inode->i_mapping->nrpages; spin_unlock(&blockdev_superblock->s_inode_list_lock); return ret; } /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest * @holder: holder trying to claim @bdev * @hops: holder ops * * Test whether @bdev can be claimed by @holder. * * RETURNS: * %true if @bdev can be claimed, %false otherwise. */ static bool bd_may_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); lockdep_assert_held(&bdev_lock); if (bdev->bd_holder) { /* * The same holder can always re-claim. */ if (bdev->bd_holder == holder) { if (WARN_ON_ONCE(bdev->bd_holder_ops != hops)) return false; return true; } return false; } /* * If the whole devices holder is set to bd_may_claim, a partition on * the device is claimed, but not the whole device. */ if (whole != bdev && whole->bd_holder && whole->bd_holder != bd_may_claim) return false; return true; } /** * bd_prepare_to_claim - claim a block device * @bdev: block device of interest * @holder: holder trying to claim @bdev * @hops: holder ops. * * Claim @bdev. This function fails if @bdev is already claimed by another * holder and waits if another claiming is in progress. return, the caller * has ownership of bd_claiming and bd_holder[s]. * * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. */ int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); if (WARN_ON_ONCE(!holder)) return -EINVAL; retry: mutex_lock(&bdev_lock); /* if someone else claimed, fail */ if (!bd_may_claim(bdev, holder, hops)) { mutex_unlock(&bdev_lock); return -EBUSY; } /* if claiming is already in progress, wait for it to finish */ if (whole->bd_claiming) { wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); DEFINE_WAIT(wait); prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&bdev_lock); schedule(); finish_wait(wq, &wait); goto retry; } /* yay, all mine */ whole->bd_claiming = holder; mutex_unlock(&bdev_lock); return 0; } EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ static void bd_clear_claiming(struct block_device *whole, void *holder) { lockdep_assert_held(&bdev_lock); /* tell others that we're done */ BUG_ON(whole->bd_claiming != holder); whole->bd_claiming = NULL; wake_up_bit(&whole->bd_claiming, 0); } /** * bd_finish_claiming - finish claiming of a block device * @bdev: block device of interest * @holder: holder that has claimed @bdev * @hops: block device holder operations * * Finish exclusive open of a block device. Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. */ static void bd_finish_claiming(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); mutex_lock(&bdev_lock); BUG_ON(!bd_may_claim(bdev, holder, hops)); /* * Note that for a whole device bd_holders will be incremented twice, * and bd_holder will be set to bd_may_claim before being set to holder */ whole->bd_holders++; whole->bd_holder = bd_may_claim; bdev->bd_holders++; mutex_lock(&bdev->bd_holder_lock); bdev->bd_holder = holder; bdev->bd_holder_ops = hops; mutex_unlock(&bdev->bd_holder_lock); bd_clear_claiming(whole, holder); mutex_unlock(&bdev_lock); } /** * bd_abort_claiming - abort claiming of a block device * @bdev: block device of interest * @holder: holder that has claimed @bdev * * Abort claiming of a block device when the exclusive open failed. This can be * also used when exclusive open is not actually desired and we just needed * to block other exclusive openers for a while. */ void bd_abort_claiming(struct block_device *bdev, void *holder) { mutex_lock(&bdev_lock); bd_clear_claiming(bdev_whole(bdev), holder); mutex_unlock(&bdev_lock); } EXPORT_SYMBOL(bd_abort_claiming); static void bd_end_claim(struct block_device *bdev, void *holder) { struct block_device *whole = bdev_whole(bdev); bool unblock = false; /* * Release a claim on the device. The holder fields are protected with * bdev_lock. open_mutex is used to synchronize disk_holder unlinking. */ mutex_lock(&bdev_lock); WARN_ON_ONCE(bdev->bd_holder != holder); WARN_ON_ONCE(--bdev->bd_holders < 0); WARN_ON_ONCE(--whole->bd_holders < 0); if (!bdev->bd_holders) { mutex_lock(&bdev->bd_holder_lock); bdev->bd_holder = NULL; bdev->bd_holder_ops = NULL; mutex_unlock(&bdev->bd_holder_lock); if (bdev->bd_write_holder) unblock = true; } if (!whole->bd_holders) whole->bd_holder = NULL; mutex_unlock(&bdev_lock); /* * If this was the last claim, remove holder link and unblock evpoll if * it was a write holder. */ if (unblock) { disk_unblock_events(bdev->bd_disk); bdev->bd_write_holder = false; } } static void blkdev_flush_mapping(struct block_device *bdev) { WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); bdev_write_inode(bdev); } static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) { struct gendisk *disk = bdev->bd_disk; int ret; if (disk->fops->open) { ret = disk->fops->open(disk, mode); if (ret) { /* avoid ghost partitions on a removed medium */ if (ret == -ENOMEDIUM && test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, true); return ret; } } if (!atomic_read(&bdev->bd_openers)) set_init_blocksize(bdev); if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); atomic_inc(&bdev->bd_openers); return 0; } static void blkdev_put_whole(struct block_device *bdev) { if (atomic_dec_and_test(&bdev->bd_openers)) blkdev_flush_mapping(bdev); if (bdev->bd_disk->fops->release) bdev->bd_disk->fops->release(bdev->bd_disk); } static int blkdev_get_part(struct block_device *part, blk_mode_t mode) { struct gendisk *disk = part->bd_disk; int ret; ret = blkdev_get_whole(bdev_whole(part), mode); if (ret) return ret; ret = -ENXIO; if (!bdev_nr_sectors(part)) goto out_blkdev_put; if (!atomic_read(&part->bd_openers)) { disk->open_partitions++; set_init_blocksize(part); } atomic_inc(&part->bd_openers); return 0; out_blkdev_put: blkdev_put_whole(bdev_whole(part)); return ret; } static void blkdev_put_part(struct block_device *part) { struct block_device *whole = bdev_whole(part); if (atomic_dec_and_test(&part->bd_openers)) { blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; } blkdev_put_whole(whole); } struct block_device *blkdev_get_no_open(dev_t dev) { struct block_device *bdev; struct inode *inode; inode = ilookup(blockdev_superblock, dev); if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) { blk_request_module(dev); inode = ilookup(blockdev_superblock, dev); if (inode) pr_warn_ratelimited( "block device autoloading is deprecated and will be removed.\n"); } if (!inode) return NULL; /* switch from the inode reference to a device mode one: */ bdev = &BDEV_I(inode)->bdev; if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) bdev = NULL; iput(inode); return bdev; } void blkdev_put_no_open(struct block_device *bdev) { put_device(&bdev->bd_device); } static bool bdev_writes_blocked(struct block_device *bdev) { return bdev->bd_writers == -1; } static void bdev_block_writes(struct block_device *bdev) { bdev->bd_writers = -1; } static void bdev_unblock_writes(struct block_device *bdev) { bdev->bd_writers = 0; } static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode) { if (bdev_allow_write_mounted) return true; /* Writes blocked? */ if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev)) return false; if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0) return false; return true; } static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) { if (bdev_allow_write_mounted) return; /* Claim exclusive or shared write access. */ if (mode & BLK_OPEN_RESTRICT_WRITES) bdev_block_writes(bdev); else if (mode & BLK_OPEN_WRITE) bdev->bd_writers++; } static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode) { if (bdev_allow_write_mounted) return; /* Yield exclusive or shared write access. */ if (mode & BLK_OPEN_RESTRICT_WRITES) bdev_unblock_writes(bdev); else if (mode & BLK_OPEN_WRITE) bdev->bd_writers--; } /** * bdev_open_by_dev - open a block device by device number * @dev: device number of block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier * @hops: holder operations * * Open the block device described by device number @dev. If @holder is not * %NULL, the block device is opened with exclusive access. Exclusive opens may * nest for the same @holder. * * Use this interface ONLY if you really do not have anything better - i.e. when * you are behind a truly sucky interface and all you are given is a device * number. Everything else should use bdev_open_by_path(). * * CONTEXT: * Might sleep. * * RETURNS: * Handle with a reference to the block_device on success, ERR_PTR(-errno) on * failure. */ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle), GFP_KERNEL); struct block_device *bdev; bool unblock_events = true; struct gendisk *disk; int ret; if (!handle) return ERR_PTR(-ENOMEM); ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, MAJOR(dev), MINOR(dev), ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); if (ret) goto free_handle; /* Blocking writes requires exclusive opener */ if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) { ret = -EINVAL; goto free_handle; } bdev = blkdev_get_no_open(dev); if (!bdev) { ret = -ENXIO; goto free_handle; } disk = bdev->bd_disk; if (holder) { mode |= BLK_OPEN_EXCL; ret = bd_prepare_to_claim(bdev, holder, hops); if (ret) goto put_blkdev; } else { if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) { ret = -EIO; goto put_blkdev; } } disk_block_events(disk); mutex_lock(&disk->open_mutex); ret = -ENXIO; if (!disk_live(disk)) goto abort_claiming; if (!try_module_get(disk->fops->owner)) goto abort_claiming; ret = -EBUSY; if (!bdev_may_open(bdev, mode)) goto abort_claiming; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); else ret = blkdev_get_whole(bdev, mode); if (ret) goto put_module; bdev_claim_write_access(bdev, mode); if (holder) { bd_finish_claiming(bdev, holder, hops); /* * Block event polling for write claims if requested. Any write * holder makes the write_holder state stick until all are * released. This is good enough and tracking individual * writeable reference is too fragile given the way @mode is * used in blkdev_get/put(). */ if ((mode & BLK_OPEN_WRITE) && !bdev->bd_write_holder && (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) { bdev->bd_write_holder = true; unblock_events = false; } } mutex_unlock(&disk->open_mutex); if (unblock_events) disk_unblock_events(disk); handle->bdev = bdev; handle->holder = holder; handle->mode = mode; return handle; put_module: module_put(disk->fops->owner); abort_claiming: if (holder) bd_abort_claiming(bdev, holder); mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); free_handle: kfree(handle); return ERR_PTR(ret); } EXPORT_SYMBOL(bdev_open_by_dev); /** * bdev_open_by_path - open a block device by name * @path: path to the block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier * @hops: holder operations * * Open the block device described by the device file at @path. If @holder is * not %NULL, the block device is opened with exclusive access. Exclusive opens * may nest for the same @holder. * * CONTEXT: * Might sleep. * * RETURNS: * Handle with a reference to the block_device on success, ERR_PTR(-errno) on * failure. */ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { struct bdev_handle *handle; dev_t dev; int error; error = lookup_bdev(path, &dev); if (error) return ERR_PTR(error); handle = bdev_open_by_dev(dev, mode, holder, hops); if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) && bdev_read_only(handle->bdev)) { bdev_release(handle); return ERR_PTR(-EACCES); } return handle; } EXPORT_SYMBOL(bdev_open_by_path); void bdev_release(struct bdev_handle *handle) { struct block_device *bdev = handle->bdev; struct gendisk *disk = bdev->bd_disk; /* * Sync early if it looks like we're the last one. If someone else * opens the block device between now and the decrement of bd_openers * then we did a sync that we didn't need to, but that's not the end * of the world and we want to avoid long (could be several minute) * syncs while holding the mutex. */ if (atomic_read(&bdev->bd_openers) == 1) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); bdev_yield_write_access(bdev, handle->mode); if (handle->holder) bd_end_claim(bdev, handle->holder); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE * event. This is to ensure detection of media removal commanded * from userland - e.g. eject(1). */ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); if (bdev_is_partition(bdev)) blkdev_put_part(bdev); else blkdev_put_whole(bdev); mutex_unlock(&disk->open_mutex); module_put(disk->fops->owner); blkdev_put_no_open(bdev); kfree(handle); } EXPORT_SYMBOL(bdev_release); /** * lookup_bdev() - Look up a struct block_device by name. * @pathname: Name of the block device in the filesystem. * @dev: Pointer to the block device's dev_t, if found. * * Lookup the block device's dev_t at @pathname in the current * namespace if possible and return it in @dev. * * Context: May sleep. * Return: 0 if succeeded, negative errno otherwise. */ int lookup_bdev(const char *pathname, dev_t *dev) { struct inode *inode; struct path path; int error; if (!pathname || !*pathname) return -EINVAL; error = kern_path(pathname, LOOKUP_FOLLOW, &path); if (error) return error; inode = d_backing_inode(path.dentry); error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) goto out_path_put; error = -EACCES; if (!may_open_dev(&path)) goto out_path_put; *dev = inode->i_rdev; error = 0; out_path_put: path_put(&path); return error; } EXPORT_SYMBOL(lookup_bdev); /** * bdev_mark_dead - mark a block device as dead * @bdev: block device to operate on * @surprise: indicate a surprise removal * * Tell the file system that this devices or media is dead. If @surprise is set * to %true the device or media is already gone, if not we are preparing for an * orderly removal. * * This calls into the file system, which then typicall syncs out all dirty data * and writes back inodes and then invalidates any cached data in the inodes on * the file system. In addition we also invalidate the block device mapping. */ void bdev_mark_dead(struct block_device *bdev, bool surprise) { mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead) bdev->bd_holder_ops->mark_dead(bdev, surprise); else { mutex_unlock(&bdev->bd_holder_lock); sync_blockdev(bdev); } invalidate_bdev(bdev); } /* * New drivers should not use this directly. There are some drivers however * that needs this for historical reasons. For example, the DASD driver has * historically had a shutdown to offline mode that doesn't actually remove the * gendisk that otherwise looks a lot like a safe device removal. */ EXPORT_SYMBOL_GPL(bdev_mark_dead); void sync_bdevs(bool wait) { struct inode *inode, *old_inode = NULL; spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; struct block_device *bdev; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || mapping->nrpages == 0) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&blockdev_superblock->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the * s_inode_list_lock We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); old_inode = inode; bdev = I_BDEV(inode); mutex_lock(&bdev->bd_disk->open_mutex); if (!atomic_read(&bdev->bd_openers)) { ; /* skip */ } else if (wait) { /* * We keep the error status of individual mapping so * that applications can catch the writeback error using * fsync(2). See filemap_fdatawait_keep_errors() for * details. */ filemap_fdatawait_keep_errors(inode->i_mapping); } else { filemap_fdatawrite(inode->i_mapping); } mutex_unlock(&bdev->bd_disk->open_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); } spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); } /* * Handle STATX_DIOALIGN for block devices. * * Note that the inode passed to this is the inode of a block device node file, * not the block device's internal inode. Therefore it is *not* valid to use * I_BDEV() here; the block device has to be looked up by i_rdev instead. */ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) { struct block_device *bdev; bdev = blkdev_get_no_open(inode->i_rdev); if (!bdev) return; stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; stat->dio_offset_align = bdev_logical_block_size(bdev); stat->result_mask |= STATX_DIOALIGN; blkdev_put_no_open(bdev); } static int __init setup_bdev_allow_write_mounted(char *str) { if (kstrtobool(str, &bdev_allow_write_mounted)) pr_warn("Invalid option string for bdev_allow_write_mounted:" " '%s'\n", str); return 1; } __setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted); |
4160 5 3582 3082 3581 2722 76 2 795 18 129 3568 1449 150 519 3939 2917 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 | /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #ifndef _LINUX_BPF_VERIFIER_H #define _LINUX_BPF_VERIFIER_H 1 #include <linux/bpf.h> /* for enum bpf_reg_type */ #include <linux/btf.h> /* for struct btf and btf_id() */ #include <linux/filter.h> /* for MAX_BPF_STACK */ #include <linux/tnum.h> /* Maximum variable offset umax_value permitted when resolving memory accesses. * In practice this is far bigger than any realistic pointer offset; this limit * ensures that umax_value + (int)off + (int)size cannot overflow a u64. */ #define BPF_MAX_VAR_OFF (1 << 29) /* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO]. This ensures * that converting umax_value to int cannot overflow. */ #define BPF_MAX_VAR_SIZ (1 << 29) /* size of tmp_str_buf in bpf_verifier. * we need at least 306 bytes to fit full stack mask representation * (in the "-8,-16,...,-512" form) */ #define TMP_STR_BUF_LEN 320 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that * "one of this state's descendants read this reg" (and therefore the reg is * relevant for states_equal() checks). * Write marks collect downwards and do not propagate; they record that "the * straight-line code that reached this state (from its parent) wrote this reg" * (and therefore that reads propagated from this state or its descendants * should not propagate to its parent). * A state with a write mark can receive read marks; it just won't propagate * them to its parent, since the write mark is a property, not of the state, * but of the link between it and its parent. See mark_reg_read() and * mark_stack_slot_read() in kernel/bpf/verifier.c. */ enum bpf_reg_liveness { REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ }; /* For every reg representing a map value or allocated object pointer, * we consider the tuple of (ptr, id) for them to be unique in verifier * context and conside them to not alias each other for the purposes of * tracking lock state. */ struct bpf_active_lock { /* This can either be reg->map_ptr or reg->btf. If ptr is NULL, * there's no active lock held, and other fields have no * meaning. If non-NULL, it indicates that a lock is held and * id member has the reg->id of the register which can be >= 0. */ void *ptr; /* This will be reg->id */ u32 id; }; #define ITER_PREFIX "bpf_iter_" enum bpf_iter_state { BPF_ITER_STATE_INVALID, /* for non-first slot */ BPF_ITER_STATE_ACTIVE, BPF_ITER_STATE_DRAINED, }; struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; /* Fixed part of pointer offset, pointer types only */ s32 off; union { /* valid when type == PTR_TO_PACKET */ int range; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL */ struct { struct bpf_map *map_ptr; /* To distinguish map lookups from outer map * the map_uid is non-zero for registers * pointing to inner maps. */ u32 map_uid; }; /* for PTR_TO_BTF_ID */ struct { struct btf *btf; u32 btf_id; }; struct { /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ u32 mem_size; u32 dynptr_id; /* for dynptr slices */ }; /* For dynptr stack slots */ struct { enum bpf_dynptr_type type; /* A dynptr is 16 bytes so it takes up 2 stack slots. * We need to track which slot is the first slot * to protect against cases where the user may try to * pass in an address starting at the second slot of the * dynptr. */ bool first_slot; } dynptr; /* For bpf_iter stack slots */ struct { /* BTF container and BTF type ID describing * struct bpf_iter_<type> of an iterator state */ struct btf *btf; u32 btf_id; /* packing following two fields to fit iter state into 16 bytes */ enum bpf_iter_state state:2; int depth:30; } iter; /* Max size from any of the above. */ struct { unsigned long raw1; unsigned long raw2; } raw; u32 subprogno; /* for PTR_TO_FUNC */ }; /* For scalar types (SCALAR_VALUE), this represents our knowledge of * the actual value. * For pointer types, this represents the variable part of the offset * from the pointed-to object, and is shared with all bpf_reg_states * with the same id as us. */ struct tnum var_off; /* Used to determine if any memory access using this register will * result in a bad access. * These refer to the same value as var_off, not necessarily the actual * contents of the register. */ s64 smin_value; /* minimum possible (s64)value */ s64 smax_value; /* maximum possible (s64)value */ u64 umin_value; /* minimum possible (u64)value */ u64 umax_value; /* maximum possible (u64)value */ s32 s32_min_value; /* minimum possible (s32)value */ s32 s32_max_value; /* maximum possible (s32)value */ u32 u32_min_value; /* minimum possible (u32)value */ u32 u32_max_value; /* maximum possible (u32)value */ /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * came from, when one is tested for != NULL. * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation * for the purpose of tracking that it's freed. * For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. * For stack slots that are dynptrs, this is used to track references to * the dynptr to determine proper reference freeing. * Similarly to dynptrs, we use ID to track "belonging" of a reference * to a specific instance of bpf_iter. */ u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned * from a pointer-cast helper, bpf_sk_fullsock() and * bpf_tcp_sock(). * * Consider the following where "sk" is a reference counted * pointer returned from "sk = bpf_sk_lookup_tcp();": * * 1: sk = bpf_sk_lookup_tcp(); * 2: if (!sk) { return 0; } * 3: fullsock = bpf_sk_fullsock(sk); * 4: if (!fullsock) { bpf_sk_release(sk); return 0; } * 5: tp = bpf_tcp_sock(fullsock); * 6: if (!tp) { bpf_sk_release(sk); return 0; } * 7: bpf_sk_release(sk); * 8: snd_cwnd = tp->snd_cwnd; // verifier will complain * * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and * "tp" ptr should be invalidated also. In order to do that, * the reg holding "fullsock" and "sk" need to remember * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id * such that the verifier can reset all regs which have * ref_obj_id matching the sk_reg->id. * * sk_reg->ref_obj_id is set to sk_reg->id at line 1. * sk_reg->id will stay as NULL-marking purpose only. * After NULL-marking is done, sk_reg->id can be reset to 0. * * After "fullsock = bpf_sk_fullsock(sk);" at line 3, * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id. * * After "tp = bpf_tcp_sock(fullsock);" at line 5, * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id * which is the same as sk_reg->ref_obj_id. * * From the verifier perspective, if sk, fullsock and tp * are not NULL, they are the same ptr with different * reg->type. In particular, bpf_sk_release(tp) is also * allowed and has the same effect as bpf_sk_release(sk). */ u32 ref_obj_id; /* parentage chain for liveness checking */ struct bpf_reg_state *parent; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' * is used which is an index in bpf_verifier_state->frame[] array * pointing to bpf_func_state. */ u32 frameno; /* Tracks subreg definition. The stored value is the insn_idx of the * writing insn. This is safe because subreg_def is used before any insn * patching which only happens after main verification finished. */ s32 subreg_def; enum bpf_reg_liveness live; /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ bool precise; }; enum bpf_stack_slot_type { STACK_INVALID, /* nothing was stored in this stack slot */ STACK_SPILL, /* register spilled into stack */ STACK_MISC, /* BPF program wrote some data into this slot */ STACK_ZERO, /* BPF program wrote constant zero */ /* A dynptr is stored in this stack slot. The type of dynptr * is stored in bpf_stack_state->spilled_ptr.dynptr.type */ STACK_DYNPTR, STACK_ITER, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ #define BPF_REGMASK_ARGS ((1 << BPF_REG_1) | (1 << BPF_REG_2) | \ (1 << BPF_REG_3) | (1 << BPF_REG_4) | \ (1 << BPF_REG_5)) #define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern) #define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE) struct bpf_stack_state { struct bpf_reg_state spilled_ptr; u8 slot_type[BPF_REG_SIZE]; }; struct bpf_reference_state { /* Track each reference created with a unique id, even if the same * instruction creates the reference multiple times (eg, via CALL). */ int id; /* Instruction where the allocation of this reference occurred. This * is used purely to inform the user of a reference leak. */ int insn_idx; /* There can be a case like: * main (frame 0) * cb (frame 1) * func (frame 3) * cb (frame 4) * Hence for frame 4, if callback_ref just stored boolean, it would be * impossible to distinguish nested callback refs. Hence store the * frameno and compare that to callback_ref in check_reference_leak when * exiting a callback function. */ int callback_ref; }; struct bpf_retval_range { s32 minval; s32 maxval; }; /* state of the program: * type of all registers and stack info */ struct bpf_func_state { struct bpf_reg_state regs[MAX_BPF_REG]; /* index of call instruction that called into this func */ int callsite; /* stack frame number of this function state from pov of * enclosing bpf_verifier_state. * 0 = main function, 1 = first callee. */ u32 frameno; /* subprog number == index within subprog_info * zero == main subprog */ u32 subprogno; /* Every bpf_timer_start will increment async_entry_cnt. * It's used to distinguish: * void foo(void) { for(;;); } * void foo(void) { bpf_timer_set_callback(,foo); } */ u32 async_entry_cnt; struct bpf_retval_range callback_ret_range; bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; /* For callback calling functions that limit number of possible * callback executions (e.g. bpf_loop) keeps track of current * simulated iteration number. * Value in frame N refers to number of times callback with frame * N+1 was simulated, e.g. for the following call: * * bpf_loop(..., fn, ...); | suppose current frame is N * | fn would be simulated in frame N+1 * | number of simulations is tracked in frame N */ u32 callback_depth; /* The following fields should be last. See copy_func_state() */ int acquired_refs; struct bpf_reference_state *refs; /* The state of the stack. Each element of the array describes BPF_REG_SIZE * (i.e. 8) bytes worth of stack memory. * stack[0] represents bytes [*(r10-8)..*(r10-1)] * stack[1] represents bytes [*(r10-16)..*(r10-9)] * ... * stack[allocated_stack/8 - 1] represents [*(r10-allocated_stack)..*(r10-allocated_stack+7)] */ struct bpf_stack_state *stack; /* Size of the current stack, in bytes. The stack state is tracked below, in * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE. */ int allocated_stack; }; #define MAX_CALL_FRAMES 8 /* instruction history flags, used in bpf_jmp_history_entry.flags field */ enum { /* instruction references stack slot through PTR_TO_STACK register; * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512, * 8 bytes per slot, so slot index (spi) is [0, 63]) */ INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */ INSN_F_SPI_MASK = 0x3f, /* 6 bits */ INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */ }; static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); struct bpf_jmp_history_entry { u32 idx; /* insn idx can't be bigger than 1 million */ u32 prev_idx : 22; /* special flags, e.g., whether insn is doing register stack spill/load */ u32 flags : 10; }; /* Maximum number of register states that can exist at once */ #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; struct bpf_verifier_state *parent; /* * 'branches' field is the number of branches left to explore: * 0 - all possible paths from this state reached bpf_exit or * were safely pruned * 1 - at least one path is being explored. * This state hasn't reached bpf_exit * 2 - at least two paths are being explored. * This state is an immediate parent of two children. * One is fallthrough branch with branches==1 and another * state is pushed into stack (to be explored later) also with * branches==1. The parent of this state has branches==1. * The verifier state tree connected via 'parent' pointer looks like: * 1 * 1 * 2 -> 1 (first 'if' pushed into stack) * 1 * 2 -> 1 (second 'if' pushed into stack) * 1 * 1 * 1 bpf_exit. * * Once do_check() reaches bpf_exit, it calls update_branch_counts() * and the verifier state tree will look: * 1 * 1 * 2 -> 1 (first 'if' pushed into stack) * 1 * 1 -> 1 (second 'if' pushed into stack) * 0 * 0 * 0 bpf_exit. * After pop_stack() the do_check() will resume at second 'if'. * * If is_state_visited() sees a state with branches > 0 it means * there is a loop. If such state is exactly equal to the current state * it's an infinite loop. Note states_equal() checks for states * equivalency, so two states being 'states_equal' does not mean * infinite loop. The exact comparison is provided by * states_maybe_looping() function. It's a stronger pre-check and * much faster than states_equal(). * * This algorithm may not find all possible infinite loops or * loop iteration count may be too high. * In such cases BPF_COMPLEXITY_LIMIT_INSNS limit kicks in. */ u32 branches; u32 insn_idx; u32 curframe; struct bpf_active_lock active_lock; bool speculative; bool active_rcu_lock; /* If this state was ever pointed-to by other state's loop_entry field * this flag would be set to true. Used to avoid freeing such states * while they are still in use. */ bool used_as_loop_entry; /* first and last insn idx of this verifier state */ u32 first_insn_idx; u32 last_insn_idx; /* If this state is a part of states loop this field points to some * parent of this state such that: * - it is also a member of the same states loop; * - DFS states traversal starting from initial state visits loop_entry * state before this state. * Used to compute topmost loop entry for state loops. * State loops might appear because of open coded iterators logic. * See get_loop_entry() for more information. */ struct bpf_verifier_state *loop_entry; /* jmp history recorded from first to last. * backtracking is using it to go from last to first. * For most states jmp_history_cnt is [0-3]. * For loops can go up to ~40. */ struct bpf_jmp_history_entry *jmp_history; u32 jmp_history_cnt; u32 dfs_depth; u32 callback_unroll_depth; }; #define bpf_get_spilled_reg(slot, frame, mask) \ (((slot < frame->allocated_stack / BPF_REG_SIZE) && \ ((1 << frame->stack[slot].slot_type[0]) & (mask))) \ ? &frame->stack[slot].spilled_ptr : NULL) /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */ #define bpf_for_each_spilled_reg(iter, frame, reg, mask) \ for (iter = 0, reg = bpf_get_spilled_reg(iter, frame, mask); \ iter < frame->allocated_stack / BPF_REG_SIZE; \ iter++, reg = bpf_get_spilled_reg(iter, frame, mask)) #define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr) \ ({ \ struct bpf_verifier_state *___vstate = __vst; \ int ___i, ___j; \ for (___i = 0; ___i <= ___vstate->curframe; ___i++) { \ struct bpf_reg_state *___regs; \ __state = ___vstate->frame[___i]; \ ___regs = __state->regs; \ for (___j = 0; ___j < MAX_BPF_REG; ___j++) { \ __reg = &___regs[___j]; \ (void)(__expr); \ } \ bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \ if (!__reg) \ continue; \ (void)(__expr); \ } \ } \ }) /* Invoke __expr over regsiters in __vst, setting __state and __reg */ #define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr) /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; struct bpf_verifier_state_list *next; int miss_cnt, hit_cnt; }; struct bpf_loop_inline_state { unsigned int initialized:1; /* set to true upon first entry */ unsigned int fit_for_inline:1; /* true if callback function is the same * at each call and flags are always zero */ u32 callback_subprogno; /* valid when fit_for_inline is true */ }; /* Possible states for alu_state member. */ #define BPF_ALU_SANITIZE_SRC (1U << 0) #define BPF_ALU_SANITIZE_DST (1U << 1) #define BPF_ALU_NEG_VALUE (1U << 2) #define BPF_ALU_NON_POINTER (1U << 3) #define BPF_ALU_IMMEDIATE (1U << 4) #define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ BPF_ALU_SANITIZE_DST) struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ unsigned long map_ptr_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ struct { u32 map_index; /* index into used_maps[] */ u32 map_off; /* offset from value base address */ }; struct { enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ union { struct { struct btf *btf; u32 btf_id; /* btf_id for struct typed var */ }; u32 mem_size; /* mem_size for non-struct typed var */ }; } btf_var; /* if instruction is a call to bpf_loop this field tracks * the state of the relevant registers to make decision about inlining */ struct bpf_loop_inline_state loop_inline_state; }; union { /* remember the size of type passed to bpf_obj_new to rewrite R1 */ u64 obj_new_size; /* remember the offset of node field within type to rewrite */ u64 insert_off; }; struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ bool jmp_point; bool prune_point; /* ensure we check state equivalence and save state checkpoint and * this instruction, regardless of any heuristics */ bool force_checkpoint; /* true if instruction is a call to a helper function that * accepts callback function as a parameter. */ bool calls_callback; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ #define MAX_USED_BTFS 64 /* max number of BTFs accessed by one BPF program */ #define BPF_VERIFIER_TMP_LOG_SIZE 1024 struct bpf_verifier_log { /* Logical start and end positions of a "log window" of the verifier log. * start_pos == 0 means we haven't truncated anything. * Once truncation starts to happen, start_pos + len_total == end_pos, * except during log reset situations, in which (end_pos - start_pos) * might get smaller than len_total (see bpf_vlog_reset()). * Generally, (end_pos - start_pos) gives number of useful data in * user log buffer. */ u64 start_pos; u64 end_pos; char __user *ubuf; u32 level; u32 len_total; u32 len_max; char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; }; #define BPF_LOG_LEVEL1 1 #define BPF_LOG_LEVEL2 2 #define BPF_LOG_STATS 4 #define BPF_LOG_FIXED 8 #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) #define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS | BPF_LOG_FIXED) #define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log && log->level; } #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_arg_info { enum bpf_arg_type arg_type; union { u32 mem_size; }; }; struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ bool has_tail_call: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; bool is_cb: 1; bool is_async_cb: 1; bool is_exception_cb: 1; bool args_cached: 1; u8 arg_cnt; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; }; struct bpf_verifier_env; struct backtrack_state { struct bpf_verifier_env *env; u32 frame; u32 reg_masks[MAX_CALL_FRAMES]; u64 stack_masks[MAX_CALL_FRAMES]; }; struct bpf_id_pair { u32 old; u32 cur; }; struct bpf_idmap { u32 tmp_id_gen; struct bpf_id_pair map[BPF_ID_MAP_SIZE]; }; struct bpf_idset { u32 count; u32 ids[BPF_ID_MAP_SIZE]; }; /* single container for all structs * one verifier_env per bpf_check() call */ struct bpf_verifier_env { u32 insn_idx; u32 prev_insn_idx; struct bpf_prog *prog; /* eBPF program being verified */ const struct bpf_verifier_ops *ops; struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ bool test_state_freq; /* test verifier with different pruning frequency */ bool test_reg_invariants; /* fail verification on register invariants violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */ u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ u32 hidden_subprog_cnt; /* number of hidden subprogs */ int exception_callback_subprog; bool explore_alu_limits; bool allow_ptr_leaks; /* Allow access to uninitialized stack memory. Writes with fixed offset are * always allowed, so this refers to reads (with fixed or variable offset), * to writes with variable offset and to indirect (helper) accesses. */ bool allow_uninit_stack; bool bpf_capable; bool bypass_spec_v1; bool bypass_spec_v4; bool seen_direct_write; bool seen_exception; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 2]; /* max + 2 for the fake and exception subprogs */ union { struct bpf_idmap idmap_scratch; struct bpf_idset idset_scratch; }; struct { int *insn_state; int *insn_stack; int cur_stack; } cfg; struct backtrack_state bt; struct bpf_jmp_history_entry *cur_hist_ent; u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ u32 prev_insn_processed, insn_processed; /* number of jmps, calls, exits analyzed so far */ u32 prev_jmps_processed, jmps_processed; /* total verification time */ u64 verification_time; /* maximum number of verifier states kept in 'branching' instructions */ u32 max_states_per_insn; /* total number of allocated verifier states */ u32 total_states; /* some states are freed during program analysis. * this is peak number of states. this number dominates kernel * memory consumption during verification */ u32 peak_states; /* longest register parentage chain walked for liveness marking */ u32 longest_mark_read_walk; bpfptr_t fd_array; /* bit mask to keep track of whether a register has been accessed * since the last time the function state was printed */ u32 scratched_regs; /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; u64 prev_log_pos, prev_insn_print_pos; /* buffer used to generate temporary string representations, * e.g., in reg_type_str() to generate reg_type string */ char tmp_str_buf[TMP_STR_BUF_LEN]; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) { return &env->prog->aux->func_info_aux[subprog]; } static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) { return &env->subprog_info[subprog]; } __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, char __user *log_buf, u32 log_size); void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual); __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, u32 insn_off, const char *prefix_fmt, ...); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; return cur->frame[cur->curframe]; } static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) { return cur_func(env)->regs; } int bpf_prog_offload_verifier_prep(struct bpf_prog *prog); int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int bpf_prog_offload_finalize(struct bpf_verifier_env *env); void bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, struct bpf_insn *insn); void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, struct btf *btf, u32 btf_id) { if (tgt_prog) return ((u64)tgt_prog->aux->id << 32) | btf_id; else return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } /* unpack the IDs from the key as constructed above */ static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id) { if (obj_id) *obj_id = key >> 32; if (btf_id) *btf_id = key & 0x7FFFFFFF; } int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, u32 btf_id, struct bpf_attach_target_info *tgt_info); void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); int mark_chain_precision(struct bpf_verifier_env *env, int regno); #define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) /* extract base type from bpf_{arg, return, reg}_type. */ static inline u32 base_type(u32 type) { return type & BPF_BASE_TYPE_MASK; } /* extract flags from an extended type. See bpf_type_flag in bpf.h. */ static inline u32 type_flag(u32 type) { return type & ~BPF_BASE_TYPE_MASK; } /* only use after check_attach_btf_id() */ static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_EXT ? prog->aux->dst_prog->type : prog->type; } static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) { switch (resolve_prog_type(prog)) { case BPF_PROG_TYPE_TRACING: return prog->expected_attach_type != BPF_TRACE_ITER; case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_LSM: return false; default: return true; } } #define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED | NON_OWN_REF) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; } static inline bool type_is_ptr_alloc_obj(u32 type) { return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; } static inline bool type_is_non_owning_ref(u32 type) { return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; } static inline bool type_is_pkt_pointer(enum bpf_reg_type type) { type = base_type(type); return type == PTR_TO_PACKET || type == PTR_TO_PACKET_META; } static inline bool type_is_sk_pointer(enum bpf_reg_type type) { return type == PTR_TO_SOCKET || type == PTR_TO_SOCK_COMMON || type == PTR_TO_TCP_SOCK || type == PTR_TO_XDP_SOCK; } static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) { env->scratched_regs |= 1U << regno; } static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) { env->scratched_stack_slots |= 1ULL << spi; } static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) { return (env->scratched_regs >> regno) & 1; } static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) { return (env->scratched_stack_slots >> regno) & 1; } static inline bool verifier_state_scratched(const struct bpf_verifier_env *env) { return env->scratched_regs || env->scratched_stack_slots; } static inline void mark_verifier_state_clean(struct bpf_verifier_env *env) { env->scratched_regs = 0U; env->scratched_stack_slots = 0ULL; } /* Used for printing the entire verifier state. */ static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env) { env->scratched_regs = ~0U; env->scratched_stack_slots = ~0ULL; } const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); const char *dynptr_type_str(enum bpf_dynptr_type type); const char *iter_type_str(const struct btf *btf, u32 btf_id); const char *iter_state_str(enum bpf_iter_state state); void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, bool print_all); void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state); #endif /* _LINUX_BPF_VERIFIER_H */ |
1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 | // SPDX-License-Identifier: GPL-2.0 /* * net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption * * Copyright (c) 2019, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <crypto/aead.h> #include <crypto/aes.h> #include <crypto/rng.h> #include "crypto.h" #include "msg.h" #include "bcast.h" #define TIPC_TX_GRACE_PERIOD msecs_to_jiffies(5000) /* 5s */ #define TIPC_TX_LASTING_TIME msecs_to_jiffies(10000) /* 10s */ #define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */ #define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(15000) /* 15s */ #define TIPC_MAX_TFMS_DEF 10 #define TIPC_MAX_TFMS_LIM 1000 #define TIPC_REKEYING_INTV_DEF (60 * 24) /* default: 1 day */ /* * TIPC Key ids */ enum { KEY_MASTER = 0, KEY_MIN = KEY_MASTER, KEY_1 = 1, KEY_2, KEY_3, KEY_MAX = KEY_3, }; /* * TIPC Crypto statistics */ enum { STAT_OK, STAT_NOK, STAT_ASYNC, STAT_ASYNC_OK, STAT_ASYNC_NOK, STAT_BADKEYS, /* tx only */ STAT_BADMSGS = STAT_BADKEYS, /* rx only */ STAT_NOKEYS, STAT_SWITCHES, MAX_STATS, }; /* TIPC crypto statistics' header */ static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok", "async_nok", "badmsgs", "nokeys", "switches"}; /* Max TFMs number per key */ int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF; /* Key exchange switch, default: on */ int sysctl_tipc_key_exchange_enabled __read_mostly = 1; /* * struct tipc_key - TIPC keys' status indicator * * 7 6 5 4 3 2 1 0 * +-----+-----+-----+-----+-----+-----+-----+-----+ * key: | (reserved)|passive idx| active idx|pending idx| * +-----+-----+-----+-----+-----+-----+-----+-----+ */ struct tipc_key { #define KEY_BITS (2) #define KEY_MASK ((1 << KEY_BITS) - 1) union { struct { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 pending:2, active:2, passive:2, /* rx only */ reserved:2; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved:2, passive:2, /* rx only */ active:2, pending:2; #else #error "Please fix <asm/byteorder.h>" #endif } __packed; u8 keys; }; }; /** * struct tipc_tfm - TIPC TFM structure to form a list of TFMs * @tfm: cipher handle/key * @list: linked list of TFMs */ struct tipc_tfm { struct crypto_aead *tfm; struct list_head list; }; /** * struct tipc_aead - TIPC AEAD key structure * @tfm_entry: per-cpu pointer to one entry in TFM list * @crypto: TIPC crypto owns this key * @cloned: reference to the source key in case cloning * @users: the number of the key users (TX/RX) * @salt: the key's SALT value * @authsize: authentication tag size (max = 16) * @mode: crypto mode is applied to the key * @hint: a hint for user key * @rcu: struct rcu_head * @key: the aead key * @gen: the key's generation * @seqno: the key seqno (cluster scope) * @refcnt: the key reference counter */ struct tipc_aead { #define TIPC_AEAD_HINT_LEN (5) struct tipc_tfm * __percpu *tfm_entry; struct tipc_crypto *crypto; struct tipc_aead *cloned; atomic_t users; u32 salt; u8 authsize; u8 mode; char hint[2 * TIPC_AEAD_HINT_LEN + 1]; struct rcu_head rcu; struct tipc_aead_key *key; u16 gen; atomic64_t seqno ____cacheline_aligned; refcount_t refcnt ____cacheline_aligned; } ____cacheline_aligned; /** * struct tipc_crypto_stats - TIPC Crypto statistics * @stat: array of crypto statistics */ struct tipc_crypto_stats { unsigned int stat[MAX_STATS]; }; /** * struct tipc_crypto - TIPC TX/RX crypto structure * @net: struct net * @node: TIPC node (RX) * @aead: array of pointers to AEAD keys for encryption/decryption * @peer_rx_active: replicated peer RX active key index * @key_gen: TX/RX key generation * @key: the key states * @skey_mode: session key's mode * @skey: received session key * @wq: common workqueue on TX crypto * @work: delayed work sched for TX/RX * @key_distr: key distributing state * @rekeying_intv: rekeying interval (in minutes) * @stats: the crypto statistics * @name: the crypto name * @sndnxt: the per-peer sndnxt (TX) * @timer1: general timer 1 (jiffies) * @timer2: general timer 2 (jiffies) * @working: the crypto is working or not * @key_master: flag indicates if master key exists * @legacy_user: flag indicates if a peer joins w/o master key (for bwd comp.) * @nokey: no key indication * @flags: combined flags field * @lock: tipc_key lock */ struct tipc_crypto { struct net *net; struct tipc_node *node; struct tipc_aead __rcu *aead[KEY_MAX + 1]; atomic_t peer_rx_active; u16 key_gen; struct tipc_key key; u8 skey_mode; struct tipc_aead_key *skey; struct workqueue_struct *wq; struct delayed_work work; #define KEY_DISTR_SCHED 1 #define KEY_DISTR_COMPL 2 atomic_t key_distr; u32 rekeying_intv; struct tipc_crypto_stats __percpu *stats; char name[48]; atomic64_t sndnxt ____cacheline_aligned; unsigned long timer1; unsigned long timer2; union { struct { u8 working:1; u8 key_master:1; u8 legacy_user:1; u8 nokey: 1; }; u8 flags; }; spinlock_t lock; /* crypto lock */ } ____cacheline_aligned; /* struct tipc_crypto_tx_ctx - TX context for callbacks */ struct tipc_crypto_tx_ctx { struct tipc_aead *aead; struct tipc_bearer *bearer; struct tipc_media_addr dst; }; /* struct tipc_crypto_rx_ctx - RX context for callbacks */ struct tipc_crypto_rx_ctx { struct tipc_aead *aead; struct tipc_bearer *bearer; }; static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead); static inline void tipc_aead_put(struct tipc_aead *aead); static void tipc_aead_free(struct rcu_head *rp); static int tipc_aead_users(struct tipc_aead __rcu *aead); static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim); static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim); static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val); static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead); static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, u8 mode); static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src); static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, unsigned int crypto_ctx_size, u8 **iv, struct aead_request **req, struct scatterlist **sg, int nsg); static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode); static void tipc_aead_encrypt_done(void *data, int err); static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b); static void tipc_aead_decrypt_done(void *data, int err); static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr); static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, u8 tx_key, struct sk_buff *skb, struct tipc_crypto *__rx); static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, u8 new_passive, u8 new_active, u8 new_pending); static int tipc_crypto_key_attach(struct tipc_crypto *c, struct tipc_aead *aead, u8 pos, bool master_key); static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending); static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, struct sk_buff *skb, u8 tx_key); static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb); static int tipc_crypto_key_revoke(struct net *net, u8 tx_key); static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode, u8 type); static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, struct tipc_bearer *b, struct sk_buff **skb, int err); static void tipc_crypto_do_cmd(struct net *net, int cmd); static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf); static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, char *buf); static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey, u16 gen, u8 mode, u32 dnode); static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr); static void tipc_crypto_work_tx(struct work_struct *work); static void tipc_crypto_work_rx(struct work_struct *work); static int tipc_aead_key_generate(struct tipc_aead_key *skey); #define is_tx(crypto) (!(crypto)->node) #define is_rx(crypto) (!is_tx(crypto)) #define key_next(cur) ((cur) % KEY_MAX + 1) #define tipc_aead_rcu_ptr(rcu_ptr, lock) \ rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ do { \ struct tipc_aead *__tmp = rcu_dereference_protected((rcu_ptr), \ lockdep_is_held(lock)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ tipc_aead_put(__tmp); \ } while (0) #define tipc_crypto_key_detach(rcu_ptr, lock) \ tipc_aead_rcu_replace((rcu_ptr), NULL, lock) /** * tipc_aead_key_validate - Validate a AEAD user key * @ukey: pointer to user key data * @info: netlink info pointer */ int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info) { int keylen; /* Check if algorithm exists */ if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) { GENL_SET_ERR_MSG(info, "unable to load the algorithm (module existed?)"); return -ENODEV; } /* Currently, we only support the "gcm(aes)" cipher algorithm */ if (strcmp(ukey->alg_name, "gcm(aes)")) { GENL_SET_ERR_MSG(info, "not supported yet the algorithm"); return -ENOTSUPP; } /* Check if key size is correct */ keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 && keylen != TIPC_AES_GCM_KEY_SIZE_192 && keylen != TIPC_AES_GCM_KEY_SIZE_256)) { GENL_SET_ERR_MSG(info, "incorrect key length (20, 28 or 36 octets?)"); return -EKEYREJECTED; } return 0; } /** * tipc_aead_key_generate - Generate new session key * @skey: input/output key with new content * * Return: 0 in case of success, otherwise < 0 */ static int tipc_aead_key_generate(struct tipc_aead_key *skey) { int rc = 0; /* Fill the key's content with a random value via RNG cipher */ rc = crypto_get_default_rng(); if (likely(!rc)) { rc = crypto_rng_get_bytes(crypto_default_rng, skey->key, skey->keylen); crypto_put_default_rng(); } return rc; } static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (unlikely(!tmp || !refcount_inc_not_zero(&tmp->refcnt))) tmp = NULL; rcu_read_unlock(); return tmp; } static inline void tipc_aead_put(struct tipc_aead *aead) { if (aead && refcount_dec_and_test(&aead->refcnt)) call_rcu(&aead->rcu, tipc_aead_free); } /** * tipc_aead_free - Release AEAD key incl. all the TFMs in the list * @rp: rcu head pointer */ static void tipc_aead_free(struct rcu_head *rp) { struct tipc_aead *aead = container_of(rp, struct tipc_aead, rcu); struct tipc_tfm *tfm_entry, *head, *tmp; if (aead->cloned) { tipc_aead_put(aead->cloned); } else { head = *get_cpu_ptr(aead->tfm_entry); put_cpu_ptr(aead->tfm_entry); list_for_each_entry_safe(tfm_entry, tmp, &head->list, list) { crypto_free_aead(tfm_entry->tfm); list_del(&tfm_entry->list); kfree(tfm_entry); } /* Free the head */ crypto_free_aead(head->tfm); list_del(&head->list); kfree(head); } free_percpu(aead->tfm_entry); kfree_sensitive(aead->key); kfree(aead); } static int tipc_aead_users(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; int users = 0; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) users = atomic_read(&tmp->users); rcu_read_unlock(); return users; } static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) atomic_add_unless(&tmp->users, 1, lim); rcu_read_unlock(); } static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) atomic_add_unless(&rcu_dereference(aead)->users, -1, lim); rcu_read_unlock(); } static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val) { struct tipc_aead *tmp; int cur; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) { do { cur = atomic_read(&tmp->users); if (cur == val) break; } while (atomic_cmpxchg(&tmp->users, cur, val) != cur); } rcu_read_unlock(); } /** * tipc_aead_tfm_next - Move TFM entry to the next one in list and return it * @aead: the AEAD key pointer */ static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead) { struct tipc_tfm **tfm_entry; struct crypto_aead *tfm; tfm_entry = get_cpu_ptr(aead->tfm_entry); *tfm_entry = list_next_entry(*tfm_entry, list); tfm = (*tfm_entry)->tfm; put_cpu_ptr(tfm_entry); return tfm; } /** * tipc_aead_init - Initiate TIPC AEAD * @aead: returned new TIPC AEAD key handle pointer * @ukey: pointer to user key data * @mode: the key mode * * Allocate a (list of) new cipher transformation (TFM) with the specific user * key data if valid. The number of the allocated TFMs can be set via the sysfs * "net/tipc/max_tfms" first. * Also, all the other AEAD data are also initialized. * * Return: 0 if the initiation is successful, otherwise: < 0 */ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, u8 mode) { struct tipc_tfm *tfm_entry, *head; struct crypto_aead *tfm; struct tipc_aead *tmp; int keylen, err, cpu; int tfm_cnt = 0; if (unlikely(*aead)) return -EEXIST; /* Allocate a new AEAD */ tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); if (unlikely(!tmp)) return -ENOMEM; /* The key consists of two parts: [AES-KEY][SALT] */ keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; /* Allocate per-cpu TFM entry pointer */ tmp->tfm_entry = alloc_percpu(struct tipc_tfm *); if (!tmp->tfm_entry) { kfree_sensitive(tmp); return -ENOMEM; } /* Make a list of TFMs with the user key data */ do { tfm = crypto_alloc_aead(ukey->alg_name, 0, 0); if (IS_ERR(tfm)) { err = PTR_ERR(tfm); break; } if (unlikely(!tfm_cnt && crypto_aead_ivsize(tfm) != TIPC_AES_GCM_IV_SIZE)) { crypto_free_aead(tfm); err = -ENOTSUPP; break; } err = crypto_aead_setauthsize(tfm, TIPC_AES_GCM_TAG_SIZE); err |= crypto_aead_setkey(tfm, ukey->key, keylen); if (unlikely(err)) { crypto_free_aead(tfm); break; } tfm_entry = kmalloc(sizeof(*tfm_entry), GFP_KERNEL); if (unlikely(!tfm_entry)) { crypto_free_aead(tfm); err = -ENOMEM; break; } INIT_LIST_HEAD(&tfm_entry->list); tfm_entry->tfm = tfm; /* First entry? */ if (!tfm_cnt) { head = tfm_entry; for_each_possible_cpu(cpu) { *per_cpu_ptr(tmp->tfm_entry, cpu) = head; } } else { list_add_tail(&tfm_entry->list, &head->list); } } while (++tfm_cnt < sysctl_tipc_max_tfms); /* Not any TFM is allocated? */ if (!tfm_cnt) { free_percpu(tmp->tfm_entry); kfree_sensitive(tmp); return err; } /* Form a hex string of some last bytes as the key's hint */ bin2hex(tmp->hint, ukey->key + keylen - TIPC_AEAD_HINT_LEN, TIPC_AEAD_HINT_LEN); /* Initialize the other data */ tmp->mode = mode; tmp->cloned = NULL; tmp->authsize = TIPC_AES_GCM_TAG_SIZE; tmp->key = kmemdup(ukey, tipc_aead_key_size(ukey), GFP_KERNEL); if (!tmp->key) { tipc_aead_free(&tmp->rcu); return -ENOMEM; } memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE); atomic_set(&tmp->users, 0); atomic64_set(&tmp->seqno, 0); refcount_set(&tmp->refcnt, 1); *aead = tmp; return 0; } /** * tipc_aead_clone - Clone a TIPC AEAD key * @dst: dest key for the cloning * @src: source key to clone from * * Make a "copy" of the source AEAD key data to the dest, the TFMs list is * common for the keys. * A reference to the source is hold in the "cloned" pointer for the later * freeing purposes. * * Note: this must be done in cluster-key mode only! * Return: 0 in case of success, otherwise < 0 */ static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src) { struct tipc_aead *aead; int cpu; if (!src) return -ENOKEY; if (src->mode != CLUSTER_KEY) return -EINVAL; if (unlikely(*dst)) return -EEXIST; aead = kzalloc(sizeof(*aead), GFP_ATOMIC); if (unlikely(!aead)) return -ENOMEM; aead->tfm_entry = alloc_percpu_gfp(struct tipc_tfm *, GFP_ATOMIC); if (unlikely(!aead->tfm_entry)) { kfree_sensitive(aead); return -ENOMEM; } for_each_possible_cpu(cpu) { *per_cpu_ptr(aead->tfm_entry, cpu) = *per_cpu_ptr(src->tfm_entry, cpu); } memcpy(aead->hint, src->hint, sizeof(src->hint)); aead->mode = src->mode; aead->salt = src->salt; aead->authsize = src->authsize; atomic_set(&aead->users, 0); atomic64_set(&aead->seqno, 0); refcount_set(&aead->refcnt, 1); WARN_ON(!refcount_inc_not_zero(&src->refcnt)); aead->cloned = src; *dst = aead; return 0; } /** * tipc_aead_mem_alloc - Allocate memory for AEAD request operations * @tfm: cipher handle to be registered with the request * @crypto_ctx_size: size of crypto context for callback * @iv: returned pointer to IV data * @req: returned pointer to AEAD request data * @sg: returned pointer to SG lists * @nsg: number of SG lists to be allocated * * Allocate memory to store the crypto context data, AEAD request, IV and SG * lists, the memory layout is as follows: * crypto_ctx || iv || aead_req || sg[] * * Return: the pointer to the memory areas in case of success, otherwise NULL */ static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, unsigned int crypto_ctx_size, u8 **iv, struct aead_request **req, struct scatterlist **sg, int nsg) { unsigned int iv_size, req_size; unsigned int len; u8 *mem; iv_size = crypto_aead_ivsize(tfm); req_size = sizeof(**req) + crypto_aead_reqsize(tfm); len = crypto_ctx_size; len += iv_size; len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); len = ALIGN(len, crypto_tfm_ctx_alignment()); len += req_size; len = ALIGN(len, __alignof__(struct scatterlist)); len += nsg * sizeof(**sg); mem = kmalloc(len, GFP_ATOMIC); if (!mem) return NULL; *iv = (u8 *)PTR_ALIGN(mem + crypto_ctx_size, crypto_aead_alignmask(tfm) + 1); *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, crypto_tfm_ctx_alignment()); *sg = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, __alignof__(struct scatterlist)); return (void *)mem; } /** * tipc_aead_encrypt - Encrypt a message * @aead: TIPC AEAD key for the message encryption * @skb: the input/output skb * @b: TIPC bearer where the message will be delivered after the encryption * @dst: the destination media address * @__dnode: TIPC dest node if "known" * * Return: * * 0 : if the encryption has completed * * -EINPROGRESS/-EBUSY : if a callback will be performed * * < 0 : the encryption has failed */ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode) { struct crypto_aead *tfm = tipc_aead_tfm_next(aead); struct tipc_crypto_tx_ctx *tx_ctx; struct aead_request *req; struct sk_buff *trailer; struct scatterlist *sg; struct tipc_ehdr *ehdr; int ehsz, len, tailen, nsg, rc; void *ctx; u32 salt; u8 *iv; /* Make sure message len at least 4-byte aligned */ len = ALIGN(skb->len, 4); tailen = len - skb->len + aead->authsize; /* Expand skb tail for authentication tag: * As for simplicity, we'd have made sure skb having enough tailroom * for authentication tag @skb allocation. Even when skb is nonlinear * but there is no frag_list, it should be still fine! * Otherwise, we must cow it to be a writable buffer with the tailroom. */ SKB_LINEAR_ASSERT(skb); if (tailen > skb_tailroom(skb)) { pr_debug("TX(): skb tailroom is not enough: %d, requires: %d\n", skb_tailroom(skb), tailen); } nsg = skb_cow_data(skb, tailen, &trailer); if (unlikely(nsg < 0)) { pr_err("TX: skb_cow_data() returned %d\n", nsg); return nsg; } pskb_put(skb, trailer, tailen); /* Allocate memory for the AEAD operation */ ctx = tipc_aead_mem_alloc(tfm, sizeof(*tx_ctx), &iv, &req, &sg, nsg); if (unlikely(!ctx)) return -ENOMEM; TIPC_SKB_CB(skb)->crypto_ctx = ctx; /* Map skb to the sg lists */ sg_init_table(sg, nsg); rc = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(rc < 0)) { pr_err("TX: skb_to_sgvec() returned %d, nsg %d!\n", rc, nsg); goto exit; } /* Prepare IV: [SALT (4 octets)][SEQNO (8 octets)] * In case we're in cluster-key mode, SALT is varied by xor-ing with * the source address (or w0 of id), otherwise with the dest address * if dest is known. */ ehdr = (struct tipc_ehdr *)skb->data; salt = aead->salt; if (aead->mode == CLUSTER_KEY) salt ^= __be32_to_cpu(ehdr->addr); else if (__dnode) salt ^= tipc_node_get_addr(__dnode); memcpy(iv, &salt, 4); memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); /* Prepare request */ ehsz = tipc_ehdr_size(ehdr); aead_request_set_tfm(req, tfm); aead_request_set_ad(req, ehsz); aead_request_set_crypt(req, sg, sg, len - ehsz, iv); /* Set callback function & data */ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, tipc_aead_encrypt_done, skb); tx_ctx = (struct tipc_crypto_tx_ctx *)ctx; tx_ctx->aead = aead; tx_ctx->bearer = b; memcpy(&tx_ctx->dst, dst, sizeof(*dst)); /* Hold bearer */ if (unlikely(!tipc_bearer_hold(b))) { rc = -ENODEV; goto exit; } /* Now, do encrypt */ rc = crypto_aead_encrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) return rc; tipc_bearer_put(b); exit: kfree(ctx); TIPC_SKB_CB(skb)->crypto_ctx = NULL; return rc; } static void tipc_aead_encrypt_done(void *data, int err) { struct sk_buff *skb = data; struct tipc_crypto_tx_ctx *tx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; struct tipc_bearer *b = tx_ctx->bearer; struct tipc_aead *aead = tx_ctx->aead; struct tipc_crypto *tx = aead->crypto; struct net *net = tx->net; switch (err) { case 0: this_cpu_inc(tx->stats->stat[STAT_ASYNC_OK]); rcu_read_lock(); if (likely(test_bit(0, &b->up))) b->media->send_msg(net, skb, b, &tx_ctx->dst); else kfree_skb(skb); rcu_read_unlock(); break; case -EINPROGRESS: return; default: this_cpu_inc(tx->stats->stat[STAT_ASYNC_NOK]); kfree_skb(skb); break; } kfree(tx_ctx); tipc_bearer_put(b); tipc_aead_put(aead); } /** * tipc_aead_decrypt - Decrypt an encrypted message * @net: struct net * @aead: TIPC AEAD for the message decryption * @skb: the input/output skb * @b: TIPC bearer where the message has been received * * Return: * * 0 : if the decryption has completed * * -EINPROGRESS/-EBUSY : if a callback will be performed * * < 0 : the decryption has failed */ static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b) { struct tipc_crypto_rx_ctx *rx_ctx; struct aead_request *req; struct crypto_aead *tfm; struct sk_buff *unused; struct scatterlist *sg; struct tipc_ehdr *ehdr; int ehsz, nsg, rc; void *ctx; u32 salt; u8 *iv; if (unlikely(!aead)) return -ENOKEY; nsg = skb_cow_data(skb, 0, &unused); if (unlikely(nsg < 0)) { pr_err("RX: skb_cow_data() returned %d\n", nsg); return nsg; } /* Allocate memory for the AEAD operation */ tfm = tipc_aead_tfm_next(aead); ctx = tipc_aead_mem_alloc(tfm, sizeof(*rx_ctx), &iv, &req, &sg, nsg); if (unlikely(!ctx)) return -ENOMEM; TIPC_SKB_CB(skb)->crypto_ctx = ctx; /* Map skb to the sg lists */ sg_init_table(sg, nsg); rc = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(rc < 0)) { pr_err("RX: skb_to_sgvec() returned %d, nsg %d\n", rc, nsg); goto exit; } /* Reconstruct IV: */ ehdr = (struct tipc_ehdr *)skb->data; salt = aead->salt; if (aead->mode == CLUSTER_KEY) salt ^= __be32_to_cpu(ehdr->addr); else if (ehdr->destined) salt ^= tipc_own_addr(net); memcpy(iv, &salt, 4); memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); /* Prepare request */ ehsz = tipc_ehdr_size(ehdr); aead_request_set_tfm(req, tfm); aead_request_set_ad(req, ehsz); aead_request_set_crypt(req, sg, sg, skb->len - ehsz, iv); /* Set callback function & data */ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, tipc_aead_decrypt_done, skb); rx_ctx = (struct tipc_crypto_rx_ctx *)ctx; rx_ctx->aead = aead; rx_ctx->bearer = b; /* Hold bearer */ if (unlikely(!tipc_bearer_hold(b))) { rc = -ENODEV; goto exit; } /* Now, do decrypt */ rc = crypto_aead_decrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) return rc; tipc_bearer_put(b); exit: kfree(ctx); TIPC_SKB_CB(skb)->crypto_ctx = NULL; return rc; } static void tipc_aead_decrypt_done(void *data, int err) { struct sk_buff *skb = data; struct tipc_crypto_rx_ctx *rx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; struct tipc_bearer *b = rx_ctx->bearer; struct tipc_aead *aead = rx_ctx->aead; struct tipc_crypto_stats __percpu *stats = aead->crypto->stats; struct net *net = aead->crypto->net; switch (err) { case 0: this_cpu_inc(stats->stat[STAT_ASYNC_OK]); break; case -EINPROGRESS: return; default: this_cpu_inc(stats->stat[STAT_ASYNC_NOK]); break; } kfree(rx_ctx); tipc_crypto_rcv_complete(net, aead, b, &skb, err); if (likely(skb)) { if (likely(test_bit(0, &b->up))) tipc_rcv(net, skb, b); else kfree_skb(skb); } tipc_bearer_put(b); } static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr) { return (ehdr->user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; } /** * tipc_ehdr_validate - Validate an encryption message * @skb: the message buffer * * Return: "true" if this is a valid encryption message, otherwise "false" */ bool tipc_ehdr_validate(struct sk_buff *skb) { struct tipc_ehdr *ehdr; int ehsz; if (unlikely(!pskb_may_pull(skb, EHDR_MIN_SIZE))) return false; ehdr = (struct tipc_ehdr *)skb->data; if (unlikely(ehdr->version != TIPC_EVERSION)) return false; ehsz = tipc_ehdr_size(ehdr); if (unlikely(!pskb_may_pull(skb, ehsz))) return false; if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE)) return false; return true; } /** * tipc_ehdr_build - Build TIPC encryption message header * @net: struct net * @aead: TX AEAD key to be used for the message encryption * @tx_key: key id used for the message encryption * @skb: input/output message skb * @__rx: RX crypto handle if dest is "known" * * Return: the header size if the building is successful, otherwise < 0 */ static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, u8 tx_key, struct sk_buff *skb, struct tipc_crypto *__rx) { struct tipc_msg *hdr = buf_msg(skb); struct tipc_ehdr *ehdr; u32 user = msg_user(hdr); u64 seqno; int ehsz; /* Make room for encryption header */ ehsz = (user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; WARN_ON(skb_headroom(skb) < ehsz); ehdr = (struct tipc_ehdr *)skb_push(skb, ehsz); /* Obtain a seqno first: * Use the key seqno (= cluster wise) if dest is unknown or we're in * cluster key mode, otherwise it's better for a per-peer seqno! */ if (!__rx || aead->mode == CLUSTER_KEY) seqno = atomic64_inc_return(&aead-> |