Total coverage: 181909 (15%)of 1230309
61 888 39 33 602 602 10061 10039 10072 10045 10056 10042 10048 68 68 68 68 68 68 68 68 68 68 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMZONE_H #define _LINUX_MMZONE_H #ifndef __ASSEMBLY__ #ifndef __GENERATING_BOUNDS_H #include <linux/spinlock.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/threads.h> #include <linux/numa.h> #include <linux/init.h> #include <linux/seqlock.h> #include <linux/nodemask.h> #include <linux/pageblock-flags.h> #include <linux/page-flags-layout.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/local_lock.h> #include <linux/zswap.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER #define MAX_PAGE_ORDER 10 #else #define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif #define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) #define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1) /* Defines the order for the number of pages that have a migrate type. */ #ifndef CONFIG_PAGE_BLOCK_MAX_ORDER #define PAGE_BLOCK_MAX_ORDER MAX_PAGE_ORDER #else #define PAGE_BLOCK_MAX_ORDER CONFIG_PAGE_BLOCK_MAX_ORDER #endif /* CONFIG_PAGE_BLOCK_MAX_ORDER */ /* * The MAX_PAGE_ORDER, which defines the max order of pages to be allocated * by the buddy allocator, has to be larger or equal to the PAGE_BLOCK_MAX_ORDER, * which defines the order for the number of pages that can have a migrate type */ #if (PAGE_BLOCK_MAX_ORDER > MAX_PAGE_ORDER) #error MAX_PAGE_ORDER must be >= PAGE_BLOCK_MAX_ORDER #endif /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should * coalesce naturally under reasonable reclaim pressure and those which * will not. */ #define PAGE_ALLOC_COSTLY_ORDER 3 enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way * ZONE_MOVABLE works. Only movable pages can be allocated * from MIGRATE_CMA pageblocks and page allocator never * implicitly change migration type of MIGRATE_CMA pageblock. * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. */ MIGRATE_CMA, __MIGRATE_TYPE_END = MIGRATE_CMA, #else __MIGRATE_TYPE_END = MIGRATE_HIGHATOMIC, #endif #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif MIGRATE_TYPES }; /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) /* * __dump_folio() in mm/debug.c passes a folio pointer to on-stack struct folio, * so folio_pfn() cannot be used and pfn is needed. */ # define is_migrate_cma_folio(folio, pfn) \ (get_pfnblock_migratetype(&folio->page, pfn) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false # define is_migrate_cma_folio(folio, pfn) false #endif static inline bool is_migrate_movable(int mt) { return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; } /* * Check whether a migratetype can be merged with another migratetype. * * It is only mergeable when it can fall back to other migratetypes for * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c. */ static inline bool migratetype_is_mergeable(int mt) { return mt < MIGRATE_PCPTYPES; } #define for_each_migratetype_order(order, type) \ for (order = 0; order < NR_PAGE_ORDERS; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; #define get_pageblock_migratetype(page) \ get_pfnblock_migratetype(page, page_to_pfn(page)) #define folio_migratetype(folio) \ get_pageblock_migratetype(&folio->page) struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; }; struct pglist_data; #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ NUMA_FOREIGN, /* was intended here, hit elsewhere */ NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ NR_VM_NUMA_EVENT_ITEMS }; #else #define NR_VM_NUMA_EVENT_ITEMS 0 #endif enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_FREE_PAGES_BLOCKS, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ /* Second 128 byte cacheline */ #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif NR_FREE_CMA_PAGES, #ifdef CONFIG_UNACCEPTED_MEMORY NR_UNACCEPTED, #endif NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_FILE, WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_FILE, WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_FILE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ NR_THROTTLED_WRITTEN, /* NR_WRITTEN while reclaim throttled */ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ NR_SECONDARY_PAGETABLE, /* secondary pagetables, KVM & IOMMU */ #ifdef CONFIG_IOMMU_SUPPORT NR_IOMMU_PAGES, /* # of pages allocated by IOMMU */ #endif #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ /** * Candidate pages for promotion based on hint fault latency. This * counter is used to control the promotion rate and adjust the hot * threshold. */ PGPROMOTE_CANDIDATE, /** * Not rate-limited (NRL) candidate pages for those can be promoted * without considering hot threshold because of enough free pages in * fast-tier node. These promotions bypass the regular hotness checks * and do NOT influence the promotion rate-limiter or * threshold-adjustment logic. * This is for statistics/monitoring purposes. */ PGPROMOTE_CANDIDATE_NRL, #endif /* PGDEMOTE_*: pages demoted */ PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, PGDEMOTE_PROACTIVE, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif NR_BALLOON_PAGES, NR_KERNEL_FILE_PAGES, NR_VM_NODE_STAT_ITEMS }; /* * Returns true if the item should be printed in THPs (/proc/vmstat * currently prints number of anon, file and shmem THPs. But the item * is charged in pages). */ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return false; return item == NR_ANON_THPS || item == NR_FILE_THPS || item == NR_SHMEM_THPS || item == NR_SHMEM_PMDMAPPED || item == NR_FILE_PMDMAPPED; } /* * Returns true if the value is measured in bytes (most vmstat values are * measured in pages). This defines the API part, the internal representation * might be different. */ static __always_inline bool vmstat_item_in_bytes(int idx) { /* * Global and per-node slab counters track slab pages. * It's expected that changes are multiples of PAGE_SIZE. * Internally values are stored in pages. * * Per-memcg and per-lruvec counters track memory, consumed * by individual slab objects. These counters are actually * byte-precise. */ return (idx == NR_SLAB_RECLAIMABLE_B || idx == NR_SLAB_UNRECLAIMABLE_B); } /* * We do arithmetic on the LRU lists in various places in the code, * so it is important to keep the active lists LRU_ACTIVE higher in * the array than the corresponding inactive lists, and to keep * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. * * This has to be kept in sync with the statistics in zone_stat_item * above and the descriptions in vmstat_text in mm/vmstat.c */ #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, LRU_UNEVICTABLE, NR_LRU_LISTS }; enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, VMSCAN_THROTTLE_ISOLATED, VMSCAN_THROTTLE_NOPROGRESS, VMSCAN_THROTTLE_CONGESTED, NR_VMSCAN_THROTTLE, }; #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } #define WORKINGSET_ANON 0 #define WORKINGSET_FILE 1 #define ANON_AND_FILE 2 enum lruvec_flags { /* * An lruvec has many dirty pages backed by a congested BDI: * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim. * It can be cleared by cgroup reclaim or kswapd. * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim. * It can only be cleared by kswapd. * * Essentially, kswapd can unthrottle an lruvec throttled by cgroup * reclaim, but not vice versa. This only applies to the root cgroup. * The goal is to prevent cgroup reclaim on the root cgroup (e.g. * memory.reclaim) to unthrottle an unbalanced node (that was throttled * by kswapd). */ LRUVEC_CGROUP_CONGESTED, LRUVEC_NODE_CONGESTED, }; #endif /* !__GENERATING_BOUNDS_H */ /* * Evictable folios are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while * a folio is on one of lrugen->folios[]. Otherwise it stores 0. * * After a folio is faulted in, the aging needs to check the accessed bit at * least twice before handing this folio over to the eviction. The first check * clears the accessed bit from the initial fault; the second check makes sure * this folio hasn't been used since then. This process, AKA second chance, * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two * generations are considered active; the rest of generations, if they exist, * are considered inactive. See lru_gen_is_active(). * * PG_active is always cleared while a folio is on one of lrugen->folios[] so * that the sliding window needs not to worry about it. And it's set again when * a folio considered active is isolated for non-reclaiming purposes, e.g., * migration. See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits * in folio->flags, masked by LRU_GEN_MASK. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* * Each generation is divided into multiple tiers. A folio accessed N times * through file descriptors is in tier order_base_2(N). A folio in the first * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by * PG_workingset. A folio in any other tier (1<N<5) between the first and last * is marked by additional bits of LRU_REFS_WIDTH in folio->flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, * comparisons of refaulted/(evicted+protected) from the first tier and the rest * infer whether folios accessed multiple times through file descriptors are * statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in * folio->flags, masked by LRU_REFS_MASK. */ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) /* * For folios accessed multiple times through file descriptors, * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily * promoted into the second oldest generation in the eviction path. And when * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is * only valid when PG_referenced is set. * * For folios accessed multiple times through page tables, folio_update_gen() * from a page table walk or lru_gen_set_refs() from a rmap walk sets * PG_referenced after the accessed bit is cleared for the first time. * Thereafter, those two paths set PG_workingset and promote folios to the * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears * PG_referenced. Note that for this case, LRU_REFS_MASK is not used. * * For both cases above, after PG_workingset is set on a folio, it remains until * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It * can be set again if lru_gen_test_recent() returns true upon a refault. */ #define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced)) struct lruvec; struct page_vma_mapped_walk; #ifdef CONFIG_LRU_GEN enum { LRU_GEN_ANON, LRU_GEN_FILE, }; enum { LRU_GEN_CORE, LRU_GEN_MM_WALK, LRU_GEN_NONLEAF_YOUNG, NR_LRU_GEN_CAPS }; #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) /* whether to keep historical stats from evicted generations */ #ifdef CONFIG_LRU_GEN_STATS #define NR_HIST_GENS MAX_NR_GENS #else #define NR_HIST_GENS 1U #endif /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. The oldest generation numbers are * stored in min_seq[] separately for anon and file types so that they can be * incremented independently. Ideally min_seq[] are kept in sync when both anon * and file types are evictable. However, to adapt to situations like extreme * swappiness, they are allowed to be out of sync by at most * MAX_NR_GENS-MIN_NR_GENS-1. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. */ struct lru_gen_folio { /* the aging increments the youngest generation number */ unsigned long max_seq; /* the eviction increments the oldest generation numbers */ unsigned long min_seq[ANON_AND_FILE]; /* the birth time of each generation in jiffies */ unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the exponential moving average of refaulted */ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; /* can only be modified under the LRU lock */ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; /* the memcg generation this lru_gen_folio belongs to */ u8 gen; /* the list segment this lru_gen_folio belongs to */ u8 seg; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_node list; }; enum { MM_LEAF_TOTAL, /* total leaf entries */ MM_LEAF_YOUNG, /* young leaf entries */ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ NR_MM_STATS }; /* double-buffering Bloom filters */ #define NR_BLOOM_FILTERS 2 struct lru_gen_mm_state { /* synced with max_seq after each iteration */ unsigned long seq; /* where the current iteration continues after */ struct list_head *head; /* where the last iteration ended before */ struct list_head *tail; /* Bloom filters flip after each iteration */ unsigned long *filters[NR_BLOOM_FILTERS]; /* the mm stats for debugging */ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; }; struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; /* max_seq from lru_gen_folio: can be out of date */ unsigned long seq; /* the next address within an mm to scan */ unsigned long next_addr; /* to batch promoted pages */ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* to batch the mm stats */ int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; int swappiness; bool force_scan; }; /* * For each node, memcgs are divided into two generations: the old and the * young. For each generation, memcgs are randomly sharded into multiple bins * to improve scalability. For each bin, the hlist_nulls is virtually divided * into three segments: the head, the tail and the default. * * An onlining memcg is added to the tail of a random bin in the old generation. * The eviction starts at the head of a random bin in the old generation. The * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes * the old generation, is incremented when all its bins become empty. * * There are four operations: * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its * current generation (old or young) and updates its "seg" to "head"; * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its * current generation (old or young) and updates its "seg" to "tail"; * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old * generation, updates its "gen" to "old" and resets its "seg" to "default"; * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the * young generation, updates its "gen" to "young" and resets its "seg" to * "default". * * The events that trigger the above operations are: * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; * 2. The first attempt to reclaim a memcg below low, which triggers * MEMCG_LRU_TAIL; * 3. The first attempt to reclaim a memcg offlined or below reclaimable size * threshold, which triggers MEMCG_LRU_TAIL; * 4. The second attempt to reclaim a memcg offlined or below reclaimable size * threshold, which triggers MEMCG_LRU_YOUNG; * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG; * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD. * * Notes: * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing * of their max_seq counters ensures the eventual fairness to all eligible * memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). * 2. There are only two valid generations: old (seq) and young (seq+1). * MEMCG_NR_GENS is set to three so that when reading the generation counter * locklessly, a stale value (seq-1) does not wraparound to young. */ #define MEMCG_NR_GENS 3 #define MEMCG_NR_BINS 8 struct lru_gen_memcg { /* the per-node memcg generation counter */ unsigned long seq; /* each memcg has one lru_gen_folio per node */ unsigned long nr_memcgs[MEMCG_NR_GENS]; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; /* protects the above */ spinlock_t lock; }; void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_lruvec(struct lruvec *lruvec); bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); #else /* !CONFIG_LRU_GEN */ static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) { } static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { return false; } static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } #endif /* CONFIG_LRU_GEN */ struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ spinlock_t lru_lock; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU * increases, the reclaim scan balance tips toward the other. */ unsigned long anon_cost; unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_folio lrugen; #ifdef CONFIG_LRU_GEN_WALKS_MMU /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif #endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) /* Isolate unevictable pages */ #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) /* LRU Isolation modes. */ typedef unsigned __bitwise isolate_mode_t; enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, WMARK_PROMO, NR_WMARK }; /* * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists * are added for THP. One PCP list is used by GPF_MOVABLE, and the other PCP list * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_PCP_THP 2 #else #define NR_PCP_THP 0 #endif #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) /* * Flags used in pcp->flags field. * * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the * previous page freeing. To avoid to drain PCP for an accident * high-order page freeing. * * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before * draining PCP for consecutive high-order pages freeing without * allocation if data cache slice of CPU is large enough. To reduce * zone lock contention and keep cache-hot pages reusing. */ #define PCPF_PREV_FREE_HIGH_ORDER BIT(0) #define PCPF_FREE_HIGH_BATCH BIT(1) struct per_cpu_pages { spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int high_min; /* min high watermark */ int high_max; /* max high watermark */ int batch; /* chunk size for buddy add/remove */ u8 flags; /* protected by pcp->lock */ u8 alloc_factor; /* batch scaling factor during allocate */ #ifdef CONFIG_NUMA u8 expire; /* When 0, remote pagesets are drained */ #endif short free_count; /* consecutive free count */ /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[NR_PCP_LISTS]; } ____cacheline_aligned_in_smp; struct per_cpu_zonestat { #ifdef CONFIG_SMP s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; s8 stat_threshold; #endif #ifdef CONFIG_NUMA /* * Low priority inaccurate counters that are only folded * on demand. Use a large type to avoid the overhead of * folding during refresh_cpu_vm_stats. */ unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #endif }; struct per_cpu_nodestat { s8 stat_threshold; s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; }; #endif /* !__GENERATING_BOUNDS.H */ enum zone_type { /* * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able * to DMA to all of the addressable memory (ZONE_NORMAL). * On architectures where this area covers the whole 32 bit address * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller * DMA addressing constraints. This distinction is important as a 32bit * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 ZONE_DMA32, #endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support * transfers to all addressable memory. */ ZONE_NORMAL, #ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example * used by i386 to allow the kernel to address the memory beyond * 900MB. The kernel will set up special mappings (page * table entries on i386) for each page that the kernel needs to * access. */ ZONE_HIGHMEM, #endif /* * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains * movable pages with few exceptional cases described below. Main use * cases for ZONE_MOVABLE are to make memory offlining/unplug more * likely to succeed, and to locally limit unmovable allocations - e.g., * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might * essentially turn such pages unmovable. Therefore, we do not allow * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and * faulted, they come from the right zone right away. However, it is * still possible that address space already has pages in * ZONE_MOVABLE at the time when pages are pinned (i.e. user has * touches that memory before pinning). In such case we migrate them * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. * 3. Memory holes: kernelcore/movablecore setups might create very rare * situations where ZONE_MOVABLE contains memory holes after boot, * for example, if we have sections that are only partially * populated. Memory offlining and allocations fail early. * 4. PG_hwpoison pages: while poisoned pages can be skipped during * memory offlining, such pages cannot be allocated. * 5. Unmovable PG_offline pages: in paravirtualized environments, * hotplugged memory blocks might only partially be managed by the * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The * parts not manged by the buddy are unmovable PG_offline pages. In * some cases (virtio-mem), such pages can be skipped during * memory offlining, however, cannot be moved/allocated. These * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create * situations where ZERO_PAGE(0) which is allocated differently * on different platforms may end up in a movable zone. ZERO_PAGE(0) * cannot be migrated. * 7. Memory-hotplug: when using memmap_on_memory and onlining the * memory to the MOVABLE zone, the vmemmap pages are also placed in * such zone. Such pages cannot be really moved around as they are * self-stored in the range, but they are treated as movable when * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) * have to expect that migrating pages in ZONE_MOVABLE can fail (even * if has_unmovable_pages() states that there are no unmovable pages, * there can be false negatives). */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif __MAX_NR_ZONES }; #ifndef __GENERATING_BOUNDS_H #define ASYNC_AND_SYNC 2 struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; unsigned long watermark_boost; unsigned long nr_reserved_highatomic; unsigned long nr_free_highatomic; /* * We don't know if the memory that we're going to allocate will be * freeable or/and it will be released eventually, so to avoid totally * wasting several GB of ram we must reserve some of the lower zone * memory (otherwise we risk to run OOM on the lower zones despite * there being tons of freeable ram on the higher zones). This array is * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl * changes. */ long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA int node; #endif struct pglist_data *zone_pgdat; struct per_cpu_pages __percpu *per_cpu_pageset; struct per_cpu_zonestat __percpu *per_cpu_zonestats; /* * the high and batch values are copied to individual pagesets for * faster access */ int pageset_high_min; int pageset_high_max; int pageset_batch; #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. * In SPARSEMEM, this map is stored in struct mem_section */ unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; /* * spanned_pages is the total pages spanned by the zone, including * holes, which is calculated as: * spanned_pages = zone_end_pfn - zone_start_pfn; * * present_pages is physical pages existing within the zone, which * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * * present_early_pages is present pages existing within the zone * located on memory available since early boot, excluding hotplugged * memory. * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): * managed_pages = present_pages - reserved_pages; * * cma pages is present pages that are assigned for CMA use * (MIGRATE_CMA). * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used * by page allocator and vm scanner to calculate all kinds of watermarks * and thresholds. * * Locking rules: * * zone_start_pfn and spanned_pages are protected by span_seqlock. * It is a seqlock because it has to be read outside of zone->lock, * and it is done in the main allocator path. But, it is written * quite infrequently. * * The span_seq lock is declared along with zone->lock because it is * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by * mem_hotplug_begin/done(). Any reader who can't tolerant drift of * present_pages should use get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; #if defined(CONFIG_MEMORY_HOTPLUG) unsigned long present_early_pages; #endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif const char *name; #ifdef CONFIG_MEMORY_ISOLATION /* * Number of isolated pageblock. It is used to solve incorrect * freepage counting problem due to racy retrieving migratetype * of pageblock. Protected by zone->lock. */ unsigned long nr_isolate_pageblock; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; #endif int initialized; /* Write-intensive fields used from the page allocator */ CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ struct free_area free_area[NR_PAGE_ORDERS]; #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */ struct list_head unaccepted_pages; /* To be called once the last page in the zone is accepted */ struct work_struct unaccepted_cleanup; #endif /* zone flags, see below */ unsigned long flags; /* Primarily protects free_area */ spinlock_t lock; /* Pages to be freed when next trylock succeeds */ struct llist_head trylock_free_pages; /* Write-intensive fields used by compaction and vmstats. */ CACHELINE_PADDING(_pad2_); /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter * drift allowing watermarks to be breached */ unsigned long percpu_drift_mark; #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; /* pfn where compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION /* * On compaction failure, 1<<compact_defer_shift compactions * are skipped before trying again. The number attempted since * last failure is tracked with compact_considered. * compact_order_failed is the minimum compaction failed order. */ unsigned int compact_considered; unsigned int compact_defer_shift; int compact_order_failed; #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; #endif bool contiguous; CACHELINE_PADDING(_pad3_); /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; enum zone_flags { ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. * Cleared when kswapd is woken. */ ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */ ZONE_BELOW_HIGH, /* zone is below high watermark. */ }; static inline unsigned long wmark_pages(const struct zone *z, enum zone_watermarks w) { return z->_watermark[w] + z->watermark_boost; } static inline unsigned long min_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_MIN); } static inline unsigned long low_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_LOW); } static inline unsigned long high_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_HIGH); } static inline unsigned long promo_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_PROMO); } static inline unsigned long zone_managed_pages(const struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } static inline unsigned long zone_cma_pages(struct zone *zone) { #ifdef CONFIG_CMA return zone->cma_pages; #else return 0; #endif } static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; } static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) { return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } static inline bool zone_is_initialized(const struct zone *zone) { return zone->initialized; } static inline bool zone_is_empty(const struct zone *zone) { return zone->spanned_pages == 0; } #ifndef BUILD_VDSO32_64 /* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. */ /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) #define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) #define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) /* * Define the bit shifts to access each section. For non-existent * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) #define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) #define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF) ? \ SECTIONS_PGOFF : ZONES_PGOFF) #else #define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF) ? \ NODES_PGOFF : ZONES_PGOFF) #endif #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags) { ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT); return (flags.f >> ZONES_PGSHIFT) & ZONES_MASK; } static inline enum zone_type page_zonenum(const struct page *page) { return memdesc_zonenum(page->flags); } static inline enum zone_type folio_zonenum(const struct folio *folio) { return memdesc_zonenum(folio->flags); } #ifdef CONFIG_ZONE_DEVICE static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { return memdesc_zonenum(mdf) == ZONE_DEVICE; } static inline struct dev_pagemap *page_pgmap(const struct page *page) { VM_WARN_ON_ONCE_PAGE(!memdesc_is_zone_device(page->flags), page); return page_folio(page)->pgmap; } /* * Consecutive zone device pages should not be merged into the same sgl * or bvec segment with other types of pages or if they belong to different * pgmaps. Otherwise getting the pgmap of a given segment is not possible * without scanning the entire segment. This helper returns true either if * both pages are not zone device pages or both pages are zone device pages * with the same pgmap. */ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { if (memdesc_is_zone_device(a->flags) != memdesc_is_zone_device(b->flags)) return false; if (!memdesc_is_zone_device(a->flags)) return true; return page_pgmap(a) == page_pgmap(b); } extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { return false; } static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { return true; } static inline struct dev_pagemap *page_pgmap(const struct page *page) { return NULL; } #endif static inline bool is_zone_device_page(const struct page *page) { return memdesc_is_zone_device(page->flags); } static inline bool folio_is_zone_device(const struct folio *folio) { return memdesc_is_zone_device(folio->flags); } static inline bool is_zone_movable_page(const struct page *page) { return page_zonenum(page) == ZONE_MOVABLE; } static inline bool folio_is_zone_movable(const struct folio *folio) { return folio_zonenum(folio) == ZONE_MOVABLE; } #endif /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ static inline bool zone_intersects(const struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) return false; if (start_pfn >= zone_end_pfn(zone) || start_pfn + nr_pages <= zone->zone_start_pfn) return false; return true; } /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. */ #define DEF_PRIORITY 12 /* Maximum number of zones on a zonelist */ #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) enum { ZONELIST_FALLBACK, /* zonelist with fallback */ #ifdef CONFIG_NUMA /* * The NUMA zonelists are doubled because we need zonelists that * restrict the allocations to a single node for __GFP_THISNODE. */ ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */ #endif MAX_ZONELISTS }; /* * This struct contains information about a zone in a zonelist. It is stored * here to avoid dereferences into large structures and lookups of tables */ struct zoneref { struct zone *zone; /* Pointer to actual zone */ int zone_idx; /* zone_idx(zoneref->zone) */ }; /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the * allocation, the other zones are fallback zones, in decreasing * priority. * * To speed the reading of the zonelist, the zonerefs contain the zone index * of the entry being read. Helper functions to access information given * a struct zoneref are * * zonelist_zone() - Return the struct zone * for an entry in _zonerefs * zonelist_zone_idx() - Return the index of the zone for an entry * zonelist_node_idx() - Return the index of the node for an entry */ struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; /* * The array of struct pages for flatmem. * It must be declared for SPARSEMEM as well because there are configurations * that rely on that. */ extern struct page *mem_map; #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split { spinlock_t split_queue_lock; struct list_head split_queue; unsigned long split_queue_len; }; #endif #ifdef CONFIG_MEMORY_FAILURE /* * Per NUMA node memory failure handling statistics. */ struct memory_failure_stats { /* * Number of raw pages poisoned. * Cases not accounted: memory outside kernel control, offline page, * arch-specific memory_failure (SGX), hwpoison_filter() filtered * error events, and unpoison actions from hwpoison_unpoison. */ unsigned long total; /* * Recovery results of poisoned raw pages handled by memory_failure, * in sync with mf_result. * total = ignored + failed + delayed + recovered. * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. */ unsigned long ignored; unsigned long failed; unsigned long delayed; unsigned long recovered; }; #endif /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which * describes the whole memory. * * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ typedef struct pglist_data { /* * node_zones contains just the zones for THIS node. Not all of the * zones may be populated, but it is the full list. It is referenced by * this node's node_zonelists as well as other node's node_zonelists. */ struct zone node_zones[MAX_NR_ZONES]; /* * node_zonelists contains references to all zones in all nodes. * Generally the first zones will be references to this node's * node_zones. */ struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ #ifdef CONFIG_FLATMEM /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; #endif #endif #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. * Also synchronizes pgdat->first_deferred_pfn during deferred page * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; /* workqueues for throttling reclaim for different reasons. */ wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE]; atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ #ifdef CONFIG_MEMORY_HOTPLUG struct mutex kswapd_lock; #endif struct task_struct *kswapd; /* Protected by kswapd_lock */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; atomic_t kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available * to userspace allocations. */ unsigned long totalreserve_pages; #ifdef CONFIG_NUMA /* * node reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ CACHELINE_PADDING(_pad1_); #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif #ifdef CONFIG_NUMA_BALANCING /* start time in ms of current promote rate limit period */ unsigned int nbp_rl_start; /* number of promote candidate pages at start time of current rate limit period */ unsigned long nbp_rl_nr_cand; /* promote threshold in ms */ unsigned int nbp_threshold; /* start time in ms of current promote threshold adjustment period */ unsigned int nbp_th_start; /* * number of promote candidate pages at start time of current promote * threshold adjustment period */ unsigned long nbp_th_nr_cand; #endif /* Fields commonly accessed by the page reclaim scanner */ /* * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. * * Use mem_cgroup_lruvec() to look up lruvecs. */ struct lruvec __lruvec; unsigned long flags; #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ struct lru_gen_mm_walk mm_walk; /* lru_gen_folio list */ struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; #ifdef CONFIG_NUMA struct memory_tier __rcu *memtier; #endif #ifdef CONFIG_MEMORY_FAILURE struct memory_failure_stats mf_stats; #endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; } #include <linux/memory_hotplug.h> void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. */ enum meminit_context { MEMINIT_EARLY, MEMINIT_HOTPLUG, }; extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); extern void lruvec_init(struct lruvec *lruvec); static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) { #ifdef CONFIG_MEMCG return lruvec->pgdat; #else return container_of(lruvec, struct pglist_data, __lruvec); #endif } #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else static inline int local_memory_node(int node_id) { return node_id; }; #endif /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) #ifdef CONFIG_ZONE_DEVICE static inline bool zone_is_zone_device(const struct zone *zone) { return zone_idx(zone) == ZONE_DEVICE; } #else static inline bool zone_is_zone_device(const struct zone *zone) { return false; } #endif /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). */ static inline bool managed_zone(const struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ static inline bool populated_zone(const struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NUMA static inline int zone_to_nid(const struct zone *zone) { return zone->node; } static inline void zone_set_nid(struct zone *zone, int nid) { zone->node = nid; } #else static inline int zone_to_nid(const struct zone *zone) { return 0; } static inline void zone_set_nid(struct zone *zone, int nid) {} #endif extern int movable_zone; static inline int is_highmem_idx(enum zone_type idx) { #ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM || (idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM)); #else return 0; #endif } /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. * @zone: pointer to struct zone variable * Return: 1 for a highmem zone, 0 otherwise */ static inline int is_highmem(const struct zone *zone) { return is_highmem_idx(zone_idx(zone)); } #ifdef CONFIG_ZONE_DMA bool has_managed_dma(void); #else static inline bool has_managed_dma(void) { return false; } #endif #ifndef CONFIG_NUMA extern struct pglist_data contig_page_data; static inline struct pglist_data *NODE_DATA(int nid) { return &contig_page_data; } #else /* CONFIG_NUMA */ #include <asm/mmzone.h> #endif /* !CONFIG_NUMA */ extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); extern struct zone *next_zone(struct zone *zone); /** * for_each_online_pgdat - helper macro to iterate over all online nodes * @pgdat: pointer to a pg_data_t variable */ #define for_each_online_pgdat(pgdat) \ for (pgdat = first_online_pgdat(); \ pgdat; \ pgdat = next_online_pgdat(pgdat)) /** * for_each_zone - helper macro to iterate over all memory zones * @zone: pointer to struct zone variable * * The user only needs to declare the zone variable, for_each_zone * fills it in. */ #define for_each_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) #define for_each_populated_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) \ if (!populated_zone(zone)) \ ; /* do nothing */ \ else static inline struct zone *zonelist_zone(struct zoneref *zoneref) { return zoneref->zone; } static inline int zonelist_zone_idx(const struct zoneref *zoneref) { return zoneref->zone_idx; } static inline int zonelist_node_idx(const struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes); /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z: The cursor used as a starting point for the search * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the * search. The zoneref returned is a cursor that represents the current zone * being examined. It should be advanced by one before calling * next_zones_zonelist again. * * Return: the next zone at or below highest_zoneidx within the allowed * nodemask using a cursor within a zonelist as a starting point */ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) return z; return __next_zones_zonelist(z, highest_zoneidx, nodes); } /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist * @zonelist: The zonelist to search for a suitable zone * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be * used to iterate the zonelist with next_zones_zonelist by advancing it by * one before calling. * * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is * never NULL). This may happen either genuinely, or due to concurrent nodemask * update due to cpuset modification. * * Return: Zoneref pointer for the first suitable zone found */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, nodemask_t *nodes) { return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); } /** * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask * @zone: The current zone in the iterator * @z: The current pointer within zonelist->_zonerefs being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * @nodemask: Nodemask allowed by the allocator * * This iterator iterates though all zones at or below a given zone index and * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ for (zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index * @zone: The current zone in the iterator * @z: The current pointer within zonelist->zones being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * * This iterator iterates though all zones at or below a given zone index. */ #define for_each_zone_zonelist(zone, z, zlist, highidx) \ for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) /* Whether the 'nodes' are all movable nodes */ static inline bool movable_only_nodes(nodemask_t *nodes) { struct zonelist *zonelist; struct zoneref *z; int nid; if (nodes_empty(*nodes)) return false; /* * We can chose arbitrary node from the nodemask to get a * zonelist as they are interlinked. We just need to find * at least one zone that can satisfy kernel allocations. */ nid = first_node(*nodes); zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); return (!zonelist_zone(z)) ? true : false; } #ifdef CONFIG_SPARSEMEM #include <asm/sparsemem.h> #endif #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif #ifdef CONFIG_SPARSEMEM /* * PA_SECTION_SHIFT physical address to/from section number * PFN_SECTION_SHIFT pfn to/from section number */ #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) #if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS #error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) { return pfn >> PFN_SECTION_SHIFT; } static inline unsigned long section_nr_to_pfn(unsigned long sec) { return sec << PFN_SECTION_SHIFT; } #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) #define SUBSECTION_SHIFT 21 #define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT) #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT) #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT) #define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1)) #if SUBSECTION_SHIFT > SECTION_SIZE_BITS #error Subsection size exceeds section size #else #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT)) #endif #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { struct rcu_head rcu; #ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); #endif /* See declaration of similar field in struct zone */ unsigned long pageblock_flags[0]; }; void subsection_map_init(unsigned long pfn, unsigned long nr_pages); struct page; struct page_ext; struct mem_section { /* * This is, logically, a pointer to an array of struct * pages. However, it is stored with some other magic. * (see sparse.c::sparse_init_one_section()) * * Additionally during early boot we encode node id of * the location of the section here to guide allocation. * (see sparse.c::memory_present()) * * Making it a UL at least makes someone do a cast * before using it wrong. */ unsigned long section_mem_map; struct mem_section_usage *usage; #ifdef CONFIG_PAGE_EXTENSION /* * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use * section. (see page_ext.h about this.) */ struct page_ext *page_ext; unsigned long pad; #endif /* * WARNING: mem_section must be a power-of-2 in size for the * calculation and use of SECTION_ROOT_MASK to make sense. */ }; #ifdef CONFIG_SPARSEMEM_EXTREME #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) #else #define SECTIONS_PER_ROOT 1 #endif #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) #ifdef CONFIG_SPARSEMEM_EXTREME extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif static inline unsigned long *section_to_usemap(struct mem_section *ms) { return ms->usage->pageblock_flags; } static inline struct mem_section *__nr_to_section(unsigned long nr) { unsigned long root = SECTION_NR_TO_ROOT(nr); if (unlikely(root >= NR_SECTION_ROOTS)) return NULL; #ifdef CONFIG_SPARSEMEM_EXTREME if (!mem_section || !mem_section[root]) return NULL; #endif return &mem_section[root][nr & SECTION_ROOT_MASK]; } extern size_t mem_section_usage_size(void); /* * We use the lower bits of the mem_map pointer to store * a little bit of information. The pointer is calculated * as mem_map - section_nr_to_pfn(pnum). The result is * aligned to the minimum alignment of the two values: * 1. All mem_map arrays are page-aligned. * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT * lowest bits. PFN_SECTION_SHIFT is arch-specific * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the * worst combination is powerpc with 256k pages, * which results in PFN_SECTION_SHIFT equal 6. * To sum it up, at least 6 bits are available on all architectures. * However, we can exceed 6 bits on some other architectures except * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available * with the worst case of 64K pages on arm64) if we make sure the * exceeded bit is not applicable to powerpc. */ enum { SECTION_MARKED_PRESENT_BIT, SECTION_HAS_MEM_MAP_BIT, SECTION_IS_ONLINE_BIT, SECTION_IS_EARLY_BIT, #ifdef CONFIG_ZONE_DEVICE SECTION_TAINT_ZONE_DEVICE_BIT, #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT SECTION_IS_VMEMMAP_PREINIT_BIT, #endif SECTION_MAP_LAST_BIT, }; #define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT) #define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT) #define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT) #define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT) #ifdef CONFIG_ZONE_DEVICE #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT #define SECTION_IS_VMEMMAP_PREINIT BIT(SECTION_IS_VMEMMAP_PREINIT_BIT) #endif #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT static inline struct page *__section_mem_map_addr(struct mem_section *section) { unsigned long map = section->section_mem_map; map &= SECTION_MAP_MASK; return (struct page *)map; } static inline int present_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } static inline int present_section_nr(unsigned long nr) { return present_section(__nr_to_section(nr)); } static inline int valid_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } static inline int early_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } static inline int valid_section_nr(unsigned long nr) { return valid_section(__nr_to_section(nr)); } static inline int online_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } #ifdef CONFIG_ZONE_DEVICE static inline int online_device_section(const struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } #else static inline int online_device_section(const struct mem_section *section) { return 0; } #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT static inline int preinited_vmemmap_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); } void sparse_vmemmap_init_nid_early(int nid); void sparse_vmemmap_init_nid_late(int nid); #else static inline int preinited_vmemmap_section(const struct mem_section *section) { return 0; } static inline void sparse_vmemmap_init_nid_early(int nid) { } static inline void sparse_vmemmap_init_nid_late(int nid) { } #endif static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); } #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); } extern unsigned long __highest_present_section_nr; static inline int subsection_map_index(unsigned long pfn) { return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION; } #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); struct mem_section_usage *usage = READ_ONCE(ms->usage); return usage ? test_bit(idx, usage->subsection_map) : 0; } static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn) { struct mem_section_usage *usage = READ_ONCE(ms->usage); int idx = subsection_map_index(*pfn); unsigned long bit; if (!usage) return false; if (test_bit(idx, usage->subsection_map)) return true; /* Find the next subsection that exists */ bit = find_next_bit(usage->subsection_map, SUBSECTIONS_PER_SECTION, idx); if (bit == SUBSECTIONS_PER_SECTION) return false; *pfn = (*pfn & PAGE_SECTION_MASK) + (bit * PAGES_PER_SUBSECTION); return true; } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { return 1; } static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn) { return true; } #endif void sparse_init_early_section(int nid, struct page *map, unsigned long pnum, unsigned long flags); #ifndef CONFIG_HAVE_ARCH_PFN_VALID /** * pfn_valid - check if there is a valid memory map entry for a PFN * @pfn: the page frame number to check * * Check if there is a valid memory map entry aka struct page for the @pfn. * Note, that availability of the memory map entry does not imply that * there is actual usable memory at that @pfn. The struct page may * represent a hole or an unusable page frame. * * Return: 1 for PFNs that have memory map entries and 0 otherwise */ static inline int pfn_valid(unsigned long pfn) { struct mem_section *ms; int ret; /* * Ensure the upper PAGE_SHIFT bits are clear in the * pfn. Else it might lead to false positives when * some of the upper bits are set, but the lower bits * match a valid pfn. */ if (PHYS_PFN(PFN_PHYS(pfn)) != pfn) return 0; if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __pfn_to_section(pfn); rcu_read_lock_sched(); if (!valid_section(ms)) { rcu_read_unlock_sched(); return 0; } /* * Traditionally early sections always returned pfn_valid() for * the entire section-sized span. */ ret = early_section(ms) || pfn_section_valid(ms, pfn); rcu_read_unlock_sched(); return ret; } /* Returns end_pfn or higher if no valid PFN remaining in range */ static inline unsigned long first_valid_pfn(unsigned long pfn, unsigned long end_pfn) { unsigned long nr = pfn_to_section_nr(pfn); rcu_read_lock_sched(); while (nr <= __highest_present_section_nr && pfn < end_pfn) { struct mem_section *ms = __pfn_to_section(pfn); if (valid_section(ms) && (early_section(ms) || pfn_section_first_valid(ms, &pfn))) { rcu_read_unlock_sched(); return pfn; } /* Nothing left in this section? Skip to next section */ nr++; pfn = section_nr_to_pfn(nr); } rcu_read_unlock_sched(); return end_pfn; } static inline unsigned long next_valid_pfn(unsigned long pfn, unsigned long end_pfn) { pfn++; if (pfn >= end_pfn) return end_pfn; /* * Either every PFN within the section (or subsection for VMEMMAP) is * valid, or none of them are. So there's no point repeating the check * for every PFN; only call first_valid_pfn() again when crossing a * (sub)section boundary (i.e. !(pfn & ~PAGE_{SUB,}SECTION_MASK)). */ if (pfn & ~(IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) ? PAGE_SUBSECTION_MASK : PAGE_SECTION_MASK)) return pfn; return first_valid_pfn(pfn, end_pfn); } #define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \ for ((_pfn) = first_valid_pfn((_start_pfn), (_end_pfn)); \ (_pfn) < (_end_pfn); \ (_pfn) = next_valid_pfn((_pfn), (_end_pfn))) #endif static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; return present_section(__pfn_to_section(pfn)); } static inline unsigned long next_present_section_nr(unsigned long section_nr) { while (++section_nr <= __highest_present_section_nr) { if (present_section_nr(section_nr)) return section_nr; } return -1; } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start - 1); \ section_nr != -1; \ section_nr = next_present_section_nr(section_nr)) /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate * this restriction. */ #ifdef CONFIG_NUMA #define pfn_to_nid(pfn) \ ({ \ unsigned long __pfn_to_nid_pfn = (pfn); \ page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ }) #else #define pfn_to_nid(pfn) (0) #endif void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) #define sparse_vmemmap_init_nid_early(_nid, _use) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ /* * Fallback case for when the architecture provides its own pfn_valid() but * not a corresponding for_each_valid_pfn(). */ #ifndef for_each_valid_pfn #define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \ for ((_pfn) = (_start_pfn); (_pfn) < (_end_pfn); (_pfn)++) \ if (pfn_valid(_pfn)) #endif #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */
1055 40 888 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_FLOW_DISSECTOR_H #define _NET_FLOW_DISSECTOR_H #include <linux/types.h> #include <linux/in6.h> #include <linux/siphash.h> #include <linux/string.h> #include <uapi/linux/if_ether.h> #include <uapi/linux/pkt_cls.h> struct bpf_prog; struct net; struct sk_buff; /** * struct flow_dissector_key_control: * @thoff: Transport header offset * @addr_type: Type of key. One of FLOW_DISSECTOR_KEY_* * @flags: Key flags. * Any of FLOW_DIS_(IS_FRAGMENT|FIRST_FRAG|ENCAPSULATION|F_*) */ struct flow_dissector_key_control { u16 thoff; u16 addr_type; u32 flags; }; /* The control flags are kept in sync with TCA_FLOWER_KEY_FLAGS_*, as those * flags are exposed to userspace in some error paths, ie. unsupported flags. */ enum flow_dissector_ctrl_flags { FLOW_DIS_IS_FRAGMENT = TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_FIRST_FRAG = TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, FLOW_DIS_F_TUNNEL_CSUM = TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM, FLOW_DIS_F_TUNNEL_DONT_FRAGMENT = TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT, FLOW_DIS_F_TUNNEL_OAM = TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM, FLOW_DIS_F_TUNNEL_CRIT_OPT = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT, /* These flags are internal to the kernel */ FLOW_DIS_ENCAPSULATION = (TCA_FLOWER_KEY_FLAGS_MAX << 1), }; enum flow_dissect_ret { FLOW_DISSECT_RET_OUT_GOOD, FLOW_DISSECT_RET_OUT_BAD, FLOW_DISSECT_RET_PROTO_AGAIN, FLOW_DISSECT_RET_IPPROTO_AGAIN, FLOW_DISSECT_RET_CONTINUE, }; /** * struct flow_dissector_key_basic: * @n_proto: Network header protocol (eg. IPv4/IPv6) * @ip_proto: Transport header protocol (eg. TCP/UDP) * @padding: Unused */ struct flow_dissector_key_basic { __be16 n_proto; u8 ip_proto; u8 padding; }; struct flow_dissector_key_tags { u32 flow_label; }; struct flow_dissector_key_vlan { union { struct { u16 vlan_id:12, vlan_dei:1, vlan_priority:3; }; __be16 vlan_tci; }; __be16 vlan_tpid; __be16 vlan_eth_type; u16 padding; }; struct flow_dissector_mpls_lse { u32 mpls_ttl:8, mpls_bos:1, mpls_tc:3, mpls_label:20; }; #define FLOW_DIS_MPLS_MAX 7 struct flow_dissector_key_mpls { struct flow_dissector_mpls_lse ls[FLOW_DIS_MPLS_MAX]; /* Label Stack */ u8 used_lses; /* One bit set for each Label Stack Entry in use */ }; static inline void dissector_set_mpls_lse(struct flow_dissector_key_mpls *mpls, int lse_index) { mpls->used_lses |= 1 << lse_index; } #define FLOW_DIS_TUN_OPTS_MAX 255 /** * struct flow_dissector_key_enc_opts: * @data: tunnel option data * @len: length of tunnel option data * @dst_opt_type: tunnel option type */ struct flow_dissector_key_enc_opts { u8 data[FLOW_DIS_TUN_OPTS_MAX]; /* Using IP_TUNNEL_OPTS_MAX is desired * here but seems difficult to #include */ u8 len; u32 dst_opt_type; }; struct flow_dissector_key_keyid { __be32 keyid; }; /** * struct flow_dissector_key_ipv4_addrs: * @src: source ip address * @dst: destination ip address */ struct flow_dissector_key_ipv4_addrs { /* (src,dst) must be grouped, in the same way than in IP header */ __be32 src; __be32 dst; }; /** * struct flow_dissector_key_ipv6_addrs: * @src: source ip address * @dst: destination ip address */ struct flow_dissector_key_ipv6_addrs { /* (src,dst) must be grouped, in the same way than in IP header */ struct in6_addr src; struct in6_addr dst; }; /** * struct flow_dissector_key_tipc: * @key: source node address combined with selector */ struct flow_dissector_key_tipc { __be32 key; }; /** * struct flow_dissector_key_addrs: * @v4addrs: IPv4 addresses * @v6addrs: IPv6 addresses * @tipckey: TIPC key */ struct flow_dissector_key_addrs { union { struct flow_dissector_key_ipv4_addrs v4addrs; struct flow_dissector_key_ipv6_addrs v6addrs; struct flow_dissector_key_tipc tipckey; }; }; /** * struct flow_dissector_key_arp: * @sip: Sender IP address * @tip: Target IP address * @op: Operation * @sha: Sender hardware address * @tha: Target hardware address */ struct flow_dissector_key_arp { __u32 sip; __u32 tip; __u8 op; unsigned char sha[ETH_ALEN]; unsigned char tha[ETH_ALEN]; }; /** * struct flow_dissector_key_ports: * @ports: port numbers of Transport header * @src: source port number * @dst: destination port number */ struct flow_dissector_key_ports { union { __be32 ports; struct { __be16 src; __be16 dst; }; }; }; /** * struct flow_dissector_key_ports_range * @tp: port number from packet * @tp_min: min port number in range * @tp_max: max port number in range */ struct flow_dissector_key_ports_range { union { struct flow_dissector_key_ports tp; struct { struct flow_dissector_key_ports tp_min; struct flow_dissector_key_ports tp_max; }; }; }; /** * struct flow_dissector_key_icmp: * @type: ICMP type * @code: ICMP code * @id: Session identifier */ struct flow_dissector_key_icmp { struct { u8 type; u8 code; }; u16 id; }; /** * struct flow_dissector_key_eth_addrs: * @src: source Ethernet address * @dst: destination Ethernet address */ struct flow_dissector_key_eth_addrs { /* (dst,src) must be grouped, in the same way than in ETH header */ unsigned char dst[ETH_ALEN]; unsigned char src[ETH_ALEN]; }; /** * struct flow_dissector_key_tcp: * @flags: flags */ struct flow_dissector_key_tcp { __be16 flags; }; /** * struct flow_dissector_key_ip: * @tos: tos * @ttl: ttl */ struct flow_dissector_key_ip { __u8 tos; __u8 ttl; }; /** * struct flow_dissector_key_meta: * @ingress_ifindex: ingress ifindex * @ingress_iftype: ingress interface type * @l2_miss: packet did not match an L2 entry during forwarding */ struct flow_dissector_key_meta { int ingress_ifindex; u16 ingress_iftype; u8 l2_miss; }; /** * struct flow_dissector_key_ct: * @ct_state: conntrack state after converting with map * @ct_mark: conttrack mark * @ct_zone: conntrack zone * @ct_labels: conntrack labels */ struct flow_dissector_key_ct { u16 ct_state; u16 ct_zone; u32 ct_mark; u32 ct_labels[4]; }; /** * struct flow_dissector_key_hash: * @hash: hash value */ struct flow_dissector_key_hash { u32 hash; }; /** * struct flow_dissector_key_num_of_vlans: * @num_of_vlans: num_of_vlans value */ struct flow_dissector_key_num_of_vlans { u8 num_of_vlans; }; /** * struct flow_dissector_key_pppoe: * @session_id: pppoe session id * @ppp_proto: ppp protocol * @type: pppoe eth type */ struct flow_dissector_key_pppoe { __be16 session_id; __be16 ppp_proto; __be16 type; }; /** * struct flow_dissector_key_l2tpv3: * @session_id: identifier for a l2tp session */ struct flow_dissector_key_l2tpv3 { __be32 session_id; }; /** * struct flow_dissector_key_ipsec: * @spi: identifier for a ipsec connection */ struct flow_dissector_key_ipsec { __be32 spi; }; /** * struct flow_dissector_key_cfm * @mdl_ver: maintenance domain level (mdl) and cfm protocol version * @opcode: code specifying a type of cfm protocol packet * * See 802.1ag, ITU-T G.8013/Y.1731 * 1 2 * |7 6 5 4 3 2 1 0|7 6 5 4 3 2 1 0| * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | mdl | version | opcode | * +-----+---------+-+-+-+-+-+-+-+-+ */ struct flow_dissector_key_cfm { u8 mdl_ver; u8 opcode; }; #define FLOW_DIS_CFM_MDL_MASK GENMASK(7, 5) #define FLOW_DIS_CFM_MDL_MAX 7 enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_PORTS_RANGE, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */ FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */ FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_vlan */ FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_tags */ FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */ FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */ FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_vlan */ FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */ FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */ FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */ FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */ FLOW_DISSECTOR_KEY_NUM_OF_VLANS, /* struct flow_dissector_key_num_of_vlans */ FLOW_DISSECTOR_KEY_PPPOE, /* struct flow_dissector_key_pppoe */ FLOW_DISSECTOR_KEY_L2TPV3, /* struct flow_dissector_key_l2tpv3 */ FLOW_DISSECTOR_KEY_CFM, /* struct flow_dissector_key_cfm */ FLOW_DISSECTOR_KEY_IPSEC, /* struct flow_dissector_key_ipsec */ FLOW_DISSECTOR_KEY_MAX, }; #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) #define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(1) #define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(2) #define FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP BIT(3) struct flow_dissector_key { enum flow_dissector_key_id key_id; size_t offset; /* offset of struct flow_dissector_key_* in target the struct */ }; struct flow_dissector { unsigned long long used_keys; /* each bit represents presence of one key id */ unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; struct flow_keys_basic { struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; }; struct flow_keys { struct flow_dissector_key_control control; #define FLOW_KEYS_HASH_START_FIELD basic struct flow_dissector_key_basic basic __aligned(SIPHASH_ALIGNMENT); struct flow_dissector_key_tags tags; struct flow_dissector_key_vlan vlan; struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; struct flow_dissector_key_icmp icmp; /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; #define FLOW_KEYS_HASH_OFFSET \ offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD) __be32 flow_get_u32_src(const struct flow_keys *flow); __be32 flow_get_u32_dst(const struct flow_keys *flow); extern struct flow_dissector flow_keys_dissector; extern struct flow_dissector flow_keys_basic_dissector; /* struct flow_keys_digest: * * This structure is used to hold a digest of the full flow keys. This is a * larger "hash" of a flow to allow definitively matching specific flows where * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so * that it can be used in CB of skb (see sch_choke for an example). */ #define FLOW_KEYS_DIGEST_LEN 16 struct flow_keys_digest { u8 data[FLOW_KEYS_DIGEST_LEN]; }; void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow); static inline bool flow_keys_have_l4(const struct flow_keys *keys) { return (keys->ports.ports || keys->tags.flow_label); } u32 flow_hash_from_keys(struct flow_keys *keys); u32 flow_hash_from_keys_seed(struct flow_keys *keys, const siphash_key_t *keyval); void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, const void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { return flow_dissector->used_keys & (1ULL << key_id); } static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id, void *target_container) { return ((char *)target_container) + flow_dissector->offset[key_id]; } struct bpf_flow_dissector { struct bpf_flow_keys *flow_keys; const struct sk_buff *skb; const void *data; const void *data_end; }; static inline void flow_dissector_init_keys(struct flow_dissector_key_control *key_control, struct flow_dissector_key_basic *key_basic) { memset(key_control, 0, sizeof(*key_control)); memset(key_basic, 0, sizeof(*key_basic)); } #ifdef CONFIG_BPF_SYSCALL int flow_dissector_bpf_prog_attach_check(struct net *net, struct bpf_prog *prog); #endif /* CONFIG_BPF_SYSCALL */ #endif
2 275 707 689 183 743 436 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM rpm #if !defined(_TRACE_RUNTIME_POWER_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RUNTIME_POWER_H #include <linux/ktime.h> #include <linux/tracepoint.h> struct device; /* * The rpm_internal events are used for tracing some important * runtime pm internal functions. */ DECLARE_EVENT_CLASS(rpm_internal, TP_PROTO(struct device *dev, int flags), TP_ARGS(dev, flags), TP_STRUCT__entry( __string( name, dev_name(dev) ) __field( int, flags ) __field( int , usage_count ) __field( int , disable_depth ) __field( int , runtime_auto ) __field( int , request_pending ) __field( int , irq_safe ) __field( int , child_count ) ), TP_fast_assign( __assign_str(name); __entry->flags = flags; __entry->usage_count = atomic_read( &dev->power.usage_count); __entry->disable_depth = dev->power.disable_depth; __entry->runtime_auto = dev->power.runtime_auto; __entry->request_pending = dev->power.request_pending; __entry->irq_safe = dev->power.irq_safe; __entry->child_count = atomic_read( &dev->power.child_count); ), TP_printk("%s flags-%x cnt-%-2d dep-%-2d auto-%-1d p-%-1d" " irq-%-1d child-%d", __get_str(name), __entry->flags, __entry->usage_count, __entry->disable_depth, __entry->runtime_auto, __entry->request_pending, __entry->irq_safe, __entry->child_count ) ); DEFINE_EVENT(rpm_internal, rpm_suspend, TP_PROTO(struct device *dev, int flags), TP_ARGS(dev, flags) ); DEFINE_EVENT(rpm_internal, rpm_resume, TP_PROTO(struct device *dev, int flags), TP_ARGS(dev, flags) ); DEFINE_EVENT(rpm_internal, rpm_idle, TP_PROTO(struct device *dev, int flags), TP_ARGS(dev, flags) ); DEFINE_EVENT(rpm_internal, rpm_usage, TP_PROTO(struct device *dev, int flags), TP_ARGS(dev, flags) ); TRACE_EVENT(rpm_return_int, TP_PROTO(struct device *dev, unsigned long ip, int ret), TP_ARGS(dev, ip, ret), TP_STRUCT__entry( __string( name, dev_name(dev)) __field( unsigned long, ip ) __field( int, ret ) ), TP_fast_assign( __assign_str(name); __entry->ip = ip; __entry->ret = ret; ), TP_printk("%pS:%s ret=%d", (void *)__entry->ip, __get_str(name), __entry->ret) ); #define RPM_STATUS_STRINGS \ EM(RPM_INVALID, "RPM_INVALID") \ EM(RPM_ACTIVE, "RPM_ACTIVE") \ EM(RPM_RESUMING, "RPM_RESUMING") \ EM(RPM_SUSPENDED, "RPM_SUSPENDED") \ EMe(RPM_SUSPENDING, "RPM_SUSPENDING") /* Enums require being exported to userspace, for user tool parsing. */ #undef EM #undef EMe #define EM(a, b) TRACE_DEFINE_ENUM(a); #define EMe(a, b) TRACE_DEFINE_ENUM(a); RPM_STATUS_STRINGS /* * Now redefine the EM() and EMe() macros to map the enums to the strings that * will be printed in the output. */ #undef EM #undef EMe #define EM(a, b) { a, b }, #define EMe(a, b) { a, b } TRACE_EVENT(rpm_status, TP_PROTO(struct device *dev, enum rpm_status status), TP_ARGS(dev, status), TP_STRUCT__entry( __string(name, dev_name(dev)) __field(int, status) ), TP_fast_assign( __assign_str(name); __entry->status = status; ), TP_printk("%s status=%s", __get_str(name), __print_symbolic(__entry->status, RPM_STATUS_STRINGS)) ); #endif /* _TRACE_RUNTIME_POWER_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
2 112 2733 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETLINK_H #define __LINUX_NETLINK_H #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/scm.h> #include <uapi/linux/netlink.h> struct net; void do_trace_netlink_extack(const char *msg); static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) { return (struct nlmsghdr *)skb->data; } enum netlink_skb_flags { NETLINK_SKB_DST = 0x8, /* Dst set in sendto or sendmsg */ }; struct netlink_skb_parms { struct scm_creds creds; /* Skb credentials */ __u32 portid; __u32 dst_group; __u32 flags; struct sock *sk; bool nsid_is_set; int nsid; }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) #define NETLINK_CTX_SIZE 48 void netlink_table_grab(void); void netlink_table_ungrab(void); #define NL_CFG_F_NONROOT_RECV (1 << 0) #define NL_CFG_F_NONROOT_SEND (1 << 1) /* optional Netlink kernel configuration parameters */ struct netlink_kernel_cfg { unsigned int groups; unsigned int flags; void (*input)(struct sk_buff *skb); int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release) (struct sock *sk, unsigned long *groups); }; struct sock *__netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg); static inline struct sock * netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) { return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); } /* this can be increased when necessary - don't expose to userland */ #define NETLINK_MAX_COOKIE_LEN 8 #define NETLINK_MAX_FMTMSG_LEN 80 /** * struct netlink_ext_ack - netlink extended ACK report struct * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error * @policy: policy for a bad attribute * @miss_type: attribute type which was missing * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length * @_msg_buf: output buffer for formatted message strings - don't access * directly, use %NL_SET_ERR_MSG_FMT */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; const struct nla_policy *policy; const struct nlattr *miss_nest; u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; char _msg_buf[NETLINK_MAX_FMTMSG_LEN]; }; /* Always use this macro, this allows later putting the * message into a separate section or such for things * like translation or listing all possible messages. * If string formatting is needed use NL_SET_ERR_MSG_FMT. */ #define NL_SET_ERR_MSG(extack, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) \ __extack->_msg = __msg; \ } while (0) /* We splice fmt with %s at each end even in the snprintf so that both calls * can use the same string constant, avoiding its duplication in .ro */ #define NL_SET_ERR_MSG_FMT(extack, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ } while (0) #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) #define NL_SET_ERR_MSG_FMT_MOD(extack, fmt, args...) \ NL_SET_ERR_MSG_FMT((extack), KBUILD_MODNAME ": " fmt, ##args) #define NL_SET_ERR_MSG_WEAK(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG((extack), msg); \ } while (0) #define NL_SET_ERR_MSG_WEAK_MOD(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG_MOD((extack), msg); \ } while (0) #define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ if ((extack)) { \ (extack)->bad_attr = (attr); \ (extack)->policy = (pol); \ } \ } while (0) #define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL) #define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) { \ __extack->_msg = __msg; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } \ } while (0) #define NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, pol, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } while (0) #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) #define NL_SET_ERR_MSG_ATTR_FMT(extack, attr, msg, args...) \ NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, NULL, msg, ##args) #define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (__extack) { \ __extack->miss_nest = (nest); \ __extack->miss_type = (type); \ } \ } while (0) #define NL_REQ_ATTR_CHECK(extack, nest, tb, type) ({ \ struct nlattr **__tb = (tb); \ u32 __attr = (type); \ int __retval; \ \ __retval = !__tb[__attr]; \ if (__retval) \ NL_SET_ERR_ATTR_MISS((extack), (nest), __attr); \ __retval; \ }) static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { if (!extack) return; BUILD_BUG_ON(sizeof(extack->cookie) < sizeof(cookie)); memcpy(extack->cookie, &cookie, sizeof(cookie)); extack->cookie_len = sizeof(cookie); } void netlink_kernel_release(struct sock *sk); int __netlink_change_ngroups(struct sock *sk, unsigned int groups); int netlink_change_ngroups(struct sock *sk, unsigned int groups); void __netlink_clear_multicast_users(struct sock *sk, unsigned int group); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack); int netlink_has_listeners(struct sock *sk, unsigned int group); bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data); int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ struct sock *netlink_getsockbyfd(int fd); int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); int netlink_sendskb(struct sock *sk, struct sk_buff *skb); static inline struct sk_buff * netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *nskb; nskb = skb_clone(skb, gfp_mask); if (!nskb) return NULL; /* This is a large skb, set destructor callback to release head */ if (is_vmalloc_addr(skb->head)) nskb->destructor = skb->destructor; return nskb; } /* * skb should fit one page. This choice is good for headerless malloc. * But we should limit to 8K so that userspace does not have to * use enormous buffer sizes on recvmsg() calls just to avoid * MSG_TRUNC when PAGE_SIZE is very large. */ #if PAGE_SIZE < 8192UL #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(PAGE_SIZE) #else #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(8192UL) #endif #define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN) struct netlink_callback { struct sk_buff *skb; const struct nlmsghdr *nlh; int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); void *data; /* the module that dump function belong to */ struct module *module; struct netlink_ext_ack *extack; u16 family; u16 answer_flags; u32 min_dump_alloc; unsigned int prev_seq, seq; int flags; bool strict_check; union { u8 ctx[NETLINK_CTX_SIZE]; /* args is deprecated. Cast a struct over ctx instead * for proper type safety. */ long args[6]; }; }; #define NL_ASSERT_CTX_FITS(type_name) \ BUILD_BUG_ON(sizeof(type_name) > \ sizeof_field(struct netlink_callback, ctx)) struct netlink_notify { struct net *net; u32 portid; int protocol; }; struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags); struct netlink_dump_control { int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff *skb, struct netlink_callback *); int (*done)(struct netlink_callback *); struct netlink_ext_ack *extack; void *data; struct module *module; u32 min_dump_alloc; int flags; }; int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control); static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { if (!control->module) control->module = THIS_MODULE; return __netlink_dump_start(ssk, skb, nlh, control); } struct netlink_tap { struct net_device *dev; struct module *module; struct list_head list; }; int netlink_add_tap(struct netlink_tap *nt); int netlink_remove_tap(struct netlink_tap *nt); bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *ns, int cap); bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *ns, int cap); bool netlink_capable(const struct sk_buff *skb, int cap); bool netlink_net_capable(const struct sk_buff *skb, int cap); struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast); #endif /* __LINUX_NETLINK_H */
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 // SPDX-License-Identifier: GPL-2.0-or-later /* * MPLS GSO Support * * Authors: Simon Horman (horms@verge.net.au) * * Based on: GSO portions of net/ipv4/gre.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/err.h> #include <linux/module.h> #include <linux/netdev_features.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/gso.h> #include <net/mpls.h> static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; netdev_features_t mpls_features; u16 mac_len = skb->mac_len; __be16 mpls_protocol; unsigned int mpls_hlen; if (!skb_inner_network_header_was_set(skb)) goto out; skb_reset_network_header(skb); mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb); if (unlikely(!mpls_hlen || mpls_hlen % MPLS_HLEN)) goto out; if (unlikely(!pskb_may_pull(skb, mpls_hlen))) goto out; /* Setup inner SKB. */ mpls_protocol = skb->protocol; skb->protocol = skb->inner_protocol; __skb_pull(skb, mpls_hlen); skb->mac_len = 0; skb_reset_mac_header(skb); /* Segment inner packet. */ mpls_features = skb->dev->mpls_features & features; segs = skb_mac_gso_segment(skb, mpls_features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, mpls_protocol, mpls_hlen, mac_offset, mac_len); goto out; } skb = segs; mpls_hlen += mac_len; do { skb->mac_len = mac_len; skb->protocol = mpls_protocol; skb_reset_inner_network_header(skb); __skb_push(skb, mpls_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); } while ((skb = skb->next)); out: return segs; } static struct packet_offload mpls_mc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_MC), .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, }; static struct packet_offload mpls_uc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_UC), .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, }; static int __init mpls_gso_init(void) { pr_info("MPLS GSO support\n"); dev_add_offload(&mpls_uc_offload); dev_add_offload(&mpls_mc_offload); return 0; } static void __exit mpls_gso_exit(void) { dev_remove_offload(&mpls_uc_offload); dev_remove_offload(&mpls_mc_offload); } module_init(mpls_gso_init); module_exit(mpls_gso_exit); MODULE_DESCRIPTION("MPLS GSO support"); MODULE_AUTHOR("Simon Horman <horms@verge.net.au>"); MODULE_LICENSE("GPL");
234 118 171 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/pagemap.h> #include <linux/blkdev.h> #include "../blk.h" /* * add_gd_partition adds a partitions details to the devices partition * description. */ struct parsed_partitions { struct gendisk *disk; char name[BDEVNAME_SIZE]; struct { sector_t from; sector_t size; int flags; bool has_info; struct partition_meta_info info; } *parts; int next; int limit; bool access_beyond_eod; char *pp_buf; }; typedef struct { struct folio *v; } Sector; void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p); static inline void put_dev_sector(Sector p) { folio_put(p.v); } static inline void put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) { if (n < p->limit) { char tmp[1 + BDEVNAME_SIZE + 10 + 1]; p->parts[n].from = from; p->parts[n].size = size; snprintf(tmp, sizeof(tmp), " %s%d", p->name, n); strlcat(p->pp_buf, tmp, PAGE_SIZE); } } /* detection routines go here in alphabetical order: */ int adfspart_check_ADFS(struct parsed_partitions *state); int adfspart_check_CUMANA(struct parsed_partitions *state); int adfspart_check_EESOX(struct parsed_partitions *state); int adfspart_check_ICS(struct parsed_partitions *state); int adfspart_check_POWERTEC(struct parsed_partitions *state); int aix_partition(struct parsed_partitions *state); int amiga_partition(struct parsed_partitions *state); int atari_partition(struct parsed_partitions *state); int cmdline_partition(struct parsed_partitions *state); int efi_partition(struct parsed_partitions *state); int ibm_partition(struct parsed_partitions *); int karma_partition(struct parsed_partitions *state); int ldm_partition(struct parsed_partitions *state); int mac_partition(struct parsed_partitions *state); int msdos_partition(struct parsed_partitions *state); int of_partition(struct parsed_partitions *state); int osf_partition(struct parsed_partitions *state); int sgi_partition(struct parsed_partitions *state); int sun_partition(struct parsed_partitions *state); int sysv68_partition(struct parsed_partitions *state); int ultrix_partition(struct parsed_partitions *state);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/ipv6.h> #include <net/dsfield.h> #include <net/xfrm.h> #ifndef XFRM_INOUT_H #define XFRM_INOUT_H 1 static inline void xfrm4_extract_header(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); XFRM_MODE_SKB_CB(skb)->id = iph->id; XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; XFRM_MODE_SKB_CB(skb)->tos = iph->tos; XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph); memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); } static inline void xfrm6_extract_header(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *iph = ipv6_hdr(skb); XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); XFRM_MODE_SKB_CB(skb)->id = 0; XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; XFRM_MODE_SKB_CB(skb)->optlen = 0; memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); #else WARN_ON_ONCE(1); #endif } static inline void xfrm6_beet_make_header(struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); iph->version = 6; memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, sizeof(iph->flow_lbl)); iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol; ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos); iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl; } static inline void xfrm4_beet_make_header(struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); iph->ihl = 5; iph->version = 4; iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; iph->tos = XFRM_MODE_SKB_CB(skb)->tos; iph->id = XFRM_MODE_SKB_CB(skb)->id; iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off; iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl; } #endif
15 199 202 51 14 35 31 202 18 5 78 68 202 133 3 53 49 202 133 54 3 14 183 182 26 27 33 564 525 493 32 523 22 10 32 146 147 120 31 15 15 46 5 469 2 467 424 467 434 27 517 480 37 25 22 12 25 507 12 517 29 10 10 42 19 24 8 35 35 35 33 4 2 2 16 16 9 21 3 9 48 35 13 119 119 119 82 82 38 101 18 68 18 18 1 14 14 14 8 6 1 2 302 302 125 11 179 6 71 102 102 101 157 19 3 3 182 183 183 25 10 170 20 38 82 37 7 112 112 52 29 1 29 111 112 112 102 19 19 7 12 19 19 1 4 70 56 1 7 4 16 33 8 25 16 5 36 2 37 1 2 8 2 1 4 1 143 2 139 137 49 50 135 1 136 65 64 2 110 112 112 13 15 4 11 1007 1012 432 432 424 432 424 109 33 76 4 38 41 78 1 79 31 16 10 1 521 493 28 13 13 13 586 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/rculist.h> #include <linux/err.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/ip_tunnels.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <net/rtnetlink.h> #include <net/udp.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #endif static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) { return hash_32((__force u32)key ^ (__force u32)remote, IP_TNL_HASH_BITS); } static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p, const unsigned long *flags, __be32 key) { if (!test_bit(IP_TUNNEL_KEY_BIT, flags)) return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags); return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key; } /* Fallback tunnel: no source, no destination, no key, no options Tunnel hash table: We require exact key match i.e. if a key is present in packet it will match only tunnel with the same key; if it is not present, it will match only keyless tunnel. All keysless packets, if not matched configured keyless tunnels will match fallback tunnel. Given src, dst and key, find appropriate for input tunnel. */ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key) { struct ip_tunnel *t, *cand = NULL; struct hlist_head *head; struct net_device *ndev; unsigned int hash; hash = ip_tunnel_hash(key, remote); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { if (local != t->parms.iph.saddr || remote != t->parms.iph.daddr || !(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; if (READ_ONCE(t->parms.link) == link) return t; cand = t; } hlist_for_each_entry_rcu(t, head, hash_node) { if (remote != t->parms.iph.daddr || t->parms.iph.saddr != 0 || !(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; if (READ_ONCE(t->parms.link) == link) return t; if (!cand) cand = t; } hash = ip_tunnel_hash(key, 0); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) continue; if (!(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; if (READ_ONCE(t->parms.link) == link) return t; if (!cand) cand = t; } hlist_for_each_entry_rcu(t, head, hash_node) { if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) && t->parms.i_key != key) || t->parms.iph.saddr != 0 || t->parms.iph.daddr != 0 || !(t->dev->flags & IFF_UP)) continue; if (READ_ONCE(t->parms.link) == link) return t; if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(itn->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; ndev = READ_ONCE(itn->fb_tunnel_dev); if (ndev && ndev->flags & IFF_UP) return netdev_priv(ndev); return NULL; } EXPORT_SYMBOL_GPL(ip_tunnel_lookup); static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, struct ip_tunnel_parm_kern *parms) { unsigned int h; __be32 remote; __be32 i_key = parms->i_key; if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) remote = parms->iph.daddr; else remote = 0; if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) && test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags)) i_key = 0; h = ip_tunnel_hash(i_key, remote); return &itn->tunnels[h]; } static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) { struct hlist_head *head = ip_bucket(itn, &t->parms); if (t->collect_md) rcu_assign_pointer(itn->collect_md_tun, t); hlist_add_head_rcu(&t->hash_node, head); } static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) { if (t->collect_md) rcu_assign_pointer(itn->collect_md_tun, NULL); hlist_del_init_rcu(&t->hash_node); } static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, struct ip_tunnel_parm_kern *parms, int type) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; IP_TUNNEL_DECLARE_FLAGS(flags); __be32 key = parms->i_key; int link = parms->link; struct ip_tunnel *t = NULL; struct hlist_head *head = ip_bucket(itn, parms); ip_tunnel_flags_copy(flags, parms->i_flags); hlist_for_each_entry_rcu(t, head, hash_node, lockdep_rtnl_is_held()) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && link == READ_ONCE(t->parms.link) && type == t->dev->type && ip_tunnel_key_match(&t->parms, flags, key)) break; } return t; } static struct net_device *__ip_tunnel_create(struct net *net, const struct rtnl_link_ops *ops, struct ip_tunnel_parm_kern *parms) { int err; struct ip_tunnel *tunnel; struct net_device *dev; char name[IFNAMSIZ]; err = -E2BIG; if (parms->name[0]) { if (!dev_valid_name(parms->name)) goto failed; strscpy(name, parms->name); } else { if (strlen(ops->kind) > (IFNAMSIZ - 3)) goto failed; strscpy(name, ops->kind); strcat(name, "%d"); } ASSERT_RTNL(); dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup); if (!dev) { err = -ENOMEM; goto failed; } dev_net_set(dev, net); dev->rtnl_link_ops = ops; tunnel = netdev_priv(dev); tunnel->parms = *parms; tunnel->net = net; err = register_netdevice(dev); if (err) goto failed_free; return dev; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } static int ip_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *iph; int hlen = LL_MAX_HEADER; int mtu = ETH_DATA_LEN; int t_hlen = tunnel->hlen + sizeof(struct iphdr); iph = &tunnel->parms.iph; /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { struct flowi4 fl4; struct rtable *rt; ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, iph->tos & INET_DSCP_MASK, tunnel->net, tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) dev->flags |= IFF_POINTOPOINT; dst_cache_reset(&tunnel->dst_cache); } if (!tdev && tunnel->parms.link) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev) { hlen = tdev->hard_header_len + tdev->needed_headroom; mtu = min(tdev->mtu, IP_MAX_MTU); } dev->needed_headroom = t_hlen + hlen; mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0); if (mtu < IPV4_MIN_MTU) mtu = IPV4_MIN_MTU; return mtu; } static struct ip_tunnel *ip_tunnel_create(struct net *net, struct ip_tunnel_net *itn, struct ip_tunnel_parm_kern *parms) { struct ip_tunnel *nt; struct net_device *dev; int t_hlen; int mtu; int err; dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms); if (IS_ERR(dev)) return ERR_CAST(dev); mtu = ip_tunnel_bind_dev(dev); err = dev_set_mtu(dev, mtu); if (err) goto err_dev_set_mtu; nt = netdev_priv(dev); t_hlen = nt->hlen + sizeof(struct iphdr); dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU - t_hlen; if (dev->type == ARPHRD_ETHER) dev->max_mtu -= dev->hard_header_len; ip_tunnel_add(itn, nt); return nt; err_dev_set_mtu: unregister_netdevice(dev); return ERR_PTR(err); } void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info) { const struct iphdr *iph = ip_hdr(skb); const struct udphdr *udph; if (iph->protocol != IPPROTO_UDP) return; udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2)); info->encap.sport = udph->source; info->encap.dport = udph->dest; } EXPORT_SYMBOL(ip_tunnel_md_udp_encap); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error) { const struct iphdr *iph = ip_hdr(skb); int nh, err; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { DEV_STATS_INC(tunnel->dev, multicast); skb->pkt_type = PACKET_BROADCAST; } #endif if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } iph = (struct iphdr *)(skb->head + nh); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &iph->saddr, iph->tos); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tunnel->dev->type == ARPHRD_ETHER) { skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; } if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_rcv); int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip_tunnel_encap_ops **) &iptun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL(ip_tunnel_encap_add_ops); int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip_tunnel_encap_ops **) &iptun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip_tunnel_encap_del_ops); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df, const struct iphdr *inner_iph, int tunnel_hlen, __be32 dst, bool md) { struct ip_tunnel *tunnel = netdev_priv(dev); int pkt_size; int mtu; tunnel_hlen = md ? tunnel_hlen : tunnel->hlen; pkt_size = skb->len - tunnel_hlen; pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0; if (df) { mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen); mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0; } else { mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; } if (skb_valid_dst(skb)) skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) && (inner_iph->frag_off & htons(IP_DF)) && mtu < pkt_size) { icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); return -E2BIG; } } #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) { struct rt6_info *rt6; __be32 daddr; rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) : NULL; daddr = md ? dst : tunnel->parms.iph.daddr; if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { if ((daddr && !ipv4_is_multicast(daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); } } if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && mtu < pkt_size) { icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); return -E2BIG; } } #endif return 0; } void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto, int tunnel_hlen) { struct ip_tunnel *tunnel = netdev_priv(dev); u32 headroom = sizeof(struct iphdr); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; const struct iphdr *inner_iph; struct rtable *rt = NULL; struct flowi4 fl4; __be16 df = 0; u8 tos, ttl; bool use_cache; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET)) goto tx_error; key = &tun_info->key; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); inner_iph = (const struct iphdr *)skb_inner_network_header(skb); tos = key->tos; if (tos == 1) { if (skb->protocol == htons(ETH_P_IP)) tos = inner_iph->tos; else if (skb->protocol == htons(ETH_P_IPV6)) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark, skb_get_hash(skb), key->flow_flags); if (!tunnel_hlen) tunnel_hlen = ip_encap_hlen(&tun_info->encap); if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0) goto tx_error; use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); if (use_cache) rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error; } if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, fl4.saddr); } if (rt->dst.dev == dev) { ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) df = htons(IP_DF); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, key->u.ipv4.dst, true)) { ip_rt_put(rt); goto tx_error; } tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = key->ttl; if (ttl == 0) { if (skb->protocol == htons(ETH_P_IP)) ttl = inner_iph->ttl; else if (skb->protocol == htons(ETH_P_IPV6)) ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; else ttl = ip4_dst_hoplimit(&rt->dst); } headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; if (skb_cow_head(skb, headroom)) { ip_rt_put(rt); goto tx_dropped; } ip_tunnel_adj_headroom(dev, headroom); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)), 0); return; tx_error: DEV_STATS_INC(dev, tx_errors); goto kfree; tx_dropped: DEV_STATS_INC(dev, tx_dropped); kfree: kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info = NULL; const struct iphdr *inner_iph; unsigned int max_headroom; /* The extra header space needed */ struct rtable *rt = NULL; /* Route to the other host */ __be16 payload_protocol; bool use_cache = false; struct flowi4 fl4; bool md = false; bool connected; u8 tos, ttl; __be32 dst; __be16 df; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); payload_protocol = skb_protocol(skb, true); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ if (!skb_dst(skb)) { DEV_STATS_INC(dev, tx_fifo_errors); goto tx_error; } tun_info = skb_tunnel_info(skb); if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) && ip_tunnel_info_af(tun_info) == AF_INET && tun_info->key.u.ipv4.dst) { dst = tun_info->key.u.ipv4.dst; md = true; connected = true; } else if (payload_protocol == htons(ETH_P_IP)) { rt = skb_rtable(skb); dst = rt_nexthop(rt, inner_iph->daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (payload_protocol == htons(ETH_P_IPV6)) { const struct in6_addr *addr6; struct neighbour *neigh; bool do_tx_error_icmp; int addr_type; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_error; addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { addr6 = &ipv6_hdr(skb)->daddr; addr_type = ipv6_addr_type(addr6); } if ((addr_type & IPV6_ADDR_COMPATv4) == 0) do_tx_error_icmp = true; else { do_tx_error_icmp = false; dst = addr6->s6_addr32[3]; } neigh_release(neigh); if (do_tx_error_icmp) goto tx_error_icmp; } #endif else goto tx_error; if (!md) connected = false; } tos = tnl_params->tos; if (tos & 0x1) { tos &= ~0x1; if (payload_protocol == htons(ETH_P_IP)) { tos = inner_iph->tos; connected = false; } else if (payload_protocol == htons(ETH_P_IPV6)) { tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); connected = false; } } ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, tos & INET_DSCP_MASK, tunnel->net, READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) goto tx_error; if (connected && md) { use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); if (use_cache) rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); } else { rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : NULL; } if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error; } if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, fl4.saddr); else if (!md && connected) dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); } if (rt->dst.dev == dev) { ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } df = tnl_params->frag_off; if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df) df |= (inner_iph->frag_off & htons(IP_DF)); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) { ip_rt_put(rt); goto tx_error; } if (tunnel->err_count > 0) { if (time_before(jiffies, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else tunnel->err_count = 0; } tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = tnl_params->ttl; if (ttl == 0) { if (payload_protocol == htons(ETH_P_IP)) ttl = inner_iph->ttl; #if IS_ENABLED(CONFIG_IPV6) else if (payload_protocol == htons(ETH_P_IPV6)) ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; #endif else ttl = ip4_dst_hoplimit(&rt->dst); } max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (skb_cow_head(skb, max_headroom)) { ip_rt_put(rt); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return; } ip_tunnel_adj_headroom(dev, max_headroom); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)), 0); return; #if IS_ENABLED(CONFIG_IPV6) tx_error_icmp: dst_link_failure(skb); #endif tx_error: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_tunnel_xmit); static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel *t, struct net_device *dev, struct ip_tunnel_parm_kern *p, bool set_mtu, __u32 fwmark) { ip_tunnel_del(itn, t); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; if (dev->type != ARPHRD_ETHER) { __dev_addr_set(dev, &p->iph.saddr, 4); memcpy(dev->broadcast, &p->iph.daddr, 4); } ip_tunnel_add(itn, t); t->parms.iph.ttl = p->iph.ttl; t->parms.iph.tos = p->iph.tos; t->parms.iph.frag_off = p->iph.frag_off; if (t->parms.link != p->link || t->fwmark != fwmark) { int mtu; WRITE_ONCE(t->parms.link, p->link); t->fwmark = fwmark; mtu = ip_tunnel_bind_dev(dev); if (set_mtu) WRITE_ONCE(dev->mtu, mtu); } dst_cache_reset(&t->dst_cache); netdev_state_change(dev); } int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); struct net *net = t->net; struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); switch (cmd) { case SIOCGETTUNNEL: if (dev == itn->fb_tunnel_dev) { t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); if (!t) t = netdev_priv(dev); } memcpy(p, &t->parms, sizeof(*p)); break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) { if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags)) p->i_key = 0; if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags)) p->o_key = 0; } t = ip_tunnel_find(itn, p, itn->type); if (cmd == SIOCADDTUNNEL) { if (!t) { t = ip_tunnel_create(net, itn, p); err = PTR_ERR_OR_ZERO(t); break; } err = -EEXIST; break; } if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t) { if (t->dev != dev) { err = -EEXIST; break; } } else { unsigned int nflags = 0; if (ipv4_is_multicast(p->iph.daddr)) nflags = IFF_BROADCAST; else if (p->iph.daddr) nflags = IFF_POINTOPOINT; if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { err = -EINVAL; break; } t = netdev_priv(dev); } } if (t) { err = 0; ip_tunnel_update(itn, t, dev, p, true, 0); } else { err = -ENOENT; } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (dev == itn->fb_tunnel_dev) { err = -ENOENT; t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); if (!t) goto done; err = -EPERM; if (t == netdev_priv(itn->fb_tunnel_dev)) goto done; dev = t->dev; } unregister_netdevice(dev); err = 0; break; default: err = -EINVAL; } done: return err; } EXPORT_SYMBOL_GPL(ip_tunnel_ctl); bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, const void __user *data) { struct ip_tunnel_parm p; if (copy_from_user(&p, data, sizeof(p))) return false; strscpy(kp->name, p.name); kp->link = p.link; ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags); ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags); kp->i_key = p.i_key; kp->o_key = p.o_key; memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph))); return true; } EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user); bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp) { struct ip_tunnel_parm p; if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) || !ip_tunnel_flags_is_be16_compat(kp->o_flags)) return false; memset(&p, 0, sizeof(p)); strscpy(p.name, kp->name); p.link = kp->link; p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags); p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags); p.i_key = kp->i_key; p.o_key = kp->o_key; memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph))); return !copy_to_user(data, &p, sizeof(p)); } EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { struct ip_tunnel_parm_kern p; int err; if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); if (!err && !ip_tunnel_parm_to_user(data, &p)) return -EFAULT; return err; } EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); int max_mtu = IP_MAX_MTU - t_hlen; if (dev->type == ARPHRD_ETHER) max_mtu -= dev->hard_header_len; if (new_mtu < ETH_MIN_MTU) return -EINVAL; if (new_mtu > max_mtu) { if (strict) return -EINVAL; new_mtu = max_mtu; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) { return __ip_tunnel_change_mtu(dev, new_mtu, true); } EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); static void ip_tunnel_dev_free(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); dst_cache_destroy(&tunnel->dst_cache); } void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_net *itn; itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); if (itn->fb_tunnel_dev != dev) { ip_tunnel_del(itn, netdev_priv(dev)); unregister_netdevice_queue(dev, head); } } EXPORT_SYMBOL_GPL(ip_tunnel_dellink); struct net *ip_tunnel_get_link_net(const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip_tunnel_get_link_net); int ip_tunnel_get_iflink(const struct net_device *dev) { const struct ip_tunnel *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->parms.link); } EXPORT_SYMBOL(ip_tunnel_get_iflink); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); struct ip_tunnel_parm_kern parms; unsigned int i; itn->rtnl_link_ops = ops; for (i = 0; i < IP_TNL_HASH_SIZE; i++) INIT_HLIST_HEAD(&itn->tunnels[i]); if (!ops || !net_has_fallback_tunnels(net)) { struct ip_tunnel_net *it_init_net; it_init_net = net_generic(&init_net, ip_tnl_net_id); itn->type = it_init_net->type; itn->fb_tunnel_dev = NULL; return 0; } memset(&parms, 0, sizeof(parms)); if (devname) strscpy(parms.name, devname, IFNAMSIZ); rtnl_lock(); itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ if (!IS_ERR(itn->fb_tunnel_dev)) { itn->fb_tunnel_dev->netns_immutable = true; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); itn->type = itn->fb_tunnel_dev->type; } rtnl_unlock(); return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); void ip_tunnel_delete_net(struct net *net, unsigned int id, struct rtnl_link_ops *ops, struct list_head *head) { struct ip_tunnel_net *itn = net_generic(net, id); struct net_device *dev, *aux; int h; ASSERT_RTNL_NET(net); for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == ops) unregister_netdevice_queue(dev, head); for (h = 0; h < IP_TNL_HASH_SIZE; h++) { struct ip_tunnel *t; struct hlist_node *n; struct hlist_head *thead = &itn->tunnels[h]; hlist_for_each_entry_safe(t, n, thead, hash_node) /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, head); } } EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); int ip_tunnel_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *nt; struct ip_tunnel_net *itn; int mtu; int err; nt = netdev_priv(dev); itn = net_generic(net, nt->ip_tnl_net_id); if (nt->collect_md) { if (rtnl_dereference(itn->collect_md_tun)) return -EEXIST; } else { if (ip_tunnel_find(itn, p, dev->type)) return -EEXIST; } nt->net = net; nt->parms = *p; nt->fwmark = fwmark; err = register_netdevice(dev); if (err) goto err_register_netdevice; if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); mtu = ip_tunnel_bind_dev(dev); if (tb[IFLA_MTU]) { unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr)); if (dev->type == ARPHRD_ETHER) max -= dev->hard_header_len; mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max); } err = dev_set_mtu(dev, mtu); if (err) goto err_dev_set_mtu; ip_tunnel_add(itn, nt); return 0; err_dev_set_mtu: unregister_netdevice(dev); err_register_netdevice: return err; } EXPORT_SYMBOL_GPL(ip_tunnel_newlink); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *t; struct ip_tunnel *tunnel = netdev_priv(dev); struct net *net = tunnel->net; struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); if (dev == itn->fb_tunnel_dev) return -EINVAL; t = ip_tunnel_find(itn, p, dev->type); if (t) { if (t->dev != dev) return -EEXIST; } else { t = tunnel; if (dev->type != ARPHRD_ETHER) { unsigned int nflags = 0; if (ipv4_is_multicast(p->iph.daddr)) nflags = IFF_BROADCAST; else if (p->iph.daddr) nflags = IFF_POINTOPOINT; if ((dev->flags ^ nflags) & (IFF_POINTOPOINT | IFF_BROADCAST)) return -EINVAL; } } ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_changelink); int ip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; int err; dev->needs_free_netdev = true; dev->priv_destructor = ip_tunnel_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (err) return err; err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { dst_cache_destroy(&tunnel->dst_cache); return err; } tunnel->dev = dev; strscpy(tunnel->parms.name, dev->name); iph->version = 4; iph->ihl = 5; if (tunnel->collect_md) netif_keep_dst(dev); netdev_lockdep_set_classes(dev); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); void ip_tunnel_uninit(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct net *net = tunnel->net; struct ip_tunnel_net *itn; itn = net_generic(net, tunnel->ip_tnl_net_id); ip_tunnel_del(itn, netdev_priv(dev)); if (itn->fb_tunnel_dev == dev) WRITE_ONCE(itn->fb_tunnel_dev, NULL); dst_cache_reset(&tunnel->dst_cache); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); /* Do least required initialization, rest of init is done in tunnel_init call */ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id) { struct ip_tunnel *tunnel = netdev_priv(dev); tunnel->ip_tnl_net_id = net_id; } EXPORT_SYMBOL_GPL(ip_tunnel_setup); MODULE_DESCRIPTION("IPv4 tunnel implementation library"); MODULE_LICENSE("GPL");
10 1 1 1 2 1 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/unaligned.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> struct nft_byteorder { u8 sreg; u8 dreg; enum nft_byteorder_ops op:8; u8 len; u8 size; }; void nft_byteorder_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_byteorder *priv = nft_expr_priv(expr); u32 *src = &regs->data[priv->sreg]; u32 *dst = &regs->data[priv->dreg]; u16 *s16, *d16; unsigned int i; s16 = (void *)src; d16 = (void *)dst; switch (priv->size) { case 8: { u64 *dst64 = (void *)dst; u64 src64; switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { src64 = nft_reg_load64(&src[i]); nft_reg_store64(&dst64[i], be64_to_cpu((__force __be64)src64)); } break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 8; i++) { src64 = (__force __u64) cpu_to_be64(nft_reg_load64(&src[i])); nft_reg_store64(&dst64[i], src64); } break; } break; } case 4: switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 4; i++) dst[i] = ntohl((__force __be32)src[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 4; i++) dst[i] = (__force __u32)htonl(src[i]); break; } break; case 2: switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 2; i++) d16[i] = ntohs((__force __be16)s16[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 2; i++) d16[i] = (__force __u16)htons(s16[i]); break; } break; } } static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = { [NFTA_BYTEORDER_SREG] = { .type = NLA_U32 }, [NFTA_BYTEORDER_DREG] = { .type = NLA_U32 }, [NFTA_BYTEORDER_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_BYTEORDER_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_BYTEORDER_SIZE] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_byteorder_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_byteorder *priv = nft_expr_priv(expr); u32 size, len; int err; if (tb[NFTA_BYTEORDER_SREG] == NULL || tb[NFTA_BYTEORDER_DREG] == NULL || tb[NFTA_BYTEORDER_LEN] == NULL || tb[NFTA_BYTEORDER_SIZE] == NULL || tb[NFTA_BYTEORDER_OP] == NULL) return -EINVAL; priv->op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP])); switch (priv->op) { case NFT_BYTEORDER_NTOH: case NFT_BYTEORDER_HTON: break; default: return -EINVAL; } err = nft_parse_u32_check(tb[NFTA_BYTEORDER_SIZE], U8_MAX, &size); if (err < 0) return err; priv->size = size; switch (priv->size) { case 2: case 4: case 8: break; default: return -EINVAL; } err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len); if (err < 0) return err; priv->len = len; err = nft_parse_register_load(ctx, tb[NFTA_BYTEORDER_SREG], &priv->sreg, priv->len); if (err < 0) return err; return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, priv->len); } static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_byteorder *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_BYTEORDER_SREG, priv->sreg)) goto nla_put_failure; if (nft_dump_register(skb, NFTA_BYTEORDER_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static bool nft_byteorder_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { struct nft_byteorder *priv = nft_expr_priv(expr); nft_reg_track_cancel(track, priv->dreg, priv->len); return false; } static const struct nft_expr_ops nft_byteorder_ops = { .type = &nft_byteorder_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)), .eval = nft_byteorder_eval, .init = nft_byteorder_init, .dump = nft_byteorder_dump, .reduce = nft_byteorder_reduce, }; struct nft_expr_type nft_byteorder_type __read_mostly = { .name = "byteorder", .ops = &nft_byteorder_ops, .policy = nft_byteorder_policy, .maxattr = NFTA_BYTEORDER_MAX, .owner = THIS_MODULE, };
4432 4436 1541 535 535 535 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 // SPDX-License-Identifier: GPL-2.0-or-later /* * vrf.c: device driver to encapsulate a VRF space * * Copyright (c) 2015 Cumulus Networks. All rights reserved. * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> * * Based on dummy, team and ipvlan drivers */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ip.h> #include <linux/init.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <linux/u64_stats_sync.h> #include <linux/hashtable.h> #include <linux/spinlock_types.h> #include <linux/inetdevice.h> #include <net/arp.h> #include <net/flow.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #include <net/fib_rules.h> #include <net/netdev_lock.h> #include <net/sch_generic.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack.h> #define DRV_NAME "vrf" #define DRV_VERSION "1.1" #define FIB_RULE_PREF 1000 /* default preference for FIB rules */ #define HT_MAP_BITS 4 #define HASH_INITVAL ((u32)0xcafef00d) struct vrf_map { DECLARE_HASHTABLE(ht, HT_MAP_BITS); spinlock_t vmap_lock; /* shared_tables: * count how many distinct tables do not comply with the strict mode * requirement. * shared_tables value must be 0 in order to enable the strict mode. * * example of the evolution of shared_tables: * | time * add vrf0 --> table 100 shared_tables = 0 | t0 * add vrf1 --> table 101 shared_tables = 0 | t1 * add vrf2 --> table 100 shared_tables = 1 | t2 * add vrf3 --> table 100 shared_tables = 1 | t3 * add vrf4 --> table 101 shared_tables = 2 v t4 * * shared_tables is a "step function" (or "staircase function") * and it is increased by one when the second vrf is associated to a * table. * * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1. * * at t3, another dev (vrf3) is bound to the same table 100 but the * value of shared_tables is still 1. * This means that no matter how many new vrfs will register on the * table 100, the shared_tables will not increase (considering only * table 100). * * at t4, vrf4 is bound to table 101, and shared_tables = 2. * * Looking at the value of shared_tables we can immediately know if * the strict_mode can or cannot be enforced. Indeed, strict_mode * can be enforced iff shared_tables = 0. * * Conversely, shared_tables is decreased when a vrf is de-associated * from a table with exactly two associated vrfs. */ u32 shared_tables; bool strict_mode; }; struct vrf_map_elem { struct hlist_node hnode; struct list_head vrf_list; /* VRFs registered to this table */ u32 table_id; int users; int ifindex; }; static unsigned int vrf_net_id; /* per netns vrf data */ struct netns_vrf { /* protected by rtnl lock */ bool add_fib_rules; struct vrf_map vmap; struct ctl_table_header *ctl_hdr; }; struct net_vrf { struct rtable __rcu *rth; struct rt6_info __rcu *rt6; #if IS_ENABLED(CONFIG_IPV6) struct fib6_table *fib6_table; #endif u32 tb_id; struct list_head me_list; /* entry in vrf_map_elem */ int ifindex; }; static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) { vrf_dev->stats.tx_errors++; kfree_skb(skb); } static struct vrf_map *netns_vrf_map(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); return &nn_vrf->vmap; } static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev) { return netns_vrf_map(dev_net(dev)); } static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me) { struct list_head *me_head = &me->vrf_list; struct net_vrf *vrf; if (list_empty(me_head)) return -ENODEV; vrf = list_first_entry(me_head, struct net_vrf, me_list); return vrf->ifindex; } static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags) { struct vrf_map_elem *me; me = kmalloc(sizeof(*me), flags); if (!me) return NULL; return me; } static void vrf_map_elem_free(struct vrf_map_elem *me) { kfree(me); } static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id, int ifindex, int users) { me->table_id = table_id; me->ifindex = ifindex; me->users = users; INIT_LIST_HEAD(&me->vrf_list); } static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap, u32 table_id) { struct vrf_map_elem *me; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_for_each_possible(vmap->ht, me, hnode, key) { if (me->table_id == table_id) return me; } return NULL; } static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me) { u32 table_id = me->table_id; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_add(vmap->ht, &me->hnode, key); } static void vrf_map_del_elem(struct vrf_map_elem *me) { hash_del(&me->hnode); } static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock) { spin_lock(&vmap->vmap_lock); } static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock) { spin_unlock(&vmap->vmap_lock); } /* called with rtnl lock held */ static int vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); struct vrf_map_elem *new_me, *me; u32 table_id = vrf->tb_id; bool free_new_me = false; int users; int res; /* we pre-allocate elements used in the spin-locked section (so that we * keep the spinlock as short as possible). */ new_me = vrf_map_elem_alloc(GFP_KERNEL); if (!new_me) return -ENOMEM; vrf_map_elem_init(new_me, table_id, dev->ifindex, 0); vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) { me = new_me; vrf_map_add_elem(vmap, me); goto link_vrf; } /* we already have an entry in the vrf_map, so it means there is (at * least) a vrf registered on the specific table. */ free_new_me = true; if (vmap->strict_mode) { /* vrfs cannot share the same table */ NL_SET_ERR_MSG(extack, "Table is used by another VRF"); res = -EBUSY; goto unlock; } link_vrf: users = ++me->users; if (users == 2) ++vmap->shared_tables; list_add(&vrf->me_list, &me->vrf_list); res = 0; unlock: vrf_map_unlock(vmap); /* clean-up, if needed */ if (free_new_me) vrf_map_elem_free(new_me); return res; } /* called with rtnl lock held */ static void vrf_map_unregister_dev(struct net_device *dev) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); u32 table_id = vrf->tb_id; struct vrf_map_elem *me; int users; vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) goto unlock; list_del(&vrf->me_list); users = --me->users; if (users == 1) { --vmap->shared_tables; } else if (users == 0) { vrf_map_del_elem(me); /* no one will refer to this element anymore */ vrf_map_elem_free(me); } unlock: vrf_map_unlock(vmap); } /* return the vrf device index associated with the table_id */ static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id) { struct vrf_map *vmap = netns_vrf_map(net); struct vrf_map_elem *me; int ifindex; vrf_map_lock(vmap); if (!vmap->strict_mode) { ifindex = -EPERM; goto unlock; } me = vrf_map_lookup_elem(vmap, table_id); if (!me) { ifindex = -ENODEV; goto unlock; } ifindex = vrf_map_elem_get_vrf_ifindex(me); unlock: vrf_map_unlock(vmap); return ifindex; } /* by default VRF devices do not have a qdisc and are expected * to be created with only a single queue. */ static bool qdisc_tx_is_default(const struct net_device *dev) { struct netdev_queue *txq; if (dev->num_tx_queues > 1) return false; txq = netdev_get_tx_queue(dev, 0); return qdisc_txq_has_no_queue(txq); } /* Local traffic destined to local address. Reinsert the packet to rx * path, similar to loopback handling. */ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst) { unsigned int len = skb->len; skb_orphan(skb); skb_dst_set(skb, dst); /* set pkt_type to avoid skb hitting packet taps twice - * once on Tx and again in Rx processing */ skb->pkt_type = PACKET_LOOPBACK; skb->protocol = eth_type_trans(skb, dev); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) dev_dstats_rx_add(dev, len); else dev_dstats_rx_dropped(dev); return NETDEV_TX_OK; } static void vrf_nf_set_untracked(struct sk_buff *skb) { if (skb_get_nfct(skb) == 0) nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } static void vrf_nf_reset_ct(struct sk_buff *skb) { if (skb_get_nfct(skb) == IP_CT_UNTRACKED) nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) static int vrf_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { const struct ipv6hdr *iph; struct net *net = dev_net(skb->dev); struct flowi6 fl6; int ret = NET_XMIT_DROP; struct dst_entry *dst; struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) goto err; iph = ipv6_hdr(skb); memset(&fl6, 0, sizeof(fl6)); /* needed to match OIF rule */ fl6.flowi6_l3mdev = dev->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = iph->nexthdr; dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst) || dst == dst_null) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. */ if (dst->dev == dev) return vrf_local_xmit(skb, dev, dst); skb_dst_set(skb, dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; return ret; err: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #else static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #endif /* based on ip_local_out; can't use it b/c the dst is switched pointing to us */ static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { struct iphdr *ip4h; int ret = NET_XMIT_DROP; struct flowi4 fl4; struct net *net = dev_net(vrf_dev); struct rtable *rt; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) goto err; ip4h = ip_hdr(skb); memset(&fl4, 0, sizeof(fl4)); /* needed to match OIF rule */ fl4.flowi4_l3mdev = vrf_dev->ifindex; fl4.flowi4_iif = LOOPBACK_IFINDEX; fl4.flowi4_dscp = ip4h_dscp(ip4h); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = ip4h->protocol; fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. */ if (rt->dst.dev == vrf_dev) return vrf_local_xmit(skb, vrf_dev, &rt->dst); skb_dst_set(skb, &rt->dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); if (!ip4h->saddr) { ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, RT_SCOPE_LINK); } memset(IPCB(skb), 0, sizeof(*IPCB(skb))); ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; out: return ret; err: vrf_tx_error(vrf_dev, skb); goto out; } static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) { switch (skb->protocol) { case htons(ETH_P_IP): return vrf_process_v4_outbound(skb, dev); case htons(ETH_P_IPV6): return vrf_process_v6_outbound(skb, dev); default: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } } static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned int len = skb->len; netdev_tx_t ret; ret = is_ip_tx_frame(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) dev_dstats_tx_add(dev, len); else dev_dstats_tx_dropped(dev); return ret; } static void vrf_finish_direct(struct sk_buff *skb) { struct net_device *vrf_dev = skb->dev; if (!list_empty(&vrf_dev->ptype_all) && likely(skb_headroom(skb) >= ETH_HLEN)) { struct ethhdr *eth = skb_push(skb, ETH_HLEN); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth_zero_addr(eth->h_dest); eth->h_proto = skb->protocol; rcu_read_lock_bh(); dev_queue_xmit_nit(skb, vrf_dev); rcu_read_unlock_bh(); skb_pull(skb, ETH_HLEN); } vrf_nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) /* modelled after ip6_finish_output2 */ static int vrf_finish_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; const struct in6_addr *nexthop; struct neighbour *neigh; int ret; vrf_nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; rcu_read_lock(); nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); rcu_read_unlock(); return ret; } rcu_read_unlock(); IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EINVAL; } /* modelled after ip6_output */ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb_dst(skb)->dev, vrf_finish_output6, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rt6_info *rt6; rcu_read_lock(); rt6 = rcu_dereference(vrf->rt6); if (likely(rt6)) { dst = &rt6->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output6_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip6_local_out(net, sk, skb); } static int vrf_output6_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IPV6); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output6_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip6_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); if (likely(err == 1)) err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert link scope packets */ if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return vrf_ip6_out_direct(vrf_dev, sk, skb); return vrf_ip6_out_redirect(vrf_dev, skb); } /* holding rtnl */ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { struct rt6_info *rt6 = rtnl_dereference(vrf->rt6); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rt6, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rt6) { dst = &rt6->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rt6_create(struct net_device *dev) { int flags = DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); struct rt6_info *rt6; int rc = -ENOMEM; /* IPv6 can be CONFIG enabled and then disabled runtime */ if (!ipv6_mod_enabled()) return 0; vrf->fib6_table = fib6_new_table(net, vrf->tb_id); if (!vrf->fib6_table) goto out; /* create a dst for routing packets out a VRF device */ rt6 = ip6_dst_alloc(net, dev, flags); if (!rt6) goto out; rt6->dst.output = vrf_output6; rcu_assign_pointer(vrf->rt6, rt6); rc = 0; out: return rc; } #else static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { return skb; } static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } static int vrf_rt6_create(struct net_device *dev) { return 0; } #endif /* modelled after ip_finish_output2 */ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; vrf_nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) { dev->stats.tx_errors++; return -ENOMEM; } } rcu_read_lock(); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int ret; sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ ret = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock(); return ret; } rcu_read_unlock(); vrf_tx_error(skb->dev, skb); return -EINVAL; } static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, vrf_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rtable *rth; rcu_read_lock(); rth = rcu_dereference(vrf->rth); if (likely(rth)) { dst = &rth->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip_local_out(net, sk, skb); } static int vrf_output_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IP); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip_out_direct_finish); if (likely(err == 1)) err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert multicast or local broadcast */ if (ipv4_is_multicast(ip_hdr(skb)->daddr) || ipv4_is_lbcast(ip_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return vrf_ip_out_direct(vrf_dev, sk, skb); return vrf_ip_out_redirect(vrf_dev, skb); } /* called with rcu lock held */ static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_out(vrf_dev, sk, skb); case AF_INET6: return vrf_ip6_out(vrf_dev, sk, skb); } return skb; } /* holding rtnl */ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) { struct rtable *rth = rtnl_dereference(vrf->rth); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rth, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rth) { dst = &rth->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rtable_create(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); struct rtable *rth; if (!fib_new_table(dev_net(dev), vrf->tb_id)) return -ENOMEM; /* create a dst for routing packets out through a VRF device */ rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1); if (!rth) return -ENOMEM; rth->dst.output = vrf_output; rcu_assign_pointer(vrf->rth, rth); return 0; } /**************************** device handling ********************/ /* cycle interface to flush neighbor cache and move routes across tables */ static void cycle_netdev(struct net_device *dev, struct netlink_ext_ack *extack) { unsigned int flags = dev->flags; int ret; if (!netif_running(dev)) return; ret = dev_change_flags(dev, flags & ~IFF_UP, extack); if (ret >= 0) ret = dev_change_flags(dev, flags, extack); if (ret < 0) { netdev_err(dev, "Failed to cycle device %s; route tables might be wrong!\n", dev->name); } } static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { int ret; /* do not allow loopback device to be enslaved to a VRF. * The vrf device acts as the loopback for the vrf. */ if (port_dev == dev_net(dev)->loopback_dev) { NL_SET_ERR_MSG(extack, "Can not enslave loopback device to a VRF"); return -EOPNOTSUPP; } port_dev->priv_flags |= IFF_L3MDEV_SLAVE; ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); if (ret < 0) goto err; cycle_netdev(port_dev, extack); return 0; err: port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; return ret; } static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { if (netif_is_l3_master(port_dev)) { NL_SET_ERR_MSG(extack, "Can not enslave an L3 master device to a VRF"); return -EINVAL; } if (netif_is_l3_slave(port_dev)) return -EINVAL; return do_vrf_add_slave(dev, port_dev, extack); } /* inverse of do_vrf_add_slave */ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { netdev_upper_dev_unlink(port_dev, dev); port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; cycle_netdev(port_dev, NULL); return 0; } static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { return do_vrf_del_slave(dev, port_dev); } static void vrf_dev_uninit(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); vrf_rtable_release(dev, vrf); vrf_rt6_release(dev, vrf); } static int vrf_dev_init(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); /* create the default dst which points back to us */ if (vrf_rtable_create(dev) != 0) goto out_nomem; if (vrf_rt6_create(dev) != 0) goto out_rth; dev->flags = IFF_MASTER | IFF_NOARP; /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; netdev_lockdep_set_classes(dev); return 0; out_rth: vrf_rtable_release(dev, vrf); out_nomem: return -ENOMEM; } static const struct net_device_ops vrf_netdev_ops = { .ndo_init = vrf_dev_init, .ndo_uninit = vrf_dev_uninit, .ndo_start_xmit = vrf_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_add_slave = vrf_add_slave, .ndo_del_slave = vrf_del_slave, }; static u32 vrf_fib_table(const struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); return vrf->tb_id; } static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook, struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1) skb = NULL; /* kfree_skb(skb) handled by nf code */ return skb; } static int vrf_prepare_mac_header(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto) { struct ethhdr *eth; int err; /* in general, we do not know if there is enough space in the head of * the packet for hosting the mac header. */ err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev)); if (unlikely(err)) /* no space in the skb head */ return -ENOBUFS; __skb_push(skb, ETH_HLEN); eth = (struct ethhdr *)skb->data; skb_reset_mac_header(skb); skb_reset_mac_len(skb); /* we set the ethernet destination and the source addresses to the * address of the VRF device. */ ether_addr_copy(eth->h_dest, vrf_dev->dev_addr); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth->h_proto = htons(proto); /* the destination address of the Ethernet frame corresponds to the * address set on the VRF interface; therefore, the packet is intended * to be processed locally. */ skb->protocol = eth->h_proto; skb->pkt_type = PACKET_HOST; skb_postpush_rcsum(skb, skb->data, ETH_HLEN); skb_pull_inline(skb, ETH_HLEN); return 0; } /* prepare and add the mac header to the packet if it was not set previously. * In this way, packet sniffers such as tcpdump can parse the packet correctly. * If the mac header was already set, the original mac header is left * untouched and the function returns immediately. */ static int vrf_add_mac_header_if_unset(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto, struct net_device *orig_dev) { if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev)) return 0; return vrf_prepare_mac_header(skb, vrf_dev, proto); } #if IS_ENABLED(CONFIG_IPV6) /* neighbor handling is done with actual device; do not want * to flip skb->dev for those ndisc packets. This really fails * for multiple next protocols (e.g., NEXTHDR_HOP). But it is * a start. */ static bool ipv6_ndisc_frame(const struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); bool rc = false; if (iph->nexthdr == NEXTHDR_ICMP) { const struct icmp6hdr *icmph; struct icmp6hdr _icmph; icmph = skb_header_pointer(skb, sizeof(*iph), sizeof(_icmph), &_icmph); if (!icmph) goto out; switch (icmph->icmp6_type) { case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: rc = true; break; } } out: return rc; } static struct rt6_info *vrf_ip6_route_lookup(struct net *net, const struct net_device *dev, struct flowi6 *fl6, int ifindex, const struct sk_buff *skb, int flags) { struct net_vrf *vrf = netdev_priv(dev); return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags); } static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, int ifindex) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct flowi6 fl6 = { .flowi6_iif = ifindex, .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), }; struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; skb_dst_drop(skb); rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb, RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); if (unlikely(!rt6)) return; if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst)) return; skb_dst_set(skb, &rt6->dst); } static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = skb->skb_iif; bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); bool is_ndisc = ipv6_ndisc_frame(skb); /* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. * For non-loopback strict packets, determine the dst using the original * ifindex. */ if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; if (skb->pkt_type == PACKET_LOOPBACK) skb->pkt_type = PACKET_HOST; else vrf_ip6_input_dst(skb, vrf_dev, orig_iif); goto out; } /* if packet is NDISC then keep the ingress interface */ if (!is_ndisc) { struct net_device *orig_dev = skb->dev; dev_dstats_rx_add(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IPV6, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } IP6CB(skb)->flags |= IP6SKB_L3SLAVE; } if (need_strict) vrf_ip6_input_dst(skb, vrf_dev, orig_iif); skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } #else static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { return skb; } #endif static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_device *orig_dev = skb->dev; skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IPCB(skb)->flags |= IPSKB_L3SLAVE; if (ipv4_is_multicast(ip_hdr(skb)->daddr)) goto out; /* loopback traffic; do not push through packet taps again. * Reset pkt_type for upper layers to process skb */ if (skb->pkt_type == PACKET_LOOPBACK) { skb->pkt_type = PACKET_HOST; goto out; } dev_dstats_rx_add(vrf_dev, skb->len); if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } /* called with rcu lock held */ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_rcv(vrf_dev, skb); case AF_INET6: return vrf_ip6_rcv(vrf_dev, skb); } return skb; } #if IS_ENABLED(CONFIG_IPV6) /* send to link-local or multicast address via interface enslaved to * VRF device. Force lookup to VRF table without changing flow struct * Note: Caller to this function must hold rcu_read_lock() and no refcnt * is taken on the dst by this function. */ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, struct flowi6 *fl6) { struct net *net = dev_net(dev); int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; struct dst_entry *dst = NULL; struct rt6_info *rt; /* VRF device does not have a link-local address and * sending packets to link-local or mcast addresses over * a VRF device does not make sense */ if (fl6->flowi6_oif == dev->ifindex) { dst = &net->ipv6.ip6_null_entry->dst; return dst; } if (!ipv6_addr_any(&fl6->saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags); if (rt) dst = &rt->dst; return dst; } #endif static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_link_scope_lookup = vrf_link_scope_lookup, #endif }; static void vrf_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); } static const struct ethtool_ops vrf_ethtool_ops = { .get_drvinfo = vrf_get_drvinfo, }; static inline size_t vrf_fib_rule_nl_size(void) { size_t sz; sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)); sz += nla_total_size(sizeof(u8)); /* FRA_L3MDEV */ sz += nla_total_size(sizeof(u32)); /* FRA_PRIORITY */ sz += nla_total_size(sizeof(u8)); /* FRA_PROTOCOL */ return sz; } static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) { struct fib_rule_hdr *frh; struct nlmsghdr *nlh; struct sk_buff *skb; int err; if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) && !ipv6_mod_enabled()) return 0; skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0); if (!nlh) goto nla_put_failure; /* rule only needs to appear once */ nlh->nlmsg_flags |= NLM_F_EXCL; frh = nlmsg_data(nlh); memset(frh, 0, sizeof(*frh)); frh->family = family; frh->action = FR_ACT_TO_TBL; if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL)) goto nla_put_failure; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF)) goto nla_put_failure; nlmsg_end(skb, nlh); if (add_it) { err = fib_newrule(dev_net(dev), skb, nlh, NULL, true); if (err == -EEXIST) err = 0; } else { err = fib_delrule(dev_net(dev), skb, nlh, NULL, true); if (err == -ENOENT) err = 0; } nlmsg_free(skb); return err; nla_put_failure: nlmsg_free(skb); return -EMSGSIZE; } static int vrf_add_fib_rules(const struct net_device *dev) { int err; err = vrf_fib_rule(dev, AF_INET, true); if (err < 0) goto out_err; err = vrf_fib_rule(dev, AF_INET6, true); if (err < 0) goto ipv6_err; #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true); if (err < 0) goto ipmr_err; #endif #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); if (err < 0) goto ip6mr_err; #endif return 0; #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) ip6mr_err: vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); #endif #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) ipmr_err: vrf_fib_rule(dev, AF_INET6, false); #endif ipv6_err: vrf_fib_rule(dev, AF_INET, false); out_err: netdev_err(dev, "Failed to add FIB rules.\n"); return err; } static void vrf_setup(struct net_device *dev) { ether_setup(dev); /* Initialize the device structure. */ dev->netdev_ops = &vrf_netdev_ops; dev->l3mdev_ops = &vrf_l3mdev_ops; dev->ethtool_ops = &vrf_ethtool_ops; dev->needs_free_netdev = true; /* Fill in device structure with ethernet-generic values. */ eth_hw_addr_random(dev); /* don't acquire vrf device's netif_tx_lock when transmitting */ dev->lltx = true; /* don't allow vrf devices to change network namespaces. */ dev->netns_immutable = true; /* does not make sense for a VLAN to be added to a vrf device */ dev->features |= NETIF_F_VLAN_CHALLENGED; /* enable offload features */ dev->features |= NETIF_F_GSO_SOFTWARE; dev->features |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC; dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->hw_features = dev->features; dev->hw_enc_features = dev->features; /* default to no qdisc; user can add if desired */ dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_NO_RX_HANDLER; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; /* VRF devices do not care about MTU, but if the MTU is set * too low then the ipv4 and ipv6 protocols are disabled * which breaks networking. */ dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU; dev->mtu = dev->max_mtu; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Invalid hardware address"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG(extack, "Invalid hardware address"); return -EADDRNOTAVAIL; } } return 0; } static void vrf_dellink(struct net_device *dev, struct list_head *head) { struct net_device *port_dev; struct list_head *iter; netdev_for_each_lower_dev(dev, port_dev, iter) vrf_del_slave(dev, port_dev); vrf_map_unregister_dev(dev); unregister_netdevice_queue(dev, head); } static int vrf_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net_vrf *vrf = netdev_priv(dev); struct nlattr **data = params->data; struct netns_vrf *nn_vrf; bool *add_fib_rules; struct net *net; int err; if (!data || !data[IFLA_VRF_TABLE]) { NL_SET_ERR_MSG(extack, "VRF table id is missing"); return -EINVAL; } vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); if (vrf->tb_id == RT_TABLE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE], "Invalid VRF table id"); return -EINVAL; } dev->priv_flags |= IFF_L3MDEV_MASTER; err = register_netdevice(dev); if (err) goto out; /* mapping between table_id and vrf; * note: such binding could not be done in the dev init function * because dev->ifindex id is not available yet. */ vrf->ifindex = dev->ifindex; err = vrf_map_register_dev(dev, extack); if (err) { unregister_netdevice(dev); goto out; } net = dev_net(dev); nn_vrf = net_generic(net, vrf_net_id); add_fib_rules = &nn_vrf->add_fib_rules; if (*add_fib_rules) { err = vrf_add_fib_rules(dev); if (err) { vrf_map_unregister_dev(dev); unregister_netdevice(dev); goto out; } *add_fib_rules = false; } out: return err; } static size_t vrf_nl_getsize(const struct net_device *dev) { return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */ } static int vrf_fillinfo(struct sk_buff *skb, const struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id); } static size_t vrf_get_slave_size(const struct net_device *bond_dev, const struct net_device *slave_dev) { return nla_total_size(sizeof(u32)); /* IFLA_VRF_PORT_TABLE */ } static int vrf_fill_slave_info(struct sk_buff *skb, const struct net_device *vrf_dev, const struct net_device *slave_dev) { struct net_vrf *vrf = netdev_priv(vrf_dev); if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id)) return -EMSGSIZE; return 0; } static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = { [IFLA_VRF_TABLE] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vrf_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct net_vrf), .get_size = vrf_nl_getsize, .policy = vrf_nl_policy, .validate = vrf_validate, .fill_info = vrf_fillinfo, .get_slave_size = vrf_get_slave_size, .fill_slave_info = vrf_fill_slave_info, .newlink = vrf_newlink, .dellink = vrf_dellink, .setup = vrf_setup, .maxtype = IFLA_VRF_MAX, }; static int vrf_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* only care about unregister events to drop slave references */ if (event == NETDEV_UNREGISTER) { struct net_device *vrf_dev; if (!netif_is_l3_slave(dev)) goto out; vrf_dev = netdev_master_upper_dev_get(dev); vrf_del_slave(vrf_dev, dev); } out: return NOTIFY_DONE; } static struct notifier_block vrf_notifier_block __read_mostly = { .notifier_call = vrf_device_event, }; static int vrf_map_init(struct vrf_map *vmap) { spin_lock_init(&vmap->vmap_lock); hash_init(vmap->ht); vmap->strict_mode = false; return 0; } #ifdef CONFIG_SYSCTL static bool vrf_strict_mode(struct vrf_map *vmap) { bool strict_mode; vrf_map_lock(vmap); strict_mode = vmap->strict_mode; vrf_map_unlock(vmap); return strict_mode; } static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode) { bool *cur_mode; int res = 0; vrf_map_lock(vmap); cur_mode = &vmap->strict_mode; if (*cur_mode == new_mode) goto unlock; if (*cur_mode) { /* disable strict mode */ *cur_mode = false; } else { if (vmap->shared_tables) { /* we cannot allow strict_mode because there are some * vrfs that share one or more tables. */ res = -EBUSY; goto unlock; } /* no tables are shared among vrfs, so we can go back * to 1:1 association between a vrf with its table. */ *cur_mode = true; } unlock: vrf_map_unlock(vmap); return res; } static int vrf_shared_table_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)table->extra1; struct vrf_map *vmap = netns_vrf_map(net); int proc_strict_mode = 0; struct ctl_table tmp = { .procname = table->procname, .data = &proc_strict_mode, .maxlen = sizeof(int), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }; int ret; if (!write) proc_strict_mode = vrf_strict_mode(vmap); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode); return ret; } static const struct ctl_table vrf_table[] = { { .procname = "strict_mode", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = vrf_shared_table_handler, /* set by the vrf_netns_init */ .extra1 = NULL, }, }; static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) { struct ctl_table *table; table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL); if (!table) return -ENOMEM; /* init the extra1 parameter with the reference to current netns */ table[0].extra1 = net; nn_vrf->ctl_hdr = register_net_sysctl_sz(net, "net/vrf", table, ARRAY_SIZE(vrf_table)); if (!nn_vrf->ctl_hdr) { kfree(table); return -ENOMEM; } return 0; } static void vrf_netns_exit_sysctl(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); const struct ctl_table *table; table = nn_vrf->ctl_hdr->ctl_table_arg; unregister_net_sysctl_table(nn_vrf->ctl_hdr); kfree(table); } #else static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) { return 0; } static void vrf_netns_exit_sysctl(struct net *net) { } #endif /* Initialize per network namespace state */ static int __net_init vrf_netns_init(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); nn_vrf->add_fib_rules = true; vrf_map_init(&nn_vrf->vmap); return vrf_netns_init_sysctl(net, nn_vrf); } static void __net_exit vrf_netns_exit(struct net *net) { vrf_netns_exit_sysctl(net); } static struct pernet_operations vrf_net_ops __net_initdata = { .init = vrf_netns_init, .exit = vrf_netns_exit, .id = &vrf_net_id, .size = sizeof(struct netns_vrf), }; static int __init vrf_init_module(void) { int rc; register_netdevice_notifier(&vrf_notifier_block); rc = register_pernet_subsys(&vrf_net_ops); if (rc < 0) goto error; rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF, vrf_ifindex_lookup_by_table_id); if (rc < 0) goto unreg_pernet; rc = rtnl_link_register(&vrf_link_ops); if (rc < 0) goto table_lookup_unreg; return 0; table_lookup_unreg: l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF, vrf_ifindex_lookup_by_table_id); unreg_pernet: unregister_pernet_subsys(&vrf_net_ops); error: unregister_netdevice_notifier(&vrf_notifier_block); return rc; } module_init(vrf_init_module); MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); MODULE_VERSION(DRV_VERSION);
3 6 5 2 3 3 12 1 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPV6 GSO/GRO offload support * Linux INET6 implementation * * TCPv6 GSO/GRO support */ #include <linux/indirect_call_wrapper.h> #include <linux/skbuff.h> #include <net/inet6_hashtables.h> #include <net/gro.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/ip6_checksum.h> #include "ip6_offload.h" static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, struct tcphdr *th) { #if IS_ENABLED(CONFIG_IPV6) const struct ipv6hdr *hdr; struct sk_buff *p; struct sock *sk; struct net *net; int iif, sdif; if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST))) return; p = tcp_gro_lookup(head, th); if (p) { NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist; return; } inet6_get_iif_sdif(skb, &iif, &sdif); hdr = skb_gro_network_header(skb); net = dev_net_rcu(skb->dev); sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; if (sk) sock_gen_put(sk); #endif /* IS_ENABLED(CONFIG_IPV6) */ } INDIRECT_CALLABLE_SCOPE struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) { struct tcphdr *th; /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && skb_gro_checksum_validate(skb, IPPROTO_TCP, ip6_gro_compute_pseudo)) goto flush; th = tcp_gro_pull_header(skb); if (!th) goto flush; tcp6_check_fraglist_gro(head, skb, th); return tcp_gro_receive(head, skb, th); flush: NAPI_GRO_CB(skb)->flush = 1; return NULL; } INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6; skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; __skb_incr_checksum_unnecessary(skb); return 0; } th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr, &iph->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; tcp_gro_complete(skb); return 0; } static void __tcpv6_gso_segment_csum(struct sk_buff *seg, struct in6_addr *oldip, const struct in6_addr *newip, __be16 *oldport, __be16 newport) { struct tcphdr *th = tcp_hdr(seg); if (!ipv6_addr_equal(oldip, newip)) { inet_proto_csum_replace16(&th->check, seg, oldip->s6_addr32, newip->s6_addr32, true); *oldip = *newip; } if (*oldport == newport) return; inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); *oldport = newport; } static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs) { const struct tcphdr *th; const struct ipv6hdr *iph; struct sk_buff *seg; struct tcphdr *th2; struct ipv6hdr *iph2; seg = segs; th = tcp_hdr(seg); iph = ipv6_hdr(seg); th2 = tcp_hdr(seg->next); iph2 = ipv6_hdr(seg->next); if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) && ipv6_addr_equal(&iph->saddr, &iph2->saddr) && ipv6_addr_equal(&iph->daddr, &iph2->daddr)) return segs; while ((seg = seg->next)) { th2 = tcp_hdr(seg); iph2 = ipv6_hdr(seg); __tcpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr, &th2->source, th->source); __tcpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr, &th2->dest, th->dest); } return segs; } static struct sk_buff *__tcp6_gso_segment_list(struct sk_buff *skb, netdev_features_t features) { skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); if (IS_ERR(skb)) return skb; return __tcpv6_gso_segment_list_csum(skb); } static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct tcphdr *th; if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)) return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*th))) return ERR_PTR(-EINVAL); if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) { struct tcphdr *th = tcp_hdr(skb); if (skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size) return __tcp6_gso_segment_list(skb, features); skb->ip_summed = CHECKSUM_NONE; } if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct tcphdr *th = tcp_hdr(skb); /* Set up pseudo header, usually expect stack to have done * this. */ th->check = 0; skb->ip_summed = CHECKSUM_PARTIAL; __tcp_v6_send_check(skb, &ipv6h->saddr, &ipv6h->daddr); } return tcp_gso_segment(skb, features); } int __init tcpv6_offload_init(void) { net_hotdata.tcpv6_offload = (struct net_offload) { .callbacks = { .gso_segment = tcp6_gso_segment, .gro_receive = tcp6_gro_receive, .gro_complete = tcp6_gro_complete, }, }; return inet6_add_offload(&net_hotdata.tcpv6_offload, IPPROTO_TCP); }
168 90 97 48 1 1 3 1 43 32 1 30 1 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0-or-later /** -*- linux-c -*- *********************************************************** * Linux PPP over X/Ethernet (PPPoX/PPPoE) Sockets * * PPPoX --- Generic PPP encapsulation socket family * PPPoE --- PPP over Ethernet (RFC 2516) * * Version: 0.5.2 * * Author: Michal Ostrowski <mostrows@speakeasy.net> * * 051000 : Initialization cleanup * * License: */ #include <linux/string.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/compat.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/init.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/ppp_channel.h> #include <linux/kmod.h> #include <net/sock.h> #include <linux/uaccess.h> static const struct pppox_proto *pppox_protos[PX_MAX_PROTO + 1]; int register_pppox_proto(int proto_num, const struct pppox_proto *pp) { if (proto_num < 0 || proto_num > PX_MAX_PROTO) return -EINVAL; if (pppox_protos[proto_num]) return -EALREADY; pppox_protos[proto_num] = pp; return 0; } void unregister_pppox_proto(int proto_num) { if (proto_num >= 0 && proto_num <= PX_MAX_PROTO) pppox_protos[proto_num] = NULL; } void pppox_unbind_sock(struct sock *sk) { /* Clear connection to ppp device, if attached. */ if (sk->sk_state & (PPPOX_BOUND | PPPOX_CONNECTED)) { ppp_unregister_channel(&pppox_sk(sk)->chan); sk->sk_state = PPPOX_DEAD; } } EXPORT_SYMBOL(register_pppox_proto); EXPORT_SYMBOL(unregister_pppox_proto); EXPORT_SYMBOL(pppox_unbind_sock); int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); int rc; lock_sock(sk); switch (cmd) { case PPPIOCGCHAN: { int index; rc = -ENOTCONN; if (!(sk->sk_state & PPPOX_CONNECTED)) break; rc = -EINVAL; index = ppp_channel_index(&po->chan); if (put_user(index , (int __user *) arg)) break; rc = 0; sk->sk_state |= PPPOX_BOUND; break; } default: rc = pppox_protos[sk->sk_protocol]->ioctl ? pppox_protos[sk->sk_protocol]->ioctl(sock, cmd, arg) : -ENOTTY; } release_sock(sk); return rc; } EXPORT_SYMBOL(pppox_ioctl); #ifdef CONFIG_COMPAT int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { if (cmd == PPPOEIOCSFWD32) cmd = PPPOEIOCSFWD; return pppox_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } EXPORT_SYMBOL(pppox_compat_ioctl); #endif static int pppox_create(struct net *net, struct socket *sock, int protocol, int kern) { int rc = -EPROTOTYPE; if (protocol < 0 || protocol > PX_MAX_PROTO) goto out; rc = -EPROTONOSUPPORT; if (!pppox_protos[protocol]) request_module("net-pf-%d-proto-%d", PF_PPPOX, protocol); if (!pppox_protos[protocol] || !try_module_get(pppox_protos[protocol]->owner)) goto out; rc = pppox_protos[protocol]->create(net, sock, kern); module_put(pppox_protos[protocol]->owner); out: return rc; } static const struct net_proto_family pppox_proto_family = { .family = PF_PPPOX, .create = pppox_create, .owner = THIS_MODULE, }; static int __init pppox_init(void) { return sock_register(&pppox_proto_family); } static void __exit pppox_exit(void) { sock_unregister(PF_PPPOX); } module_init(pppox_init); module_exit(pppox_exit); MODULE_AUTHOR("Michal Ostrowski <mostrows@speakeasy.net>"); MODULE_DESCRIPTION("PPP over Ethernet driver (generic socket layer)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PPPOX);
75 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 // SPDX-License-Identifier: GPL-2.0-only /* x_tables module for setting the IPv4/IPv6 DSCP field, Version 1.8 * * (C) 2002 by Harald Welte <laforge@netfilter.org> * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com> * * See RFC2474 for a description of the DSCP field within the IP Header. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/dsfield.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_DSCP.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: DSCP/TOS field modification"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_DSCP"); MODULE_ALIAS("ip6t_DSCP"); MODULE_ALIAS("ipt_TOS"); MODULE_ALIAS("ip6t_TOS"); #define XT_DSCP_ECN_MASK 3u static unsigned int dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_DSCP_info *dinfo = par->targinfo; u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; ipv4_change_dsfield(ip_hdr(skb), XT_DSCP_ECN_MASK, dinfo->dscp << XT_DSCP_SHIFT); } return XT_CONTINUE; } static unsigned int dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_DSCP_info *dinfo = par->targinfo; u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return NF_DROP; ipv6_change_dsfield(ipv6_hdr(skb), XT_DSCP_ECN_MASK, dinfo->dscp << XT_DSCP_SHIFT); } return XT_CONTINUE; } static int dscp_tg_check(const struct xt_tgchk_param *par) { const struct xt_DSCP_info *info = par->targinfo; if (info->dscp > XT_DSCP_MAX) return -EDOM; return 0; } static unsigned int tos_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tos_target_info *info = par->targinfo; struct iphdr *iph = ip_hdr(skb); u_int8_t orig, nv; orig = ipv4_get_dsfield(iph); nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ip_hdr(skb); ipv4_change_dsfield(iph, 0, nv); } return XT_CONTINUE; } static unsigned int tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tos_target_info *info = par->targinfo; struct ipv6hdr *iph = ipv6_hdr(skb); u_int8_t orig, nv; orig = ipv6_get_dsfield(iph); nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ipv6_hdr(skb); ipv6_change_dsfield(iph, 0, nv); } return XT_CONTINUE; } static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "DSCP", .family = NFPROTO_IPV4, .checkentry = dscp_tg_check, .target = dscp_tg, .targetsize = sizeof(struct xt_DSCP_info), .table = "mangle", .me = THIS_MODULE, }, { .name = "DSCP", .family = NFPROTO_IPV6, .checkentry = dscp_tg_check, .target = dscp_tg6, .targetsize = sizeof(struct xt_DSCP_info), .table = "mangle", .me = THIS_MODULE, }, { .name = "TOS", .revision = 1, .family = NFPROTO_IPV4, .table = "mangle", .target = tos_tg, .targetsize = sizeof(struct xt_tos_target_info), .me = THIS_MODULE, }, { .name = "TOS", .revision = 1, .family = NFPROTO_IPV6, .table = "mangle", .target = tos_tg6, .targetsize = sizeof(struct xt_tos_target_info), .me = THIS_MODULE, }, }; static int __init dscp_tg_init(void) { return xt_register_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg)); } static void __exit dscp_tg_exit(void) { xt_unregister_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg)); } module_init(dscp_tg_init); module_exit(dscp_tg_exit);
19 4 20 12 29 25 3 12 12 5 5 5 5 3 5 3 3 6 624 411 224 223 2 2 222 1 34 9 22 3 2 29 12 16 14 3 2 12 12 2 3 9 9 5 2 10 5 10 6 11 5 10 6 23 535 31 505 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for logging packets to userspace via * nfetlink. * * (C) 2005 by Harald Welte <laforge@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the old ipv4-only ipt_ULOG.c: * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/netfilter_bridge.h> #include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_log.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/spinlock.h> #include <linux/sysctl.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/list.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netfilter/nf_log.h> #include <net/netns/generic.h> #include <linux/atomic.h> #include <linux/refcount.h> #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include "../bridge/br_private.h" #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #define NFULNL_COPY_DISABLED 0xff #define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ #define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ /* max packet size is limited by 16-bit struct nfattr nfa_len field */ #define NFULNL_COPY_RANGE_MAX (0xFFFF - NLA_HDRLEN) #define PRINTR(x, args...) do { if (net_ratelimit()) \ printk(x, ## args); } while (0); struct nfulnl_instance { struct hlist_node hlist; /* global list of instances */ spinlock_t lock; refcount_t use; /* use count */ unsigned int qlen; /* number of nlmsgs in skb */ struct sk_buff *skb; /* pre-allocatd skb */ struct timer_list timer; struct net *net; netns_tracker ns_tracker; struct user_namespace *peer_user_ns; /* User namespace of the peer process */ u32 peer_portid; /* PORTID of the peer process */ /* configurable parameters */ unsigned int flushtimeout; /* timeout until queue flush */ unsigned int nlbufsiz; /* netlink buffer allocation size */ unsigned int qthreshold; /* threshold of the queue */ u_int32_t copy_range; u_int32_t seq; /* instance-local sequential counter */ u_int16_t group_num; /* number of this queue */ u_int16_t flags; u_int8_t copy_mode; struct rcu_head rcu; }; #define INSTANCE_BUCKETS 16 static unsigned int nfnl_log_net_id __read_mostly; struct nfnl_log_net { spinlock_t instances_lock; struct hlist_head instance_table[INSTANCE_BUCKETS]; atomic_t global_seq; }; static struct nfnl_log_net *nfnl_log_pernet(struct net *net) { return net_generic(net, nfnl_log_net_id); } static inline u_int8_t instance_hashfn(u_int16_t group_num) { return ((group_num & 0xff) % INSTANCE_BUCKETS); } static struct nfulnl_instance * __instance_lookup(const struct nfnl_log_net *log, u16 group_num) { const struct hlist_head *head; struct nfulnl_instance *inst; head = &log->instance_table[instance_hashfn(group_num)]; hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->group_num == group_num) return inst; } return NULL; } static inline void instance_get(struct nfulnl_instance *inst) { refcount_inc(&inst->use); } static struct nfulnl_instance * instance_lookup_get_rcu(const struct nfnl_log_net *log, u16 group_num) { struct nfulnl_instance *inst; inst = __instance_lookup(log, group_num); if (inst && !refcount_inc_not_zero(&inst->use)) inst = NULL; return inst; } static struct nfulnl_instance * instance_lookup_get(const struct nfnl_log_net *log, u16 group_num) { struct nfulnl_instance *inst; rcu_read_lock(); inst = instance_lookup_get_rcu(log, group_num); rcu_read_unlock(); return inst; } static void nfulnl_instance_free_rcu(struct rcu_head *head) { struct nfulnl_instance *inst = container_of(head, struct nfulnl_instance, rcu); put_net_track(inst->net, &inst->ns_tracker); kfree(inst); module_put(THIS_MODULE); } static void instance_put(struct nfulnl_instance *inst) { if (inst && refcount_dec_and_test(&inst->use)) call_rcu(&inst->rcu, nfulnl_instance_free_rcu); } static void nfulnl_timer(struct timer_list *t); static struct nfulnl_instance * instance_create(struct net *net, u_int16_t group_num, u32 portid, struct user_namespace *user_ns) { struct nfulnl_instance *inst; struct nfnl_log_net *log = nfnl_log_pernet(net); int err; spin_lock_bh(&log->instances_lock); if (__instance_lookup(log, group_num)) { err = -EEXIST; goto out_unlock; } inst = kzalloc(sizeof(*inst), GFP_ATOMIC); if (!inst) { err = -ENOMEM; goto out_unlock; } if (!try_module_get(THIS_MODULE)) { kfree(inst); err = -EAGAIN; goto out_unlock; } INIT_HLIST_NODE(&inst->hlist); spin_lock_init(&inst->lock); /* needs to be two, since we _put() after creation */ refcount_set(&inst->use, 2); timer_setup(&inst->timer, nfulnl_timer, 0); inst->net = get_net_track(net, &inst->ns_tracker, GFP_ATOMIC); inst->peer_user_ns = user_ns; inst->peer_portid = portid; inst->group_num = group_num; inst->qthreshold = NFULNL_QTHRESH_DEFAULT; inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT; inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT; inst->copy_mode = NFULNL_COPY_PACKET; inst->copy_range = NFULNL_COPY_RANGE_MAX; hlist_add_head_rcu(&inst->hlist, &log->instance_table[instance_hashfn(group_num)]); spin_unlock_bh(&log->instances_lock); return inst; out_unlock: spin_unlock_bh(&log->instances_lock); return ERR_PTR(err); } static void __nfulnl_flush(struct nfulnl_instance *inst); /* called with BH disabled */ static void __instance_destroy(struct nfulnl_instance *inst) { /* first pull it out of the global list */ hlist_del_rcu(&inst->hlist); /* then flush all pending packets from skb */ spin_lock(&inst->lock); /* lockless readers wont be able to use us */ inst->copy_mode = NFULNL_COPY_DISABLED; if (inst->skb) __nfulnl_flush(inst); spin_unlock(&inst->lock); /* and finally put the refcount */ instance_put(inst); } static inline void instance_destroy(struct nfnl_log_net *log, struct nfulnl_instance *inst) { spin_lock_bh(&log->instances_lock); __instance_destroy(inst); spin_unlock_bh(&log->instances_lock); } static int nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode, unsigned int range) { int status = 0; spin_lock_bh(&inst->lock); switch (mode) { case NFULNL_COPY_NONE: case NFULNL_COPY_META: inst->copy_mode = mode; inst->copy_range = 0; break; case NFULNL_COPY_PACKET: inst->copy_mode = mode; if (range == 0) range = NFULNL_COPY_RANGE_MAX; inst->copy_range = min_t(unsigned int, range, NFULNL_COPY_RANGE_MAX); break; default: status = -EINVAL; break; } spin_unlock_bh(&inst->lock); return status; } static int nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz) { int status; spin_lock_bh(&inst->lock); if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT) status = -ERANGE; else if (nlbufsiz > 131072) status = -ERANGE; else { inst->nlbufsiz = nlbufsiz; status = 0; } spin_unlock_bh(&inst->lock); return status; } static void nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout) { spin_lock_bh(&inst->lock); inst->flushtimeout = timeout; spin_unlock_bh(&inst->lock); } static void nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh) { spin_lock_bh(&inst->lock); inst->qthreshold = qthresh; spin_unlock_bh(&inst->lock); } static int nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags) { spin_lock_bh(&inst->lock); inst->flags = flags; spin_unlock_bh(&inst->lock); return 0; } static struct sk_buff * nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size, unsigned int pkt_size) { struct sk_buff *skb; unsigned int n; /* alloc skb which should be big enough for a whole multipart * message. WARNING: has to be <= 128k due to slab restrictions */ n = max(inst_size, pkt_size); skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); if (!skb) { if (n > pkt_size) { /* try to allocate only as much as we need for current * packet */ skb = alloc_skb(pkt_size, GFP_ATOMIC); } } return skb; } static void __nfulnl_send(struct nfulnl_instance *inst) { if (inst->qlen > 1) { struct nlmsghdr *nlh = nlmsg_put(inst->skb, 0, 0, NLMSG_DONE, sizeof(struct nfgenmsg), 0); if (WARN_ONCE(!nlh, "bad nlskb size: %u, tailroom %d\n", inst->skb->len, skb_tailroom(inst->skb))) { kfree_skb(inst->skb); goto out; } } nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid); out: inst->qlen = 0; inst->skb = NULL; } static void __nfulnl_flush(struct nfulnl_instance *inst) { /* timer holds a reference */ if (timer_delete(&inst->timer)) instance_put(inst); if (inst->skb) __nfulnl_send(inst); } static void nfulnl_timer(struct timer_list *t) { struct nfulnl_instance *inst = timer_container_of(inst, t, timer); spin_lock_bh(&inst->lock); if (inst->skb) __nfulnl_send(inst); spin_unlock_bh(&inst->lock); instance_put(inst); } static u32 nfulnl_get_bridge_size(const struct sk_buff *skb) { u32 size = 0; if (!skb_mac_header_was_set(skb)) return 0; if (skb_vlan_tag_present(skb)) { size += nla_total_size(0); /* nested */ size += nla_total_size(sizeof(u16)); /* id */ size += nla_total_size(sizeof(u16)); /* tag */ } if (skb->network_header > skb->mac_header) size += nla_total_size(skb->network_header - skb->mac_header); return size; } static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff *skb) { if (!skb_mac_header_was_set(skb)) return 0; if (skb_vlan_tag_present(skb)) { struct nlattr *nest; nest = nla_nest_start(inst->skb, NFULA_VLAN); if (!nest) goto nla_put_failure; if (nla_put_be16(inst->skb, NFULA_VLAN_TCI, htons(skb->vlan_tci)) || nla_put_be16(inst->skb, NFULA_VLAN_PROTO, skb->vlan_proto)) goto nla_put_failure; nla_nest_end(inst->skb, nest); } if (skb->mac_header < skb->network_header) { int len = (int)(skb->network_header - skb->mac_header); if (nla_put(inst->skb, NFULA_L2HDR, len, skb_mac_header(skb))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } /* This is an inline function, we don't really care about a long * list of arguments */ static inline int __build_packet_message(struct nfnl_log_net *log, struct nfulnl_instance *inst, const struct sk_buff *skb, unsigned int data_len, u_int8_t pf, unsigned int hooknum, const struct net_device *indev, const struct net_device *outdev, const char *prefix, unsigned int plen, const struct nfnl_ct_hook *nfnl_ct, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { struct nfulnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; sk_buff_data_t old_tail = inst->skb->tail; struct sock *sk; const unsigned char *hwhdrp; nlh = nfnl_msg_put(inst->skb, 0, 0, nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), 0, pf, NFNETLINK_V0, htons(inst->group_num)); if (!nlh) return -1; memset(&pmsg, 0, sizeof(pmsg)); pmsg.hw_protocol = skb->protocol; pmsg.hook = hooknum; if (nla_put(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg)) goto nla_put_failure; if (prefix && nla_put(inst->skb, NFULA_PREFIX, plen, prefix)) goto nla_put_failure; if (indev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; #else if (pf == PF_BRIDGE) { /* Case 1: outdev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, htonl(indev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by nf_hook_thresh or * nf_log_packet. */ nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { int physinif; /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; physinif = nf_bridge_get_physinif(skb); if (physinif && nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, htonl(physinif))) goto nla_put_failure; } #endif } if (outdev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; #else if (pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, htonl(outdev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by nf_hook_thresh or * nf_log_packet. */ nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) goto nla_put_failure; } else { struct net_device *physoutdev; /* Case 2: indev is a bridge group, we need to look * for physical device (when called from ipv4) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; physoutdev = nf_bridge_get_physoutdev(skb); if (physoutdev && nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, htonl(physoutdev->ifindex))) goto nla_put_failure; } #endif } if (skb->mark && nla_put_be32(inst->skb, NFULA_MARK, htonl(skb->mark))) goto nla_put_failure; if (indev && skb->dev && skb_mac_header_was_set(skb) && skb_mac_header_len(skb) != 0) { struct nfulnl_msg_packet_hw phw; int len; memset(&phw, 0, sizeof(phw)); len = dev_parse_header(skb, phw.hw_addr); if (len > 0) { phw.hw_addrlen = htons(len); if (nla_put(inst->skb, NFULA_HWADDR, sizeof(phw), &phw)) goto nla_put_failure; } } if (indev && skb_mac_header_was_set(skb)) { if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) || nla_put_be16(inst->skb, NFULA_HWLEN, htons(skb->dev->hard_header_len))) goto nla_put_failure; hwhdrp = skb_mac_header(skb); if (skb->dev->type == ARPHRD_SIT) hwhdrp -= ETH_HLEN; if (hwhdrp >= skb->head && nla_put(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len, hwhdrp)) goto nla_put_failure; } if (hooknum <= NF_INET_FORWARD) { struct timespec64 kts = ktime_to_timespec64(skb_tstamp_cond(skb, true)); struct nfulnl_msg_packet_timestamp ts; ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts)) goto nla_put_failure; } /* UID */ sk = skb->sk; if (sk && sk_fullsock(sk)) { read_lock_bh(&sk->sk_callback_lock); if (sk->sk_socket && sk->sk_socket->file) { struct file *file = sk->sk_socket->file; const struct cred *cred = file->f_cred; struct user_namespace *user_ns = inst->peer_user_ns; __be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid)); __be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid)); read_unlock_bh(&sk->sk_callback_lock); if (nla_put_be32(inst->skb, NFULA_UID, uid) || nla_put_be32(inst->skb, NFULA_GID, gid)) goto nla_put_failure; } else read_unlock_bh(&sk->sk_callback_lock); } /* local sequence number */ if ((inst->flags & NFULNL_CFG_F_SEQ) && nla_put_be32(inst->skb, NFULA_SEQ, htonl(inst->seq++))) goto nla_put_failure; /* global sequence number */ if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) && nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL, htonl(atomic_inc_return(&log->global_seq)))) goto nla_put_failure; if (ct && nfnl_ct->build(inst->skb, ct, ctinfo, NFULA_CT, NFULA_CT_INFO) < 0) goto nla_put_failure; if ((pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) && nfulnl_put_bridge(inst, skb) < 0) goto nla_put_failure; if (data_len) { struct nlattr *nla; int size = nla_attr_size(data_len); if (skb_tailroom(inst->skb) < nla_total_size(data_len)) goto nla_put_failure; nla = skb_put(inst->skb, nla_total_size(data_len)); nla->nla_type = NFULA_PAYLOAD; nla->nla_len = size; if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) BUG(); } nlh->nlmsg_len = inst->skb->tail - old_tail; return 0; nla_put_failure: PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n"); return -1; } static const struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_ULOG, .u = { .ulog = { .copy_len = 0xffff, .group = 0, .qthreshold = 1, }, }, }; /* log handler for internal netfilter logging api */ static void nfulnl_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *li_user, const char *prefix) { size_t size; unsigned int data_len; struct nfulnl_instance *inst; const struct nf_loginfo *li; unsigned int qthreshold; unsigned int plen = 0; struct nfnl_log_net *log = nfnl_log_pernet(net); const struct nfnl_ct_hook *nfnl_ct = NULL; enum ip_conntrack_info ctinfo = 0; struct nf_conn *ct = NULL; if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; else li = &default_loginfo; inst = instance_lookup_get_rcu(log, li->u.ulog.group); if (!inst) return; if (prefix) plen = strlen(prefix) + 1; /* FIXME: do we want to make the size calculation conditional based on * what is actually present? way more branches and checks, but more * memory efficient... */ size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + nla_total_size(sizeof(u_int32_t)) /* uid */ + nla_total_size(sizeof(u_int32_t)) /* gid */ + nla_total_size(plen) /* prefix */ + nla_total_size(sizeof(struct nfulnl_msg_packet_hw)) + nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp)) + nla_total_size(sizeof(struct nfgenmsg)); /* NLMSG_DONE */ if (in && skb_mac_header_was_set(skb)) { size += nla_total_size(skb->dev->hard_header_len) + nla_total_size(sizeof(u_int16_t)) /* hwtype */ + nla_total_size(sizeof(u_int16_t)); /* hwlen */ } spin_lock_bh(&inst->lock); if (inst->flags & NFULNL_CFG_F_SEQ) size += nla_total_size(sizeof(u_int32_t)); if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) size += nla_total_size(sizeof(u_int32_t)); #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (inst->flags & NFULNL_CFG_F_CONNTRACK) { nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfnl_ct != NULL) { ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } #endif if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) size += nfulnl_get_bridge_size(skb); qthreshold = inst->qthreshold; /* per-rule qthreshold overrides per-instance */ if (li->u.ulog.qthreshold) if (qthreshold > li->u.ulog.qthreshold) qthreshold = li->u.ulog.qthreshold; switch (inst->copy_mode) { case NFULNL_COPY_META: case NFULNL_COPY_NONE: data_len = 0; break; case NFULNL_COPY_PACKET: data_len = inst->copy_range; if ((li->u.ulog.flags & NF_LOG_F_COPY_LEN) && (li->u.ulog.copy_len < data_len)) data_len = li->u.ulog.copy_len; if (data_len > skb->len) data_len = skb->len; size += nla_total_size(data_len); break; case NFULNL_COPY_DISABLED: default: goto unlock_and_release; } if (inst->skb && size > skb_tailroom(inst->skb)) { /* either the queue len is too high or we don't have * enough room in the skb left. flush to userspace. */ __nfulnl_flush(inst); } if (!inst->skb) { inst->skb = nfulnl_alloc_skb(net, inst->peer_portid, inst->nlbufsiz, size); if (!inst->skb) goto alloc_failure; } inst->qlen++; __build_packet_message(log, inst, skb, data_len, pf, hooknum, in, out, prefix, plen, nfnl_ct, ct, ctinfo); if (inst->qlen >= qthreshold) __nfulnl_flush(inst); /* timer_pending always called within inst->lock, so there * is no chance of a race here */ else if (!timer_pending(&inst->timer)) { instance_get(inst); inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100); add_timer(&inst->timer); } unlock_and_release: spin_unlock_bh(&inst->lock); instance_put(inst); return; alloc_failure: /* FIXME: statistics */ goto unlock_and_release; } static int nfulnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct nfnl_log_net *log = nfnl_log_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this portid */ spin_lock_bh(&log->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *t2; struct nfulnl_instance *inst; struct hlist_head *head = &log->instance_table[i]; hlist_for_each_entry_safe(inst, t2, head, hlist) { if (n->portid == inst->peer_portid) __instance_destroy(inst); } } spin_unlock_bh(&log->instances_lock); } return NOTIFY_DONE; } static struct notifier_block nfulnl_rtnl_notifier = { .notifier_call = nfulnl_rcv_nl_event, }; static int nfulnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfula[]) { return -ENOTSUPP; } static struct nf_logger nfulnl_logger __read_mostly = { .name = "nfnetlink_log", .type = NF_LOG_TYPE_ULOG, .logfn = nfulnl_log_packet, .me = THIS_MODULE, }; static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = { [NFULA_CFG_CMD] = { .len = sizeof(struct nfulnl_msg_config_cmd) }, [NFULA_CFG_MODE] = { .len = sizeof(struct nfulnl_msg_config_mode) }, [NFULA_CFG_TIMEOUT] = { .type = NLA_U32 }, [NFULA_CFG_QTHRESH] = { .type = NLA_U32 }, [NFULA_CFG_NLBUFSIZ] = { .type = NLA_U32 }, [NFULA_CFG_FLAGS] = { .type = NLA_U16 }, }; static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfula[]) { struct nfnl_log_net *log = nfnl_log_pernet(info->net); u_int16_t group_num = ntohs(info->nfmsg->res_id); struct nfulnl_msg_config_cmd *cmd = NULL; struct nfulnl_instance *inst; u16 flags = 0; int ret = 0; if (nfula[NFULA_CFG_CMD]) { u_int8_t pf = info->nfmsg->nfgen_family; cmd = nla_data(nfula[NFULA_CFG_CMD]); /* Commands without queue context */ switch (cmd->command) { case NFULNL_CFG_CMD_PF_BIND: return nf_log_bind_pf(info->net, pf, &nfulnl_logger); case NFULNL_CFG_CMD_PF_UNBIND: nf_log_unbind_pf(info->net, pf); return 0; } } inst = instance_lookup_get(log, group_num); if (inst && inst->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto out_put; } /* Check if we support these flags in first place, dependencies should * be there too not to break atomicity. */ if (nfula[NFULA_CFG_FLAGS]) { flags = ntohs(nla_get_be16(nfula[NFULA_CFG_FLAGS])); if ((flags & NFULNL_CFG_F_CONNTRACK) && !rcu_access_pointer(nfnl_ct_hook)) { #ifdef CONFIG_MODULES nfnl_unlock(NFNL_SUBSYS_ULOG); request_module("ip_conntrack_netlink"); nfnl_lock(NFNL_SUBSYS_ULOG); if (rcu_access_pointer(nfnl_ct_hook)) { ret = -EAGAIN; goto out_put; } #endif ret = -EOPNOTSUPP; goto out_put; } } if (cmd != NULL) { switch (cmd->command) { case NFULNL_CFG_CMD_BIND: if (inst) { ret = -EBUSY; goto out_put; } inst = instance_create(info->net, group_num, NETLINK_CB(skb).portid, sk_user_ns(NETLINK_CB(skb).sk)); if (IS_ERR(inst)) { ret = PTR_ERR(inst); goto out; } break; case NFULNL_CFG_CMD_UNBIND: if (!inst) { ret = -ENODEV; goto out; } instance_destroy(log, inst); goto out_put; default: ret = -ENOTSUPP; goto out_put; } } else if (!inst) { ret = -ENODEV; goto out; } if (nfula[NFULA_CFG_MODE]) { struct nfulnl_msg_config_mode *params = nla_data(nfula[NFULA_CFG_MODE]); nfulnl_set_mode(inst, params->copy_mode, ntohl(params->copy_range)); } if (nfula[NFULA_CFG_TIMEOUT]) { __be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]); nfulnl_set_timeout(inst, ntohl(timeout)); } if (nfula[NFULA_CFG_NLBUFSIZ]) { __be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]); nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); } if (nfula[NFULA_CFG_QTHRESH]) { __be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]); nfulnl_set_qthresh(inst, ntohl(qthresh)); } if (nfula[NFULA_CFG_FLAGS]) nfulnl_set_flags(inst, flags); out_put: instance_put(inst); out: return ret; } static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, .type = NFNL_CB_MUTEX, .attr_count = NFULA_MAX, }, [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, .type = NFNL_CB_MUTEX, .attr_count = NFULA_CFG_MAX, .policy = nfula_cfg_policy }, }; static const struct nfnetlink_subsystem nfulnl_subsys = { .name = "log", .subsys_id = NFNL_SUBSYS_ULOG, .cb_count = NFULNL_MSG_MAX, .cb = nfulnl_cb, }; #ifdef CONFIG_PROC_FS struct iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *get_first(struct net *net, struct iter_state *st) { struct nfnl_log_net *log; if (!st) return NULL; log = nfnl_log_pernet(net); for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { struct hlist_head *head = &log->instance_table[st->bucket]; if (!hlist_empty(head)) return rcu_dereference(hlist_first_rcu(head)); } return NULL; } static struct hlist_node *get_next(struct net *net, struct iter_state *st, struct hlist_node *h) { h = rcu_dereference(hlist_next_rcu(h)); while (!h) { struct nfnl_log_net *log; struct hlist_head *head; if (++st->bucket >= INSTANCE_BUCKETS) return NULL; log = nfnl_log_pernet(net); head = &log->instance_table[st->bucket]; h = rcu_dereference(hlist_first_rcu(head)); } return h; } static struct hlist_node *get_idx(struct net *net, struct iter_state *st, loff_t pos) { struct hlist_node *head; head = get_first(net, st); if (head) while (pos && (head = get_next(net, st, head))) pos--; return pos ? NULL : head; } static void *seq_start(struct seq_file *s, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return get_idx(seq_file_net(s), s->private, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; return get_next(seq_file_net(s), s->private, v); } static void seq_stop(struct seq_file *s, void *v) __releases(rcu) { rcu_read_unlock(); } static int seq_show(struct seq_file *s, void *v) { const struct nfulnl_instance *inst = v; seq_printf(s, "%5u %6u %5u %1u %5u %6u %2u\n", inst->group_num, inst->peer_portid, inst->qlen, inst->copy_mode, inst->copy_range, inst->flushtimeout, refcount_read(&inst->use)); return 0; } static const struct seq_operations nful_seq_ops = { .start = seq_start, .next = seq_next, .stop = seq_stop, .show = seq_show, }; #endif /* PROC_FS */ static int __net_init nfnl_log_net_init(struct net *net) { unsigned int i; struct nfnl_log_net *log = nfnl_log_pernet(net); #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc; kuid_t root_uid; kgid_t root_gid; #endif for (i = 0; i < INSTANCE_BUCKETS; i++) INIT_HLIST_HEAD(&log->instance_table[i]); spin_lock_init(&log->instances_lock); #ifdef CONFIG_PROC_FS proc = proc_create_net("nfnetlink_log", 0440, net->nf.proc_netfilter, &nful_seq_ops, sizeof(struct iter_state)); if (!proc) return -ENOMEM; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); #endif return 0; } static void __net_exit nfnl_log_net_exit(struct net *net) { struct nfnl_log_net *log = nfnl_log_pernet(net); unsigned int i; #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); #endif nf_log_unset(net, &nfulnl_logger); for (i = 0; i < INSTANCE_BUCKETS; i++) WARN_ON_ONCE(!hlist_empty(&log->instance_table[i])); } static struct pernet_operations nfnl_log_net_ops = { .init = nfnl_log_net_init, .exit = nfnl_log_net_exit, .id = &nfnl_log_net_id, .size = sizeof(struct nfnl_log_net), }; static int __init nfnetlink_log_init(void) { int status; status = register_pernet_subsys(&nfnl_log_net_ops); if (status < 0) { pr_err("failed to register pernet ops\n"); goto out; } netlink_register_notifier(&nfulnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfulnl_subsys); if (status < 0) { pr_err("failed to create netlink socket\n"); goto cleanup_netlink_notifier; } status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger); if (status < 0) { pr_err("failed to register logger\n"); goto cleanup_subsys; } return status; cleanup_subsys: nfnetlink_subsys_unregister(&nfulnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfulnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_log_net_ops); out: return status; } static void __exit nfnetlink_log_fini(void) { nfnetlink_subsys_unregister(&nfulnl_subsys); netlink_unregister_notifier(&nfulnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_log_net_ops); nf_log_unregister(&nfulnl_logger); } MODULE_DESCRIPTION("netfilter userspace logging"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG); MODULE_ALIAS_NF_LOGGER(AF_INET, 1); MODULE_ALIAS_NF_LOGGER(AF_INET6, 1); MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1); MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */ MODULE_ALIAS_NF_LOGGER(5, 1); /* NFPROTO_NETDEV */ module_init(nfnetlink_log_init); module_exit(nfnetlink_log_fini);
12 12 4788 3818 3819 2461 2458 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 // SPDX-License-Identifier: GPL-2.0 /* * Software nodes for the firmware node framework. * * Copyright (C) 2018, Intel Corporation * Author: Heikki Krogerus <heikki.krogerus@linux.intel.com> */ #include <linux/container_of.h> #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kobject.h> #include <linux/kstrtox.h> #include <linux/list.h> #include <linux/property.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/sysfs.h> #include <linux/types.h> #include "base.h" struct swnode { struct kobject kobj; struct fwnode_handle fwnode; const struct software_node *node; int id; /* hierarchy */ struct ida child_ids; struct list_head entry; struct list_head children; struct swnode *parent; unsigned int allocated:1; unsigned int managed:1; }; static DEFINE_IDA(swnode_root_ids); static struct kset *swnode_kset; #define kobj_to_swnode(_kobj_) container_of(_kobj_, struct swnode, kobj) static const struct fwnode_operations software_node_ops; bool is_software_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &software_node_ops; } EXPORT_SYMBOL_GPL(is_software_node); #define to_swnode(__fwnode) \ ({ \ typeof(__fwnode) __to_swnode_fwnode = __fwnode; \ \ is_software_node(__to_swnode_fwnode) ? \ container_of(__to_swnode_fwnode, \ struct swnode, fwnode) : NULL; \ }) static inline struct swnode *dev_to_swnode(struct device *dev) { struct fwnode_handle *fwnode = dev_fwnode(dev); if (!fwnode) return NULL; if (!is_software_node(fwnode)) fwnode = fwnode->secondary; return to_swnode(fwnode); } static struct swnode * software_node_to_swnode(const struct software_node *node) { struct swnode *swnode = NULL; struct kobject *k; if (!node) return NULL; spin_lock(&swnode_kset->list_lock); list_for_each_entry(k, &swnode_kset->list, entry) { swnode = kobj_to_swnode(k); if (swnode->node == node) break; swnode = NULL; } spin_unlock(&swnode_kset->list_lock); return swnode; } const struct software_node *to_software_node(const struct fwnode_handle *fwnode) { const struct swnode *swnode = to_swnode(fwnode); return swnode ? swnode->node : NULL; } EXPORT_SYMBOL_GPL(to_software_node); struct fwnode_handle *software_node_fwnode(const struct software_node *node) { struct swnode *swnode = software_node_to_swnode(node); return swnode ? &swnode->fwnode : NULL; } EXPORT_SYMBOL_GPL(software_node_fwnode); /* -------------------------------------------------------------------------- */ /* property_entry processing */ static const struct property_entry * property_entry_get(const struct property_entry *prop, const char *name) { if (!prop) return NULL; for (; prop->name; prop++) if (!strcmp(name, prop->name)) return prop; return NULL; } static const void *property_get_pointer(const struct property_entry *prop) { if (!prop->length) return NULL; return prop->is_inline ? &prop->value : prop->pointer; } static const void *property_entry_find(const struct property_entry *props, const char *propname, size_t length) { const struct property_entry *prop; const void *pointer; prop = property_entry_get(props, propname); if (!prop) return ERR_PTR(-EINVAL); pointer = property_get_pointer(prop); if (!pointer) return ERR_PTR(-ENODATA); if (length > prop->length) return ERR_PTR(-EOVERFLOW); return pointer; } static int property_entry_count_elems_of_size(const struct property_entry *props, const char *propname, size_t length) { const struct property_entry *prop; prop = property_entry_get(props, propname); if (!prop) return -EINVAL; return prop->length / length; } static int property_entry_read_int_array(const struct property_entry *props, const char *name, unsigned int elem_size, void *val, size_t nval) { const void *pointer; size_t length; if (!val) return property_entry_count_elems_of_size(props, name, elem_size); if (!is_power_of_2(elem_size) || elem_size > sizeof(u64)) return -ENXIO; length = nval * elem_size; pointer = property_entry_find(props, name, length); if (IS_ERR(pointer)) return PTR_ERR(pointer); memcpy(val, pointer, length); return 0; } static int property_entry_read_string_array(const struct property_entry *props, const char *propname, const char **strings, size_t nval) { const void *pointer; size_t length; int array_len; /* Find out the array length. */ array_len = property_entry_count_elems_of_size(props, propname, sizeof(const char *)); if (array_len < 0) return array_len; /* Return how many there are if strings is NULL. */ if (!strings) return array_len; array_len = min_t(size_t, nval, array_len); length = array_len * sizeof(*strings); pointer = property_entry_find(props, propname, length); if (IS_ERR(pointer)) return PTR_ERR(pointer); memcpy(strings, pointer, length); return array_len; } static void property_entry_free_data(const struct property_entry *p) { const char * const *src_str; size_t i, nval; if (p->type == DEV_PROP_STRING) { src_str = property_get_pointer(p); nval = p->length / sizeof(*src_str); for (i = 0; i < nval; i++) kfree(src_str[i]); } if (!p->is_inline) kfree(p->pointer); kfree(p->name); } static bool property_copy_string_array(const char **dst_ptr, const char * const *src_ptr, size_t nval) { int i; for (i = 0; i < nval; i++) { dst_ptr[i] = kstrdup(src_ptr[i], GFP_KERNEL); if (!dst_ptr[i] && src_ptr[i]) { while (--i >= 0) kfree(dst_ptr[i]); return false; } } return true; } static int property_entry_copy_data(struct property_entry *dst, const struct property_entry *src) { const void *pointer = property_get_pointer(src); void *dst_ptr; size_t nval; /* * Properties with no data should not be marked as stored * out of line. */ if (!src->is_inline && !src->length) return -ENODATA; /* * Reference properties are never stored inline as * they are too big. */ if (src->type == DEV_PROP_REF && src->is_inline) return -EINVAL; if (src->length <= sizeof(dst->value)) { dst_ptr = &dst->value; dst->is_inline = true; } else { dst_ptr = kmalloc(src->length, GFP_KERNEL); if (!dst_ptr) return -ENOMEM; dst->pointer = dst_ptr; } if (src->type == DEV_PROP_STRING) { nval = src->length / sizeof(const char *); if (!property_copy_string_array(dst_ptr, pointer, nval)) { if (!dst->is_inline) kfree(dst->pointer); return -ENOMEM; } } else { memcpy(dst_ptr, pointer, src->length); } dst->length = src->length; dst->type = src->type; dst->name = kstrdup(src->name, GFP_KERNEL); if (!dst->name) { property_entry_free_data(dst); return -ENOMEM; } return 0; } /** * property_entries_dup - duplicate array of properties * @properties: array of properties to copy * * This function creates a deep copy of the given NULL-terminated array * of property entries. */ struct property_entry * property_entries_dup(const struct property_entry *properties) { struct property_entry *p; int i, n = 0; int ret; if (!properties) return NULL; while (properties[n].name) n++; p = kcalloc(n + 1, sizeof(*p), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); for (i = 0; i < n; i++) { ret = property_entry_copy_data(&p[i], &properties[i]); if (ret) { while (--i >= 0) property_entry_free_data(&p[i]); kfree(p); return ERR_PTR(ret); } } return p; } EXPORT_SYMBOL_GPL(property_entries_dup); /** * property_entries_free - free previously allocated array of properties * @properties: array of properties to destroy * * This function frees given NULL-terminated array of property entries, * along with their data. */ void property_entries_free(const struct property_entry *properties) { const struct property_entry *p; if (!properties) return; for (p = properties; p->name; p++) property_entry_free_data(p); kfree(properties); } EXPORT_SYMBOL_GPL(property_entries_free); /* -------------------------------------------------------------------------- */ /* fwnode operations */ static struct fwnode_handle *software_node_get(struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); kobject_get(&swnode->kobj); return &swnode->fwnode; } static void software_node_put(struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); kobject_put(&swnode->kobj); } static bool software_node_property_present(const struct fwnode_handle *fwnode, const char *propname) { struct swnode *swnode = to_swnode(fwnode); return !!property_entry_get(swnode->node->properties, propname); } static int software_node_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { struct swnode *swnode = to_swnode(fwnode); return property_entry_read_int_array(swnode->node->properties, propname, elem_size, val, nval); } static int software_node_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { struct swnode *swnode = to_swnode(fwnode); return property_entry_read_string_array(swnode->node->properties, propname, val, nval); } static const char * software_node_get_name(const struct fwnode_handle *fwnode) { const struct swnode *swnode = to_swnode(fwnode); return kobject_name(&swnode->kobj); } static const char * software_node_get_name_prefix(const struct fwnode_handle *fwnode) { struct fwnode_handle *parent; const char *prefix; parent = fwnode_get_parent(fwnode); if (!parent) return ""; /* Figure out the prefix from the parents. */ while (is_software_node(parent)) parent = fwnode_get_next_parent(parent); prefix = fwnode_get_name_prefix(parent); fwnode_handle_put(parent); /* Guess something if prefix was NULL. */ return prefix ?: "/"; } static struct fwnode_handle * software_node_get_parent(const struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); if (!swnode || !swnode->parent) return NULL; return fwnode_handle_get(&swnode->parent->fwnode); } static struct fwnode_handle * software_node_get_next_child(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { struct swnode *p = to_swnode(fwnode); struct swnode *c = to_swnode(child); if (!p || list_empty(&p->children) || (c && list_is_last(&c->entry, &p->children))) { fwnode_handle_put(child); return NULL; } if (c) c = list_next_entry(c, entry); else c = list_first_entry(&p->children, struct swnode, entry); fwnode_handle_put(child); return fwnode_handle_get(&c->fwnode); } static struct fwnode_handle * software_node_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { struct swnode *swnode = to_swnode(fwnode); struct swnode *child; if (!swnode || list_empty(&swnode->children)) return NULL; list_for_each_entry(child, &swnode->children, entry) { if (!strcmp(childname, kobject_name(&child->kobj))) { kobject_get(&child->kobj); return &child->fwnode; } } return NULL; } static int software_node_get_reference_args(const struct fwnode_handle *fwnode, const char *propname, const char *nargs_prop, unsigned int nargs, unsigned int index, struct fwnode_reference_args *args) { struct swnode *swnode = to_swnode(fwnode); const struct software_node_ref_args *ref_array; const struct software_node_ref_args *ref; const struct property_entry *prop; struct fwnode_handle *refnode; u32 nargs_prop_val; int error; int i; prop = property_entry_get(swnode->node->properties, propname); if (!prop) return -ENOENT; if (prop->type != DEV_PROP_REF) return -EINVAL; /* * We expect that references are never stored inline, even * single ones, as they are too big. */ if (prop->is_inline) return -EINVAL; if ((index + 1) * sizeof(*ref) > prop->length) return -ENOENT; ref_array = prop->pointer; ref = &ref_array[index]; refnode = software_node_fwnode(ref->node); if (!refnode) return -ENOENT; if (nargs_prop) { error = property_entry_read_int_array(ref->node->properties, nargs_prop, sizeof(u32), &nargs_prop_val, 1); if (error) return error; nargs = nargs_prop_val; } if (nargs > NR_FWNODE_REFERENCE_ARGS) return -EINVAL; if (!args) return 0; args->fwnode = software_node_get(refnode); args->nargs = nargs; for (i = 0; i < nargs; i++) args->args[i] = ref->args[i]; return 0; } static struct fwnode_handle * swnode_graph_find_next_port(const struct fwnode_handle *parent, struct fwnode_handle *port) { struct fwnode_handle *old = port; while ((port = software_node_get_next_child(parent, old))) { /* * fwnode ports have naming style "port@", so we search for any * children that follow that convention. */ if (!strncmp(to_swnode(port)->node->name, "port@", strlen("port@"))) return port; old = port; } return NULL; } static struct fwnode_handle * software_node_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *endpoint) { struct swnode *swnode = to_swnode(fwnode); struct fwnode_handle *parent; struct fwnode_handle *port; if (!swnode) return NULL; if (endpoint) { port = software_node_get_parent(endpoint); parent = software_node_get_parent(port); } else { parent = software_node_get_named_child_node(fwnode, "ports"); if (!parent) parent = software_node_get(&swnode->fwnode); port = swnode_graph_find_next_port(parent, NULL); } for (; port; port = swnode_graph_find_next_port(parent, port)) { endpoint = software_node_get_next_child(port, endpoint); if (endpoint) { fwnode_handle_put(port); break; } } fwnode_handle_put(parent); return endpoint; } static struct fwnode_handle * software_node_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); const struct software_node_ref_args *ref; const struct property_entry *prop; if (!swnode) return NULL; prop = property_entry_get(swnode->node->properties, "remote-endpoint"); if (!prop || prop->type != DEV_PROP_REF || prop->is_inline) return NULL; ref = prop->pointer; return software_node_get(software_node_fwnode(ref[0].node)); } static struct fwnode_handle * software_node_graph_get_port_parent(struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); swnode = swnode->parent; if (swnode && !strcmp(swnode->node->name, "ports")) swnode = swnode->parent; return swnode ? software_node_get(&swnode->fwnode) : NULL; } static int software_node_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { struct swnode *swnode = to_swnode(fwnode); const char *parent_name = swnode->parent->node->name; int ret; if (strlen("port@") >= strlen(parent_name) || strncmp(parent_name, "port@", strlen("port@"))) return -EINVAL; /* Ports have naming style "port@n", we need to select the n */ ret = kstrtou32(parent_name + strlen("port@"), 10, &endpoint->port); if (ret) return ret; endpoint->id = swnode->id; endpoint->local_fwnode = fwnode; return 0; } static const struct fwnode_operations software_node_ops = { .get = software_node_get, .put = software_node_put, .property_present = software_node_property_present, .property_read_bool = software_node_property_present, .property_read_int_array = software_node_read_int_array, .property_read_string_array = software_node_read_string_array, .get_name = software_node_get_name, .get_name_prefix = software_node_get_name_prefix, .get_parent = software_node_get_parent, .get_next_child_node = software_node_get_next_child, .get_named_child_node = software_node_get_named_child_node, .get_reference_args = software_node_get_reference_args, .graph_get_next_endpoint = software_node_graph_get_next_endpoint, .graph_get_remote_endpoint = software_node_graph_get_remote_endpoint, .graph_get_port_parent = software_node_graph_get_port_parent, .graph_parse_endpoint = software_node_graph_parse_endpoint, }; /* -------------------------------------------------------------------------- */ /** * software_node_find_by_name - Find software node by name * @parent: Parent of the software node * @name: Name of the software node * * The function will find a node that is child of @parent and that is named * @name. If no node is found, the function returns NULL. * * NOTE: you will need to drop the reference with fwnode_handle_put() after use. */ const struct software_node * software_node_find_by_name(const struct software_node *parent, const char *name) { struct swnode *swnode = NULL; struct kobject *k; if (!name) return NULL; spin_lock(&swnode_kset->list_lock); list_for_each_entry(k, &swnode_kset->list, entry) { swnode = kobj_to_swnode(k); if (parent == swnode->node->parent && swnode->node->name && !strcmp(name, swnode->node->name)) { kobject_get(&swnode->kobj); break; } swnode = NULL; } spin_unlock(&swnode_kset->list_lock); return swnode ? swnode->node : NULL; } EXPORT_SYMBOL_GPL(software_node_find_by_name); static struct software_node *software_node_alloc(const struct property_entry *properties) { struct property_entry *props; struct software_node *node; props = property_entries_dup(properties); if (IS_ERR(props)) return ERR_CAST(props); node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) { property_entries_free(props); return ERR_PTR(-ENOMEM); } node->properties = props; return node; } static void software_node_free(const struct software_node *node) { property_entries_free(node->properties); kfree(node); } static void software_node_release(struct kobject *kobj) { struct swnode *swnode = kobj_to_swnode(kobj); if (swnode->parent) { ida_free(&swnode->parent->child_ids, swnode->id); list_del(&swnode->entry); } else { ida_free(&swnode_root_ids, swnode->id); } if (swnode->allocated) software_node_free(swnode->node); ida_destroy(&swnode->child_ids); kfree(swnode); } static const struct kobj_type software_node_type = { .release = software_node_release, .sysfs_ops = &kobj_sysfs_ops, }; static struct fwnode_handle * swnode_register(const struct software_node *node, struct swnode *parent, unsigned int allocated) { struct swnode *swnode; int ret; swnode = kzalloc(sizeof(*swnode), GFP_KERNEL); if (!swnode) return ERR_PTR(-ENOMEM); ret = ida_alloc(parent ? &parent->child_ids : &swnode_root_ids, GFP_KERNEL); if (ret < 0) { kfree(swnode); return ERR_PTR(ret); } swnode->id = ret; swnode->node = node; swnode->parent = parent; swnode->kobj.kset = swnode_kset; fwnode_init(&swnode->fwnode, &software_node_ops); ida_init(&swnode->child_ids); INIT_LIST_HEAD(&swnode->entry); INIT_LIST_HEAD(&swnode->children); if (node->name) ret = kobject_init_and_add(&swnode->kobj, &software_node_type, parent ? &parent->kobj : NULL, "%s", node->name); else ret = kobject_init_and_add(&swnode->kobj, &software_node_type, parent ? &parent->kobj : NULL, "node%d", swnode->id); if (ret) { kobject_put(&swnode->kobj); return ERR_PTR(ret); } /* * Assign the flag only in the successful case, so * the above kobject_put() won't mess up with properties. */ swnode->allocated = allocated; if (parent) list_add_tail(&swnode->entry, &parent->children); kobject_uevent(&swnode->kobj, KOBJ_ADD); return &swnode->fwnode; } /** * software_node_register_node_group - Register a group of software nodes * @node_group: NULL terminated array of software node pointers to be registered * * Register multiple software nodes at once. If any node in the array * has its .parent pointer set (which can only be to another software_node), * then its parent **must** have been registered before it is; either outside * of this function or by ordering the array such that parent comes before * child. */ int software_node_register_node_group(const struct software_node * const *node_group) { unsigned int i; int ret; if (!node_group) return 0; for (i = 0; node_group[i]; i++) { ret = software_node_register(node_group[i]); if (ret) { software_node_unregister_node_group(node_group); return ret; } } return 0; } EXPORT_SYMBOL_GPL(software_node_register_node_group); /** * software_node_unregister_node_group - Unregister a group of software nodes * @node_group: NULL terminated array of software node pointers to be unregistered * * Unregister multiple software nodes at once. If parent pointers are set up * in any of the software nodes then the array **must** be ordered such that * parents come before their children. * * NOTE: If you are uncertain whether the array is ordered such that * parents will be unregistered before their children, it is wiser to * remove the nodes individually, in the correct order (child before * parent). */ void software_node_unregister_node_group(const struct software_node * const *node_group) { unsigned int i = 0; if (!node_group) return; while (node_group[i]) i++; while (i--) software_node_unregister(node_group[i]); } EXPORT_SYMBOL_GPL(software_node_unregister_node_group); /** * software_node_register - Register static software node * @node: The software node to be registered */ int software_node_register(const struct software_node *node) { struct swnode *parent = software_node_to_swnode(node->parent); if (software_node_to_swnode(node)) return -EEXIST; if (node->parent && !parent) return -EINVAL; return PTR_ERR_OR_ZERO(swnode_register(node, parent, 0)); } EXPORT_SYMBOL_GPL(software_node_register); /** * software_node_unregister - Unregister static software node * @node: The software node to be unregistered */ void software_node_unregister(const struct software_node *node) { struct swnode *swnode; swnode = software_node_to_swnode(node); if (swnode) fwnode_remove_software_node(&swnode->fwnode); } EXPORT_SYMBOL_GPL(software_node_unregister); struct fwnode_handle * fwnode_create_software_node(const struct property_entry *properties, const struct fwnode_handle *parent) { struct fwnode_handle *fwnode; struct software_node *node; struct swnode *p; if (IS_ERR(parent)) return ERR_CAST(parent); p = to_swnode(parent); if (parent && !p) return ERR_PTR(-EINVAL); node = software_node_alloc(properties); if (IS_ERR(node)) return ERR_CAST(node); node->parent = p ? p->node : NULL; fwnode = swnode_register(node, p, 1); if (IS_ERR(fwnode)) software_node_free(node); return fwnode; } EXPORT_SYMBOL_GPL(fwnode_create_software_node); void fwnode_remove_software_node(struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); if (!swnode) return; kobject_put(&swnode->kobj); } EXPORT_SYMBOL_GPL(fwnode_remove_software_node); /** * device_add_software_node - Assign software node to a device * @dev: The device the software node is meant for. * @node: The software node. * * This function will make @node the secondary firmware node pointer of @dev. If * @dev has no primary node, then @node will become the primary node. The * function will register @node automatically if it wasn't already registered. */ int device_add_software_node(struct device *dev, const struct software_node *node) { struct swnode *swnode; int ret; /* Only one software node per device. */ if (dev_to_swnode(dev)) return -EBUSY; swnode = software_node_to_swnode(node); if (swnode) { kobject_get(&swnode->kobj); } else { ret = software_node_register(node); if (ret) return ret; swnode = software_node_to_swnode(node); } set_secondary_fwnode(dev, &swnode->fwnode); /* * If the device has been fully registered by the time this function is * called, software_node_notify() must be called separately so that the * symlinks get created and the reference count of the node is kept in * balance. */ if (device_is_registered(dev)) software_node_notify(dev); return 0; } EXPORT_SYMBOL_GPL(device_add_software_node); /** * device_remove_software_node - Remove device's software node * @dev: The device with the software node. * * This function will unregister the software node of @dev. */ void device_remove_software_node(struct device *dev) { struct swnode *swnode; swnode = dev_to_swnode(dev); if (!swnode) return; if (device_is_registered(dev)) software_node_notify_remove(dev); set_secondary_fwnode(dev, NULL); kobject_put(&swnode->kobj); } EXPORT_SYMBOL_GPL(device_remove_software_node); /** * device_create_managed_software_node - Create a software node for a device * @dev: The device the software node is assigned to. * @properties: Device properties for the software node. * @parent: Parent of the software node. * * Creates a software node as a managed resource for @dev, which means the * lifetime of the newly created software node is tied to the lifetime of @dev. * Software nodes created with this function should not be reused or shared * because of that. The function takes a deep copy of @properties for the * software node. * * Since the new software node is assigned directly to @dev, and since it should * not be shared, it is not returned to the caller. The function returns 0 on * success, and errno in case of an error. */ int device_create_managed_software_node(struct device *dev, const struct property_entry *properties, const struct software_node *parent) { struct fwnode_handle *p = software_node_fwnode(parent); struct fwnode_handle *fwnode; if (parent && !p) return -EINVAL; fwnode = fwnode_create_software_node(properties, p); if (IS_ERR(fwnode)) return PTR_ERR(fwnode); to_swnode(fwnode)->managed = true; set_secondary_fwnode(dev, fwnode); if (device_is_registered(dev)) software_node_notify(dev); return 0; } EXPORT_SYMBOL_GPL(device_create_managed_software_node); void software_node_notify(struct device *dev) { struct swnode *swnode; int ret; swnode = dev_to_swnode(dev); if (!swnode) return; kobject_get(&swnode->kobj); ret = sysfs_create_link(&dev->kobj, &swnode->kobj, "software_node"); if (ret) return; ret = sysfs_create_link(&swnode->kobj, &dev->kobj, dev_name(dev)); if (ret) { sysfs_remove_link(&dev->kobj, "software_node"); return; } } void software_node_notify_remove(struct device *dev) { struct swnode *swnode; swnode = dev_to_swnode(dev); if (!swnode) return; sysfs_remove_link(&swnode->kobj, dev_name(dev)); sysfs_remove_link(&dev->kobj, "software_node"); kobject_put(&swnode->kobj); if (swnode->managed) { set_secondary_fwnode(dev, NULL); kobject_put(&swnode->kobj); } } static int __init software_node_init(void) { swnode_kset = kset_create_and_add("software_nodes", NULL, kernel_kobj); if (!swnode_kset) return -ENOMEM; return 0; } postcore_initcall(software_node_init); static void __exit software_node_exit(void) { ida_destroy(&swnode_root_ids); kset_unregister(swnode_kset); } __exitcall(software_node_exit);
112 88 3 19 2 2 86 4 17 18 3 106 1 84 79 4 14 33 205 48 158 69 43 1 25 81 2 8 41 41 50 50 55 49 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 // SPDX-License-Identifier: GPL-2.0 #include <linux/file.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/utime.h> #include <linux/syscalls.h> #include <linux/uaccess.h> #include <linux/compat.h> #include <asm/unistd.h> #include <linux/filelock.h> static bool nsec_valid(long nsec) { if (nsec == UTIME_OMIT || nsec == UTIME_NOW) return true; return nsec >= 0 && nsec <= 999999999; } int vfs_utimes(const struct path *path, struct timespec64 *times) { int error; struct iattr newattrs; struct inode *inode = path->dentry->d_inode; struct delegated_inode delegated_inode = { }; if (times) { if (!nsec_valid(times[0].tv_nsec) || !nsec_valid(times[1].tv_nsec)) return -EINVAL; if (times[0].tv_nsec == UTIME_NOW && times[1].tv_nsec == UTIME_NOW) times = NULL; } error = mnt_want_write(path->mnt); if (error) goto out; newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; if (times) { if (times[0].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_ATIME; else if (times[0].tv_nsec != UTIME_NOW) { newattrs.ia_atime = times[0]; newattrs.ia_valid |= ATTR_ATIME_SET; } if (times[1].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_MTIME; else if (times[1].tv_nsec != UTIME_NOW) { newattrs.ia_mtime = times[1]; newattrs.ia_valid |= ATTR_MTIME_SET; } /* * Tell setattr_prepare(), that this is an explicit time * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET * were used. */ newattrs.ia_valid |= ATTR_TIMES_SET; } else { newattrs.ia_valid |= ATTR_TOUCH; } retry_deleg: inode_lock(inode); error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path->mnt); out: return error; } EXPORT_SYMBOL_GPL(vfs_utimes); static int do_utimes_path(int dfd, const char __user *filename, struct timespec64 *times, int flags) { struct path path; int lookup_flags = 0, error; if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) return -EINVAL; if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) return error; error = vfs_utimes(&path, times); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } static int do_utimes_fd(int fd, struct timespec64 *times, int flags) { if (flags) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_utimes(&fd_file(f)->f_path, times); } /* * do_utimes - change times on filename or file descriptor * @dfd: open file descriptor, -1 or AT_FDCWD * @filename: path name or NULL * @times: new times or NULL * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment) * * If filename is NULL and dfd refers to an open file, then operate on * the file. Otherwise look up filename, possibly using dfd as a * starting point. * * If times==NULL, set access and modification to current time, * must be owner or have write permission. * Else, update from *times, must be owner or super user. */ long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, int flags) { if (filename == NULL && dfd != AT_FDCWD) return do_utimes_fd(dfd, times, flags); return do_utimes_path(dfd, filename, times, flags); } SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, struct __kernel_timespec __user *, utimes, int, flags) { struct timespec64 tstimes[2]; if (utimes) { if ((get_timespec64(&tstimes[0], &utimes[0]) || get_timespec64(&tstimes[1], &utimes[1]))) return -EFAULT; /* Nothing to do, we must not even check the path. */ if (tstimes[0].tv_nsec == UTIME_OMIT && tstimes[1].tv_nsec == UTIME_OMIT) return 0; } return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); } #ifdef __ARCH_WANT_SYS_UTIME /* * futimesat(), utimes() and utime() are older versions of utimensat() * that are provided for compatibility with traditional C libraries. * On modern architectures, we always use libc wrappers around * utimensat() instead. */ static long do_futimesat(int dfd, const char __user *filename, struct __kernel_old_timeval __user *utimes) { struct __kernel_old_timeval times[2]; struct timespec64 tstimes[2]; if (utimes) { if (copy_from_user(&times, utimes, sizeof(times))) return -EFAULT; /* This test is needed to catch all invalid values. If we would test only in do_utimes we would miss those invalid values truncated by the multiplication with 1000. Note that we also catch UTIME_{NOW,OMIT} here which are only valid for utimensat. */ if (times[0].tv_usec >= 1000000 || times[0].tv_usec < 0 || times[1].tv_usec >= 1000000 || times[1].tv_usec < 0) return -EINVAL; tstimes[0].tv_sec = times[0].tv_sec; tstimes[0].tv_nsec = 1000 * times[0].tv_usec; tstimes[1].tv_sec = times[1].tv_sec; tstimes[1].tv_nsec = 1000 * times[1].tv_usec; } return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0); } SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, struct __kernel_old_timeval __user *, utimes) { return do_futimesat(dfd, filename, utimes); } SYSCALL_DEFINE2(utimes, char __user *, filename, struct __kernel_old_timeval __user *, utimes) { return do_futimesat(AT_FDCWD, filename, utimes); } SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) { struct timespec64 tv[2]; if (times) { if (get_user(tv[0].tv_sec, &times->actime) || get_user(tv[1].tv_sec, &times->modtime)) return -EFAULT; tv[0].tv_nsec = 0; tv[1].tv_nsec = 0; } return do_utimes(AT_FDCWD, filename, times ? tv : NULL, 0); } #endif #ifdef CONFIG_COMPAT_32BIT_TIME /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. */ #ifdef __ARCH_WANT_SYS_UTIME32 SYSCALL_DEFINE2(utime32, const char __user *, filename, struct old_utimbuf32 __user *, t) { struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t->actime) || get_user(tv[1].tv_sec, &t->modtime)) return -EFAULT; tv[0].tv_nsec = 0; tv[1].tv_nsec = 0; } return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); } #endif SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags) { struct timespec64 tv[2]; if (t) { if (get_old_timespec32(&tv[0], &t[0]) || get_old_timespec32(&tv[1], &t[1])) return -EFAULT; if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) return 0; } return do_utimes(dfd, filename, t ? tv : NULL, flags); } #ifdef __ARCH_WANT_SYS_UTIME32 static long do_compat_futimesat(unsigned int dfd, const char __user *filename, struct old_timeval32 __user *t) { struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t[0].tv_sec) || get_user(tv[0].tv_nsec, &t[0].tv_usec) || get_user(tv[1].tv_sec, &t[1].tv_sec) || get_user(tv[1].tv_nsec, &t[1].tv_usec)) return -EFAULT; if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) return -EINVAL; tv[0].tv_nsec *= 1000; tv[1].tv_nsec *= 1000; } return do_utimes(dfd, filename, t ? tv : NULL, 0); } SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(dfd, filename, t); } SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(AT_FDCWD, filename, t); } #endif #endif
6 19 19 24 4 24 17 16 17 2 5 1 2 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 // SPDX-License-Identifier: GPL-2.0-or-later /* * Bridge per vlan tunnel port dst_metadata netlink control interface * * Authors: * Roopa Prabhu <roopa@cumulusnetworks.com> */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/etherdevice.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #include <uapi/linux/if_bridge.h> #include <net/dst_metadata.h> #include "br_private.h" #include "br_private_tunnel.h" static size_t __get_vlan_tinfo_size(void) { return nla_total_size(0) + /* nest IFLA_BRIDGE_VLAN_TUNNEL_INFO */ nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_VLAN_TUNNEL_ID */ nla_total_size(sizeof(u16)) + /* IFLA_BRIDGE_VLAN_TUNNEL_VID */ nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */ } bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *v_last) { __be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id); __be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id); return (be32_to_cpu(tunid_curr) - be32_to_cpu(tunid_last)) == 1; } static int __get_num_vlan_tunnel_infos(struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *v, *vtbegin = NULL, *vtend = NULL; int num_tinfos = 0; /* Count number of vlan infos */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { /* only a context, bridge vlan not activated */ if (!br_vlan_should_use(v) || !v->tinfo.tunnel_id) continue; if (!vtbegin) { goto initvars; } else if ((v->vid - vtend->vid) == 1 && vlan_tunid_inrange(v, vtend)) { vtend = v; continue; } else { if ((vtend->vid - vtbegin->vid) > 0) num_tinfos += 2; else num_tinfos += 1; } initvars: vtbegin = v; vtend = v; } if (vtbegin && vtend) { if ((vtend->vid - vtbegin->vid) > 0) num_tinfos += 2; else num_tinfos += 1; } return num_tinfos; } int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg) { int num_tinfos; if (!vg) return 0; rcu_read_lock(); num_tinfos = __get_num_vlan_tunnel_infos(vg); rcu_read_unlock(); return num_tinfos * __get_vlan_tinfo_size(); } static int br_fill_vlan_tinfo(struct sk_buff *skb, u16 vid, __be64 tunnel_id, u16 flags) { __be32 tid = tunnel_id_to_key32(tunnel_id); struct nlattr *tmap; tmap = nla_nest_start_noflag(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO); if (!tmap) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_BRIDGE_VLAN_TUNNEL_ID, be32_to_cpu(tid))) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_VID, vid)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_FLAGS, flags)) goto nla_put_failure; nla_nest_end(skb, tmap); return 0; nla_put_failure: nla_nest_cancel(skb, tmap); return -EMSGSIZE; } static int br_fill_vlan_tinfo_range(struct sk_buff *skb, struct net_bridge_vlan *vtbegin, struct net_bridge_vlan *vtend) { int err; if (vtend && (vtend->vid - vtbegin->vid) > 0) { /* add range to skb */ err = br_fill_vlan_tinfo(skb, vtbegin->vid, vtbegin->tinfo.tunnel_id, BRIDGE_VLAN_INFO_RANGE_BEGIN); if (err) return err; err = br_fill_vlan_tinfo(skb, vtend->vid, vtend->tinfo.tunnel_id, BRIDGE_VLAN_INFO_RANGE_END); if (err) return err; } else { err = br_fill_vlan_tinfo(skb, vtbegin->vid, vtbegin->tinfo.tunnel_id, 0); if (err) return err; } return 0; } int br_fill_vlan_tunnel_info(struct sk_buff *skb, struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vtbegin = NULL; struct net_bridge_vlan *vtend = NULL; struct net_bridge_vlan *v; int err; /* Count number of vlan infos */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { /* only a context, bridge vlan not activated */ if (!br_vlan_should_use(v)) continue; if (!v->tinfo.tunnel_dst) continue; if (!vtbegin) { goto initvars; } else if ((v->vid - vtend->vid) == 1 && vlan_tunid_inrange(v, vtend)) { vtend = v; continue; } else { err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend); if (err) return err; } initvars: vtbegin = v; vtend = v; } if (vtbegin) { err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend); if (err) return err; } return 0; } static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = { [IFLA_BRIDGE_VLAN_TUNNEL_UNSPEC] = { .strict_start_type = IFLA_BRIDGE_VLAN_TUNNEL_FLAGS + 1 }, [IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 }, [IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 }, }; int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd, u16 vid, u32 tun_id, bool *changed) { int err = 0; if (!p) return -EINVAL; switch (cmd) { case RTM_SETLINK: err = nbp_vlan_tunnel_info_add(p, vid, tun_id); if (!err) *changed = true; break; case RTM_DELLINK: if (!nbp_vlan_tunnel_info_delete(p, vid)) *changed = true; break; } return err; } int br_parse_vlan_tunnel_info(struct nlattr *attr, struct vtunnel_info *tinfo) { struct nlattr *tb[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1]; u32 tun_id; u16 vid, flags = 0; int err; memset(tinfo, 0, sizeof(*tinfo)); err = nla_parse_nested_deprecated(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX, attr, vlan_tunnel_policy, NULL); if (err < 0) return err; if (!tb[IFLA_BRIDGE_VLAN_TUNNEL_ID] || !tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]) return -EINVAL; tun_id = nla_get_u32(tb[IFLA_BRIDGE_VLAN_TUNNEL_ID]); vid = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]); if (vid >= VLAN_VID_MASK) return -ERANGE; if (tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS]) flags = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS]); tinfo->tunid = tun_id; tinfo->vid = vid; tinfo->flags = flags; return 0; } /* send a notification if v_curr can't enter the range and start a new one */ static void __vlan_tunnel_handle_range(const struct net_bridge_port *p, struct net_bridge_vlan **v_start, struct net_bridge_vlan **v_end, int v_curr, bool curr_change) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; vg = nbp_vlan_group(p); if (!vg) return; v = br_vlan_find(vg, v_curr); if (!*v_start) goto out_init; if (v && curr_change && br_vlan_can_enter_range(v, *v_end)) { *v_end = v; return; } br_vlan_notify(p->br, p, (*v_start)->vid, (*v_end)->vid, RTM_NEWVLAN); out_init: /* we start a range only if there are any changes to notify about */ *v_start = curr_change ? v : NULL; *v_end = *v_start; } int br_process_vlan_tunnel_info(const struct net_bridge *br, const struct net_bridge_port *p, int cmd, struct vtunnel_info *tinfo_curr, struct vtunnel_info *tinfo_last, bool *changed) { int err; if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { if (tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) return -EINVAL; memcpy(tinfo_last, tinfo_curr, sizeof(struct vtunnel_info)); } else if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END) { struct net_bridge_vlan *v_start = NULL, *v_end = NULL; int t, v; if (!(tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN)) return -EINVAL; if ((tinfo_curr->vid - tinfo_last->vid) != (tinfo_curr->tunid - tinfo_last->tunid)) return -EINVAL; t = tinfo_last->tunid; for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) { bool curr_change = false; err = br_vlan_tunnel_info(p, cmd, v, t, &curr_change); if (err) break; t++; if (curr_change) *changed = curr_change; __vlan_tunnel_handle_range(p, &v_start, &v_end, v, curr_change); } if (v_start && v_end) br_vlan_notify(br, p, v_start->vid, v_end->vid, RTM_NEWVLAN); if (err) return err; memset(tinfo_last, 0, sizeof(struct vtunnel_info)); memset(tinfo_curr, 0, sizeof(struct vtunnel_info)); } else { if (tinfo_last->flags) return -EINVAL; err = br_vlan_tunnel_info(p, cmd, tinfo_curr->vid, tinfo_curr->tunid, changed); if (err) return err; br_vlan_notify(br, p, tinfo_curr->vid, 0, RTM_NEWVLAN); memset(tinfo_last, 0, sizeof(struct vtunnel_info)); memset(tinfo_curr, 0, sizeof(struct vtunnel_info)); } return 0; }
21 21 1 5 13 15 1 22 34 13 40 11118 107 2 49 363 12274 529 12656 49 362 12657 12656 12667 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_H #define _LINUX_SCHED_H /* * Define 'struct task_struct' and provide the main scheduler * APIs (schedule(), wakeup variants, etc.) */ #include <uapi/linux/sched.h> #include <asm/current.h> #include <asm/processor.h> #include <linux/thread_info.h> #include <linux/preempt.h> #include <linux/cpumask_types.h> #include <linux/cache.h> #include <linux/irqflags_types.h> #include <linux/smp_types.h> #include <linux/pid_types.h> #include <linux/sem_types.h> #include <linux/shm.h> #include <linux/kmsan_types.h> #include <linux/mutex_types.h> #include <linux/plist_types.h> #include <linux/hrtimer_types.h> #include <linux/timer_types.h> #include <linux/seccomp_types.h> #include <linux/nodemask_types.h> #include <linux/refcount_types.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> #include <linux/spinlock.h> #include <linux/syscall_user_dispatch_types.h> #include <linux/mm_types_task.h> #include <linux/netdevice_xmit.h> #include <linux/task_io_accounting.h> #include <linux/posix-timers_types.h> #include <linux/restart_block.h> #include <linux/rseq_types.h> #include <linux/seqlock_types.h> #include <linux/kcsan.h> #include <linux/rv.h> #include <linux/uidgid_types.h> #include <linux/tracepoint-defs.h> #include <linux/unwind_deferred_types.h> #include <asm/kmap_size.h> #ifndef COMPILE_OFFSETS #include <generated/rq-offsets.h> #endif /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; struct bpf_net_context; struct capture_control; struct cfs_rq; struct fs_struct; struct futex_pi_state; struct io_context; struct io_uring_task; struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; struct perf_ctx_data; struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; struct robust_list_head; struct root_domain; struct rq; struct sched_attr; struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; struct task_struct; struct user_event_mm; #include <linux/sched/ext.h> /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. */ /* Used in tsk->__state: */ #define TASK_RUNNING 0x00000000 #define TASK_INTERRUPTIBLE 0x00000001 #define TASK_UNINTERRUPTIBLE 0x00000002 #define __TASK_STOPPED 0x00000004 #define __TASK_TRACED 0x00000008 /* Used in tsk->exit_state: */ #define EXIT_DEAD 0x00000010 #define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->__state again: */ #define TASK_PARKED 0x00000040 #define TASK_DEAD 0x00000080 #define TASK_WAKEKILL 0x00000100 #define TASK_WAKING 0x00000200 #define TASK_NOLOAD 0x00000400 #define TASK_NEW 0x00000800 #define TASK_RTLOCK_WAIT 0x00001000 #define TASK_FREEZABLE 0x00002000 #define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) #define TASK_FROZEN 0x00008000 #define TASK_STATE_MAX 0x00010000 #define TASK_ANY (TASK_STATE_MAX-1) /* * DO NOT ADD ANY NEW USERS ! */ #define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) /* Convenience macros for the sake of wake_up(): */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) #define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) #define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). */ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ TASK_DEAD | TASK_FROZEN)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ do { \ WARN_ON_ONCE(is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_special_state_change(state_value) \ do { \ WARN_ON_ONCE(!is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_set_state() \ do { \ current->saved_state_change = current->task_state_change;\ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_restore_state() \ do { \ current->task_state_change = current->saved_state_change;\ } while (0) #else # define debug_normal_state_change(cond) do { } while (0) # define debug_special_state_change(cond) do { } while (0) # define debug_rtlock_wait_set_state() do { } while (0) # define debug_rtlock_wait_restore_state() do { } while (0) #endif #define trace_set_current_state(state_value) \ do { \ if (tracepoint_enabled(sched_set_state_tp)) \ __trace_set_current_state(state_value); \ } while (0) /* * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); * if (CONDITION) * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before * accessing p->__state. * * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * * However, with slightly different timing the wakeup TASK_RUNNING store can * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not * a problem either because that will result in one extra go around the loop * and our @cond test will save the day. * * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ smp_store_mb(current->__state, (state_value)); \ } while (0) /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must * serialize against wakeups such that any possible in-flight TASK_RUNNING * stores will not collide with our state change. */ #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ \ raw_spin_lock_irqsave(&current->pi_lock, flags); \ debug_special_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(&current->pi_lock, flags); \ } while (0) /* * PREEMPT_RT specific variants for "sleeping" spin/rwlocks * * RT's spin/rwlock substitutions are state preserving. The state of the * task when blocking on the lock is saved in task_struct::saved_state and * restored after the lock has been acquired. These operations are * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT * lock related wakeups while the task is blocked on the lock are * redirected to operate on task_struct::saved_state to ensure that these * are not dropped. On restore task_struct::saved_state is set to * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. * * The lock operation looks like this: * * current_save_and_set_rtlock_wait_state(); * for (;;) { * if (try_lock()) * break; * raw_spin_unlock_irq(&lock->wait_lock); * schedule_rtlock(); * raw_spin_lock_irq(&lock->wait_lock); * set_current_state(TASK_RTLOCK_WAIT); * } * current_restore_rtlock_saved_state(); */ #define current_save_and_set_rtlock_wait_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ current->saved_state = current->__state; \ debug_rtlock_wait_set_state(); \ trace_set_current_state(TASK_RTLOCK_WAIT); \ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define current_restore_rtlock_saved_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ debug_rtlock_wait_restore_state(); \ trace_set_current_state(current->saved_state); \ WRITE_ONCE(current->__state, current->saved_state); \ current->saved_state = TASK_RUNNING; \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define get_current_state() READ_ONCE(current->__state) /* * Define the task command name length as enum, then it can be visible to * BPF programs. */ enum { TASK_COMM_LEN = 16, }; extern void sched_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern long schedule_timeout(long timeout); extern long schedule_timeout_interruptible(long timeout); extern long schedule_timeout_killable(long timeout); extern long schedule_timeout_uninterruptible(long timeout); extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); #ifdef CONFIG_PREEMPT_RT extern void schedule_rtlock(void); #endif extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); /* wrapper functions to trace from this header file */ DECLARE_TRACEPOINT(sched_set_state_tp); extern void __trace_set_current_state(int state_value); DECLARE_TRACEPOINT(sched_set_need_resched_tp); extern void __trace_set_need_resched(struct task_struct *curr, int tif); /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode * @lock: protects the above two fields * * Stores previous user/system time values such that we can guarantee * monotonicity. */ struct prev_cputime { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE u64 utime; u64 stime; raw_spinlock_t lock; #endif }; enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, /* Task is idle */ VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, /* Task runs in userspace in a CPU with VTIME active: */ VTIME_USER, /* Task runs as guests in a CPU with VTIME active: */ VTIME_GUEST, }; struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; unsigned int cpu; u64 utime; u64 stime; u64 gtime; }; /* * Utilization clamp constraints. * @UCLAMP_MIN: Minimum utilization * @UCLAMP_MAX: Maximum utilization * @UCLAMP_CNT: Utilization clamp constraints count */ enum uclamp_id { UCLAMP_MIN = 0, UCLAMP_MAX, UCLAMP_CNT }; extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; extern void sched_domains_mutex_lock(void); extern void sched_domains_mutex_unlock(void); struct sched_param { int sched_priority; }; struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ /* # of times we have run on this CPU: */ unsigned long pcount; /* Time spent waiting on a runqueue: */ unsigned long long run_delay; /* Max time spent waiting on a runqueue: */ unsigned long long max_run_delay; /* Min time spent waiting on a runqueue: */ unsigned long long min_run_delay; /* Timestamps: */ /* When did we last run on a CPU? */ unsigned long long last_arrival; /* When were we last queued to run? */ unsigned long long last_queued; #endif /* CONFIG_SCHED_INFO */ }; /* * Integer metrics need fixed point arithmetic, e.g., sched/fair * has a few: load, load_avg, util_avg, freq, and capacity. * * We define a basic fixed point arithmetic range, and then formalize * all these metrics based on that basic range. */ # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) /* Increase resolution of cpu_capacity calculations */ # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) struct load_weight { unsigned long weight; u32 inv_weight; }; /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * * [runnable_avg definition] * * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * * where runnable% is the time ratio that a sched_entity is runnable and * running% the time ratio that a sched_entity is running. * * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * * N.B., the above ratios (runnable% and running%) themselves are in the * range of [0, 1]. To do fixed point arithmetics, we therefore scale them * to as large a range as necessary. This is for example reflected by * util_avg's SCHED_CAPACITY_SCALE. * * [Overflow issue] * * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities * with the highest load (=88761), always runnable on a single cfs_rq, * and should not overflow as the number already hits PID_MAX_LIMIT. * * For all other cases (including 32-bit kernels), struct load_weight's * weight will overflow first before we do, because: * * Max(load_avg) <= Max(load.weight) * * Then it is the load_weight's responsibility to consider overflow * issues. */ struct sched_avg { u64 last_update_time; u64 load_sum; u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; unsigned int util_est; } ____cacheline_aligned; /* * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg * updates. When a task is dequeued, its util_est should not be updated if its * util_avg has not been updated in the meantime. * This information is mapped into the MSB bit of util_est at dequeue time. * Since max value of util_est for a task is 1024 (PELT util_avg for a task) * it is safe to use MSB. */ #define UTIL_EST_WEIGHT_SHIFT 2 #define UTIL_AVG_UNCHANGED 0x80000000 struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; u64 wait_count; u64 wait_sum; u64 iowait_count; u64 iowait_sum; u64 sleep_start; u64 sleep_max; s64 sum_sleep_runtime; u64 block_start; u64 block_max; s64 sum_block_runtime; s64 exec_max; u64 slice_max; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; u64 nr_wakeups_migrate; u64 nr_wakeups_local; u64 nr_wakeups_remote; u64 nr_wakeups_affine; u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; #ifdef CONFIG_SCHED_CORE u64 core_forceidle_sum; #endif #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; u64 deadline; u64 min_vruntime; u64 min_slice; struct list_head group_node; unsigned char on_rq; unsigned char sched_delayed; unsigned char rel_deadline; unsigned char custom_slice; /* hole */ u64 exec_start; u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; union { /* * When !@on_rq this field is vlag. * When cfs_rq->curr == se (which implies @on_rq) * this field is vprot. See protect_slice(). */ s64 vlag; u64 vprot; }; u64 slice; u64 nr_migrations; #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; /* cached value of my_q->h_nr_running */ unsigned long runnable_weight; #endif /* * Per entity load average tracking. * * Put into separate cache line so it does not * collide with read-mostly values above. */ struct sched_avg avg; }; struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; unsigned short on_rq; unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq *rt_rq; /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif } __randomize_layout; struct rq_flags; typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); struct sched_dl_entity { struct rb_node rb_node; /* * Original scheduling parameters. Copied here from sched_attr * during sched_setattr(), they will remain the same until * the next sched_setattr(). */ u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_period */ u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, * they are continuously updated during task execution. Note that * the remaining runtime could be < 0 in case we are in overrun. */ s64 runtime; /* Remaining runtime for this instance */ u64 deadline; /* Absolute deadline for this instance */ unsigned int flags; /* Specifying the scheduler behaviour */ /* * Some bool flags: * * @dl_throttled tells if we exhausted the runtime. If so, the * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * * @dl_non_contending tells if the task is inactive while still * contributing to the active utilization. In other words, it * indicates if the inactive timer has been armed and its handler * has not been executed yet. This flag is useful to avoid race * conditions between the inactive timer handler and the wakeup * code. * * @dl_overrun tells if the task asked to be informed about runtime * overruns. * * @dl_server tells if this is a server entity. * * @dl_server_active tells if the dlserver is active(started). * dlserver is started on first cfs enqueue on an idle runqueue * and is stopped when a dequeue results in 0 cfs tasks on the * runqueue. In other words, dlserver is active only when cpu's * runqueue has atleast one cfs task. * * @dl_defer tells if this is a deferred or regular server. For * now only defer server exists. * * @dl_defer_armed tells if the deferrable server is waiting * for the replenishment timer to activate it. * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. * * @dl_defer_idle tracks idle state */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; unsigned int dl_server_active : 1; unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; unsigned int dl_defer_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; /* * Inactive timer, responsible for decreasing the active utilization * at the "0-lag time". When a -deadline task blocks, it contributes * to GRUB's active utilization until the "0-lag time", hence a * timer is needed to decrease the active utilization at the correct * time. */ struct hrtimer inactive_timer; /* * Bits for DL-server functionality. Also see the comment near * dl_server_update(). * * @rq the runqueue this server is for */ struct rq *rq; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES /* * Priority Inheritance. When a DEADLINE scheduling entity is boosted * pi_se points to the donor, otherwise points to the dl_se it belongs * to (the original one/itself). */ struct sched_dl_entity *pi_se; #endif }; #ifdef CONFIG_UCLAMP_TASK /* Number of utilization clamp buckets (shorter alias) */ #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. * * The active bit is set whenever a task has got an "effective" value assigned, * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. * * The user_defined bit is set whenever a task has got a task-specific clamp * value requested from userspace, i.e. the system defaults apply to this task * just as a restriction. This allows to relax default clamps when a less * restrictive task-specific value has been requested, thus allowing to * implement a "nice" semantic. For example, a task running with a 20% * default boost can still drop its own boosting to 0%. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ union rcu_special { struct { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; enum perf_event_task_context { perf_invalid_context = -1, perf_hw_context = 0, perf_sw_context, perf_nr_task_contexts, }; /* * Number of contexts where an event can trigger: * task, softirq, hardirq, nmi. */ #define PERF_NR_CONTEXTS 4 struct wake_q_node { struct wake_q_node *next; }; struct kmap_ctrl { #ifdef CONFIG_KMAP_LOCAL int idx; pte_t pteval[KM_MAX_IDX]; #endif }; struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif unsigned int __state; /* saved state for "spinlock sleepers" */ unsigned int saved_state; /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_MEM_ALLOC_PROFILING struct alloc_tag *alloc_tag; #endif int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. */ int recent_used_cpu; int wake_cpu; int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; unsigned int core_occupation; #endif #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #ifdef CONFIG_CFS_BANDWIDTH struct callback_head sched_throttle_work; struct list_head throttle_node; bool throttled; #endif #endif #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp_req[UCLAMP_CNT]; /* * Effective clamp values used for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif struct sched_statistics stats; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; unsigned long max_allowed_capacity; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; unsigned short migration_disabled; unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; int rcu_tasks_exit_cpu; struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; int trc_ipi_to_cpu; union rcu_special trc_reader_special; struct list_head trc_holdout_list; struct list_head trc_blkd_node; int trc_blkd_cpu; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; struct list_head tasks; struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; struct mm_struct *mm; struct mm_struct *active_mm; struct address_space *faults_disabled_mapping; int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* * This field must not be in the scheduler word above due to wakelist * queueing no longer being serialized by p->on_cpu. However: * * p->XXX = X; ttwu() * schedule() if (p->on_rq && ..) // false * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true * deactivate_task() ttwu_queue_wakelist()) * p->on_rq = 0; p->sched_remote_wakeup = Y; * * guarantees all stores of 'current' are visible before * ->sched_remote_wakeup gets used, so it can be in this word. */ unsigned sched_remote_wakeup:1; #ifdef CONFIG_RT_MUTEXES unsigned sched_rt_mutex:1; #endif /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN /* whether the LRU algorithm may apply to this access */ unsigned in_lru_fault:1; #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; /* task is frozen/stopped (used by the cgroup freezer) */ unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP unsigned use_memdelay:1; #endif #ifdef CONFIG_PSI /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif #ifdef CONFIG_PAGE_OWNER /* Used by page_owner=on to detect recursion in page tracking. */ unsigned in_page_owner:1; #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif #ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif #ifdef CONFIG_X86_BUS_LOCK_DETECT unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif unsigned in_nf_duplicate:1; #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; /* PF_KTHREAD | PF_IO_WORKER */ void *worker_private; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK struct posix_cputimers_work posix_cputimers_work; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; #ifdef CONFIG_KEYS /* Cached requested key. */ struct key *cached_requested_key; #endif /* * executable name, excluding path. * * - normally initialized begin_new_exec() * - set it with set_task_comm() * - strscpy_pad() to ensure it is always NUL-terminated and * zero-padded * - task_lock() to ensure the operation is atomic and the name is * fully updated. */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; #ifdef CONFIG_IO_URING struct io_uring_task *io_uring; #endif /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; #ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL struct audit_context *audit_context; #endif kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; u64 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif struct mutex *blocked_on; /* lock we're blocked on */ #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* * Encoded lock address causing task block (lower 2 bits = type from * <linux/hung_task.h>). Accessed via hung_task_*() helpers. */ unsigned long blocker; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events irqtrace; unsigned int hardirq_threaded; u64 hardirq_chain_key; int softirqs_enabled; int softirq_context; int irq_config; #endif #ifdef CONFIG_PREEMPT_RT int softirq_disable_cnt; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; /* Stack plugging: */ struct blk_plug *plug; /* VM state: */ struct reclaim_state *reclaim_state; struct io_context *io_context; #ifdef CONFIG_COMPACTION struct capture_control *capture_control; #endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_PSI /* Pressure stall state */ unsigned int psi_flags; #endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; struct mutex futex_exit_mutex; unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; /* * This pointer is only modified for current in syscall and * pagefault context (and for tasks being destroyed), so it can be read * from any of the following contexts: * - RCU read-side critical section * - current->numa_group from everywhere * - task's runqueue locked, task not running */ struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. * faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. * faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ struct rseq_data rseq; struct sched_mm_cid mm_cid; struct tlbflush_unmap_batch tlb_ubc; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif #ifdef CONFIG_KCSAN_WEAK_MEMORY int kcsan_stack_depth; #endif #endif #ifdef CONFIG_KMSAN struct kmsan_ctx kmsan_ctx; #endif #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; int curr_ret_depth; /* Stack of return addresses for return function tracing: */ unsigned long *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; unsigned long long ftrace_sleeptime; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* See kernel/kcov.c for more details. */ /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; /* KCOV common handle for remote coverage collection: */ u64 kcov_handle; /* KCOV sequence number: */ int kcov_sequence; /* Collect coverage from softirq context: */ unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; #endif #ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; /* Cache for current->cgroups->memcg->objcg lookups: */ struct obj_cgroup *objcg; #endif #ifdef CONFIG_BLK_CGROUP struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; # ifdef CONFIG_PREEMPT_RT unsigned long saved_state_change; # endif #endif struct rcu_head rcu; refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif /* Used by BPF for per-TASK xdp storage */ struct bpf_net_context *bpf_net_context; #ifdef CONFIG_KSTACK_ERASE unsigned long lowest_stack; #endif #ifdef CONFIG_KSTACK_ERASE_METRICS unsigned long prev_lowest_stack; #endif #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; u64 mce_addr; __u64 mce_ripv : 1, mce_whole_page : 1, __mce_reserved : 62; struct callback_head mce_kill_me; int mce_count; #endif #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; #endif #ifdef CONFIG_RETHOOK struct llist_head rethooks; #endif #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* * If L1D flush is supported on mm context switch * then we use this callback head to queue kill work * to kill tasks that are not running on SMT disabled * cores */ struct callback_head l1d_flush_kill; #endif #ifdef CONFIG_RV /* * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS. * If memory becomes a concern, we can think about a dynamic method. */ union rv_task_monitor rv[CONFIG_RV_PER_TASK_MONITORS]; #endif #ifdef CONFIG_USER_EVENTS struct user_event_mm *user_event_mm; #endif #ifdef CONFIG_UNWIND_USER struct unwind_task_info unwind_info; #endif /* CPU-specific state of this task: */ struct thread_struct thread; /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end } __attribute__ ((aligned (64))); #ifdef CONFIG_SCHED_PROXY_EXEC DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); static inline bool sched_proxy_exec(void) { return static_branch_likely(&__sched_proxy_exec); } #else static inline bool sched_proxy_exec(void) { return false; } #endif #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) static inline unsigned int __task_state_index(unsigned int tsk_state, unsigned int tsk_exit_state) { unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); if ((tsk_state & TASK_IDLE) == TASK_IDLE) state = TASK_REPORT_IDLE; /* * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. * Report frozen tasks as uninterruptible. */ if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); } static inline unsigned int task_state_index(struct task_struct *tsk) { return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); } static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); return state_char[state]; } static inline char task_state_to_char(struct task_struct *tsk) { return task_index_to_char(task_state_index(tsk)); } extern struct pid *cad_pid; /* * Per process flags */ #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF__HOLE__00800000 0x00800000 #define PF__HOLE__01000000 0x01000000 #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. * See memalloc_pin_save() */ #define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example * with tsk_used_math (like during threaded core dumping). * There is however an exception to this rule during ptrace * or during fork: the ptracer task is allowed to write to the * child->flags of its traced child (same goes for fork, the parent * can write to the child->flags), because we're guaranteed the * child is not running and in turn not changing child->flags * at the same time the parent does it. */ #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) #define clear_used_math() clear_stopped_child_used_math(current) #define set_used_math() set_stopped_child_used_math(current) #define conditional_stopped_child_used_math(condition, child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) #define copy_to_stopped_child_used_math(child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) static __always_inline bool is_percpu_thread(void) { return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); } /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ { return test_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_SET(name, func) \ static inline void task_set_##func(struct task_struct *p) \ { set_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_CLEAR(name, func) \ static inline void task_clear_##func(struct task_struct *p) \ { clear_bit(PFA_##name, &p->atomic_flags); } TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) TASK_PFA_TEST(SPREAD_PAGE, spread_page) TASK_PFA_SET(SPREAD_PAGE, spread_page) TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { current->flags &= ~flags; current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); /* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task * @p: the task * @new_mask: CPU affinity mask * * Return: zero if successful, or a negative error code */ extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** * task_nice - return the nice value of a given task. * @p: the task in question. * * Return: The nice value [ -20 ... 0 ... 19 ]. */ static inline int task_nice(const struct task_struct *p) { return PRIO_TO_NICE((p)->static_prio); } extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); extern void sched_set_fifo_secondary(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. * * Return: 1 if @p is an idle task. 0 otherwise. */ static __always_inline bool is_idle_task(const struct task_struct *p) { return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { struct task_struct task; #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; #ifndef CONFIG_THREAD_INFO_IN_TASK extern struct thread_info init_thread_info; #endif extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) #else # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif /* * find a task by one of its numerical ids * * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): * finds a task by its virtual pid * * see also find_vpid() etc in include/linux/pid.h */ extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); /* * find a task by its virtual pid and get the task struct */ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); extern void kick_process(struct task_struct *tsk); extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); #define set_task_comm(tsk, from) ({ \ BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ __set_task_comm(tsk, from, false); \ }) /* * - Why not use task_lock()? * User space can randomly change their names anyway, so locking for readers * doesn't make sense. For writers, locking is probably necessary, as a race * condition could lead to long-term mixed results. * The strscpy_pad() in __set_task_comm() can ensure that the task comm is * always NUL-terminated and zero-padded. Therefore the race condition between * reader and writer is not an issue. * * - BUILD_BUG_ON() can help prevent the buf from being truncated. * Since the callers don't perform any return value checks, this safeguard is * necessary. */ #define get_task_comm(buf, tsk) ({ \ BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ strscpy_pad(buf, (tsk)->comm); \ buf; \ }) static __always_inline void scheduler_ipi(void) { /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ preempt_fold_need_resched(); } extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); /* * Set thread flags in other task's structures. * See asm/thread_info.h for TIF_xxxx flags available: */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { set_ti_thread_flag(task_thread_info(tsk), flag); } static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) { clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, bool value) { update_ti_thread_flag(task_thread_info(tsk), flag, value); } static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); } static inline void set_tsk_need_resched(struct task_struct *tsk) { if (tracepoint_enabled(sched_set_need_resched_tp) && !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED)) __trace_set_need_resched(tsk, TIF_NEED_RESCHED); set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) { return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } static inline void set_need_resched_current(void) { lockdep_assert_irqs_disabled(); set_tsk_need_resched(current); set_preempt_need_resched(); } /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) { return static_call_mod(cond_resched)(); } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) extern int dynamic_cond_resched(void); static __always_inline int _cond_resched(void) { return dynamic_cond_resched(); } #else /* !CONFIG_PREEMPTION */ static inline int _cond_resched(void) { return __cond_resched(); } #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ static inline int _cond_resched(void) { return 0; } #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_rwlock_read(rwlock_t *lock); extern int __cond_resched_rwlock_write(rwlock_t *lock); #define MIGHT_RESCHED_RCU_SHIFT 8 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) #ifndef CONFIG_PREEMPT_RT /* * Non RT kernels have an elevated preempt count due to the held lock, * but are not allowed to be inside a RCU read side critical section */ # define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET #else /* * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in * cond_resched*lock() has to take that into account because it checks for * preempt_count() and rcu_preempt_depth(). */ # define PREEMPT_LOCK_RESCHED_OFFSETS \ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) #endif #define cond_resched_lock(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_lock(lock); \ }) #define cond_resched_rwlock_read(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_read(lock); \ }) #define cond_resched_rwlock_write(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_write(lock); \ }) #ifndef CONFIG_PREEMPT_RT static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { struct mutex *m = p->blocked_on; if (m) lockdep_assert_held_once(&m->wait_lock); return m; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { struct mutex *blocked_on = READ_ONCE(p->blocked_on); WARN_ON_ONCE(!m); /* The task should only be setting itself as blocked */ WARN_ON_ONCE(p != current); /* Currently we serialize blocked_on under the mutex::wait_lock */ lockdep_assert_held_once(&m->wait_lock); /* * Check ensure we don't overwrite existing mutex value * with a different mutex. Note, setting it to the same * lock repeatedly is ok. */ WARN_ON_ONCE(blocked_on && blocked_on != m); WRITE_ONCE(p->blocked_on, m); } static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&m->wait_lock); __set_task_blocked_on(p, m); } static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { if (m) { struct mutex *blocked_on = READ_ONCE(p->blocked_on); /* Currently we serialize blocked_on under the mutex::wait_lock */ lockdep_assert_held_once(&m->wait_lock); /* * There may be cases where we re-clear already cleared * blocked_on relationships, but make sure we are not * clearing the relationship with a different lock. */ WARN_ON_ONCE(blocked_on && blocked_on != m); } WRITE_ONCE(p->blocked_on, NULL); } static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&m->wait_lock); __clear_task_blocked_on(p, m); } #else static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } #endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); } /* * Wrappers for p->thread_info->cpu access. No-op on UP. */ #ifdef CONFIG_SMP static inline unsigned int task_cpu(const struct task_struct *p) { return READ_ONCE(task_thread_info(p)->cpu); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else static inline unsigned int task_cpu(const struct task_struct *p) { return 0; } static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { } #endif /* CONFIG_SMP */ static inline bool task_is_runnable(struct task_struct *p) { return p->on_rq && !p->se.sched_delayed; } extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. * * This allows us to terminate optimistic spin loops and block, analogous to * the native optimistic spin heuristic of testing if the lock owner task is * running or not. */ #ifndef vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) { return false; } #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #ifndef TASK_SIZE_OF #define TASK_SIZE_OF(tsk) TASK_SIZE #endif static inline bool owner_on_cpu(struct task_struct *owner) { /* * As lock holder preemption issue, we both skip spinning if * task is not on cpu or its cpu is preempted */ return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, unsigned long uaddr); extern int sched_core_idle_cpu(int cpu); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif extern void sched_set_stop_task(int cpu, struct task_struct *stop); #ifdef CONFIG_MEM_ALLOC_PROFILING static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { swap(current->alloc_tag, tag); return tag; } static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) { #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); #endif current->alloc_tag = old; } #else #define alloc_tag_save(_tag) NULL #define alloc_tag_restore(_tag, _old) do {} while (0) #endif /* Avoids recursive inclusion hell */ #ifdef CONFIG_SCHED_MM_CID void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit(struct task_struct *t); static __always_inline int task_mm_cid(struct task_struct *t) { return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit(struct task_struct *t) { } static __always_inline int task_mm_cid(struct task_struct *t) { /* * Use the processor id as a fall-back when the mm cid feature is * disabled. This provides functional per-cpu data structure accesses * in user-space, althrough it won't provide the memory usage benefits. */ return task_cpu(t); } #endif #ifndef MODULE #ifndef COMPILE_OFFSETS extern void ___migrate_enable(void); struct rq; DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); /* * The "struct rq" is not available here, so we can't access the * "runqueues" with this_cpu_ptr(), as the compilation will fail in * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): * typeof((ptr) + 0) * * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. */ #ifdef CONFIG_SMP #define this_rq_raw() arch_raw_cpu_ptr(&runqueues) #else #define this_rq_raw() PERCPU_PTR(&runqueues) #endif #define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) static inline void __migrate_enable(void) { struct task_struct *p = current; #ifdef CONFIG_DEBUG_PREEMPT /* * Check both overflow from migrate_disable() and superfluous * migrate_enable(). */ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) return; #endif if (p->migration_disabled > 1) { p->migration_disabled--; return; } /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). */ guard(preempt)(); if (unlikely(p->cpus_ptr != &p->cpus_mask)) ___migrate_enable(); /* * Mustn't clear migration_disabled() until cpus_ptr points back at the * regular cpus_mask, otherwise things that race (eg. * select_fallback_rq) get confused. */ barrier(); p->migration_disabled = 0; this_rq_pinned()--; } static inline void __migrate_disable(void) { struct task_struct *p = current; if (p->migration_disabled) { #ifdef CONFIG_DEBUG_PREEMPT /* *Warn about overflow half-way through the range. */ WARN_ON_ONCE((s16)p->migration_disabled < 0); #endif p->migration_disabled++; return; } guard(preempt)(); this_rq_pinned()++; p->migration_disabled = 1; } #else /* !COMPILE_OFFSETS */ static inline void __migrate_disable(void) { } static inline void __migrate_enable(void) { } #endif /* !COMPILE_OFFSETS */ /* * So that it is possible to not export the runqueues variable, define and * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will * be defined in kernel/sched/core.c. */ #ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE static __always_inline void migrate_disable(void) { __migrate_disable(); } static __always_inline void migrate_enable(void) { __migrate_enable(); } #else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ extern void migrate_disable(void); extern void migrate_enable(void); #endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ #else /* MODULE */ extern void migrate_disable(void); extern void migrate_enable(void); #endif /* MODULE */ DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #endif
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NFT_FIB_H_ #define _NFT_FIB_H_ #include <net/l3mdev.h> #include <net/netfilter/nf_tables.h> struct nft_fib { u8 dreg; u8 result; u32 flags; }; extern const struct nla_policy nft_fib_policy[]; static inline bool nft_fib_is_loopback(const struct sk_buff *skb, const struct net_device *in) { return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt) { const struct net_device *indev = nft_in(pkt); const struct sock *sk; switch (nft_hook(pkt)) { case NF_INET_PRE_ROUTING: case NF_INET_INGRESS: case NF_INET_LOCAL_IN: break; default: return false; } sk = pkt->skb->sk; if (sk && sk_fullsock(sk)) return sk->sk_rx_dst_ifindex == indev->ifindex; return nft_fib_is_loopback(pkt->skb, indev); } static inline int nft_fib_l3mdev_master_ifindex_rcu(const struct nft_pktinfo *pkt, const struct net_device *iif) { const struct net_device *dev = iif ? iif : pkt->skb->dev; return l3mdev_master_ifindex_rcu(dev); } int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr); void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_fib_store_result(void *reg, const struct nft_fib *priv, const struct net_device *dev); bool nft_fib_reduce(struct nft_regs_track *track, const struct nft_expr *expr); #endif
60 8 42 61 60 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 // SPDX-License-Identifier: GPL-2.0 /* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. */ #include <linux/log2.h> #include <linux/slab.h> #include <linux/overflow.h> #include <linux/vmalloc.h> #include <net/xdp_sock_drv.h> #include "xsk_queue.h" static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { struct xdp_umem_ring *umem_ring; struct xdp_rxtx_ring *rxtx_ring; if (umem_queue) return struct_size(umem_ring, desc, q->nentries); return struct_size(rxtx_ring, desc, q->nentries); } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) { struct xsk_queue *q; size_t size; q = kzalloc(sizeof(*q), GFP_KERNEL); if (!q) return NULL; q->nentries = nentries; q->ring_mask = nentries - 1; size = xskq_get_ring_size(q, umem_queue); /* size which is overflowing or close to SIZE_MAX will become 0 in * PAGE_ALIGN(), checking SIZE_MAX is enough due to the previous * is_power_of_2(), the rest will be handled by vmalloc_user() */ if (unlikely(size == SIZE_MAX)) { kfree(q); return NULL; } size = PAGE_ALIGN(size); q->ring = vmalloc_user(size); if (!q->ring) { kfree(q); return NULL; } q->ring_vmalloc_size = size; return q; } void xskq_destroy(struct xsk_queue *q) { if (!q) return; vfree(q->ring); kfree(q); }
15 5 3 14 16 27 3 1 1 1 4 1 2 1 2 1 1 1 4 4 3 3 1 1 2 1 1 10 5 2 1 2 1 6 1 2 3 2 1 1 2 2 1 31 31 31 31 31 31 31 31 31 31 31 31 20 20 3 68 68 31 68 68 68 68 68 68 68 68 32 32 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 68 68 31 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Copyright (C) 2011 ProFUSION Embedded Systems Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth HCI core. */ #include <linux/export.h> #include <linux/rfkill.h> #include <linux/debugfs.h> #include <linux/crypto.h> #include <linux/kcov.h> #include <linux/property.h> #include <linux/suspend.h> #include <linux/wait.h> #include <linux/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/bluetooth/mgmt.h> #include "hci_debugfs.h" #include "smp.h" #include "leds.h" #include "msft.h" #include "aosp.h" #include "hci_codec.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); static void hci_tx_work(struct work_struct *work); /* HCI device list */ LIST_HEAD(hci_dev_list); DEFINE_RWLOCK(hci_dev_list_lock); /* HCI callback list */ LIST_HEAD(hci_cb_list); DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); /* Get HCI device by index. * Device is held on return. */ static struct hci_dev *__hci_dev_get(int index, int *srcu_index) { struct hci_dev *hdev = NULL, *d; BT_DBG("%d", index); if (index < 0) return NULL; read_lock(&hci_dev_list_lock); list_for_each_entry(d, &hci_dev_list, list) { if (d->id == index) { hdev = hci_dev_hold(d); if (srcu_index) *srcu_index = srcu_read_lock(&d->srcu); break; } } read_unlock(&hci_dev_list_lock); return hdev; } struct hci_dev *hci_dev_get(int index) { return __hci_dev_get(index, NULL); } static struct hci_dev *hci_dev_get_srcu(int index, int *srcu_index) { return __hci_dev_get(index, srcu_index); } static void hci_dev_put_srcu(struct hci_dev *hdev, int srcu_index) { srcu_read_unlock(&hdev->srcu, srcu_index); hci_dev_put(hdev); } /* ---- Inquiry support ---- */ bool hci_discovery_active(struct hci_dev *hdev) { struct discovery_state *discov = &hdev->discovery; switch (discov->state) { case DISCOVERY_FINDING: case DISCOVERY_RESOLVING: return true; default: return false; } } void hci_discovery_set_state(struct hci_dev *hdev, int state) { int old_state = hdev->discovery.state; if (old_state == state) return; hdev->discovery.state = state; switch (state) { case DISCOVERY_STOPPED: hci_update_passive_scan(hdev); if (old_state != DISCOVERY_STARTING) mgmt_discovering(hdev, 0); break; case DISCOVERY_STARTING: break; case DISCOVERY_FINDING: mgmt_discovering(hdev, 1); break; case DISCOVERY_RESOLVING: break; case DISCOVERY_STOPPING: break; } bt_dev_dbg(hdev, "state %u -> %u", old_state, state); } void hci_inquiry_cache_flush(struct hci_dev *hdev) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *p, *n; list_for_each_entry_safe(p, n, &cache->all, all) { list_del(&p->all); kfree(p); } INIT_LIST_HEAD(&cache->unknown); INIT_LIST_HEAD(&cache->resolve); } struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p, %pMR", cache, bdaddr); list_for_each_entry(e, &cache->all, all) { if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } struct inquiry_entry *hci_inquiry_cache_lookup_unknown(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p, %pMR", cache, bdaddr); list_for_each_entry(e, &cache->unknown, list) { if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } struct inquiry_entry *hci_inquiry_cache_lookup_resolve(struct hci_dev *hdev, bdaddr_t *bdaddr, int state) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p bdaddr %pMR state %d", cache, bdaddr, state); list_for_each_entry(e, &cache->resolve, list) { if (!bacmp(bdaddr, BDADDR_ANY) && e->name_state == state) return e; if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } void hci_inquiry_cache_update_resolve(struct hci_dev *hdev, struct inquiry_entry *ie) { struct discovery_state *cache = &hdev->discovery; struct list_head *pos = &cache->resolve; struct inquiry_entry *p; list_del(&ie->list); list_for_each_entry(p, &cache->resolve, list) { if (p->name_state != NAME_PENDING && abs(p->data.rssi) >= abs(ie->data.rssi)) break; pos = &p->list; } list_add(&ie->list, pos); } u32 hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data, bool name_known) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *ie; u32 flags = 0; BT_DBG("cache %p, %pMR", cache, &data->bdaddr); hci_remove_remote_oob_data(hdev, &data->bdaddr, BDADDR_BREDR); if (!data->ssp_mode) flags |= MGMT_DEV_FOUND_LEGACY_PAIRING; ie = hci_inquiry_cache_lookup(hdev, &data->bdaddr); if (ie) { if (!ie->data.ssp_mode) flags |= MGMT_DEV_FOUND_LEGACY_PAIRING; if (ie->name_state == NAME_NEEDED && data->rssi != ie->data.rssi) { ie->data.rssi = data->rssi; hci_inquiry_cache_update_resolve(hdev, ie); } goto update; } /* Entry not in the cache. Add new one. */ ie = kzalloc(sizeof(*ie), GFP_KERNEL); if (!ie) { flags |= MGMT_DEV_FOUND_CONFIRM_NAME; goto done; } list_add(&ie->all, &cache->all); if (name_known) { ie->name_state = NAME_KNOWN; } else { ie->name_state = NAME_NOT_KNOWN; list_add(&ie->list, &cache->unknown); } update: if (name_known && ie->name_state != NAME_KNOWN && ie->name_state != NAME_PENDING) { ie->name_state = NAME_KNOWN; list_del(&ie->list); } memcpy(&ie->data, data, sizeof(*data)); ie->timestamp = jiffies; cache->timestamp = jiffies; if (ie->name_state == NAME_NOT_KNOWN) flags |= MGMT_DEV_FOUND_CONFIRM_NAME; done: return flags; } static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf) { struct discovery_state *cache = &hdev->discovery; struct inquiry_info *info = (struct inquiry_info *) buf; struct inquiry_entry *e; int copied = 0; list_for_each_entry(e, &cache->all, all) { struct inquiry_data *data = &e->data; if (copied >= num) break; bacpy(&info->bdaddr, &data->bdaddr); info->pscan_rep_mode = data->pscan_rep_mode; info->pscan_period_mode = data->pscan_period_mode; info->pscan_mode = data->pscan_mode; memcpy(info->dev_class, data->dev_class, 3); info->clock_offset = data->clock_offset; info++; copied++; } BT_DBG("cache %p, copied %d", cache, copied); return copied; } int hci_inquiry(void __user *arg) { __u8 __user *ptr = arg; struct hci_inquiry_req ir; struct hci_dev *hdev; int err = 0, do_inquiry = 0, max_rsp; __u8 *buf; if (copy_from_user(&ir, ptr, sizeof(ir))) return -EFAULT; hdev = hci_dev_get(ir.dev_id); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; } /* Restrict maximum inquiry length to 60 seconds */ if (ir.length > 60) { err = -EINVAL; goto done; } hci_dev_lock(hdev); if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX || inquiry_cache_empty(hdev) || ir.flags & IREQ_CACHE_FLUSH) { hci_inquiry_cache_flush(hdev); do_inquiry = 1; } hci_dev_unlock(hdev); if (do_inquiry) { hci_req_sync_lock(hdev); err = hci_inquiry_sync(hdev, ir.length, ir.num_rsp); hci_req_sync_unlock(hdev); if (err < 0) goto done; /* Wait until Inquiry procedure finishes (HCI_INQUIRY flag is * cleared). If it is interrupted by a signal, return -EINTR. */ if (wait_on_bit(&hdev->flags, HCI_INQUIRY, TASK_INTERRUPTIBLE)) { err = -EINTR; goto done; } } /* for unlimited number of responses we will use buffer with * 255 entries */ max_rsp = (ir.num_rsp == 0) ? 255 : ir.num_rsp; /* cache_dump can't sleep. Therefore we allocate temp buffer and then * copy it to the user space. */ buf = kmalloc_array(max_rsp, sizeof(struct inquiry_info), GFP_KERNEL); if (!buf) { err = -ENOMEM; goto done; } hci_dev_lock(hdev); ir.num_rsp = inquiry_cache_dump(hdev, max_rsp, buf); hci_dev_unlock(hdev); BT_DBG("num_rsp %d", ir.num_rsp); if (!copy_to_user(ptr, &ir, sizeof(ir))) { ptr += sizeof(ir); if (copy_to_user(ptr, buf, sizeof(struct inquiry_info) * ir.num_rsp)) err = -EFAULT; } else err = -EFAULT; kfree(buf); done: hci_dev_put(hdev); return err; } static int hci_dev_do_open(struct hci_dev *hdev) { int ret = 0; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); ret = hci_dev_open_sync(hdev); hci_req_sync_unlock(hdev); return ret; } /* ---- HCI ioctl helpers ---- */ int hci_dev_open(__u16 dev) { struct hci_dev *hdev; int err; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; /* Devices that are marked as unconfigured can only be powered * up as user channel. Trying to bring them up as normal devices * will result into a failure. Only user channel operation is * possible. * * When this function is called for a user channel, the flag * HCI_USER_CHANNEL will be set first before attempting to * open the device. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EOPNOTSUPP; goto done; } /* We need to ensure that no other power on/off work is pending * before proceeding to call hci_dev_do_open. This is * particularly important if the setup procedure has not yet * completed. */ if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); /* After this call it is guaranteed that the setup procedure * has finished. This means that error conditions like RFKILL * or no valid public or static random address apply. */ flush_workqueue(hdev->req_workqueue); /* For controllers not using the management interface and that * are brought up using legacy ioctl, set the HCI_BONDABLE bit * so that pairing works for them. Once the management interface * is in use this bit will be cleared again and userspace has * to explicitly enable it. */ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !hci_dev_test_flag(hdev, HCI_MGMT)) hci_dev_set_flag(hdev, HCI_BONDABLE); err = hci_dev_do_open(hdev); done: hci_dev_put(hdev); return err; } int hci_dev_do_close(struct hci_dev *hdev) { int err; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); err = hci_dev_close_sync(hdev); hci_req_sync_unlock(hdev); return err; } int hci_dev_close(__u16 dev) { struct hci_dev *hdev; int err; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } cancel_work_sync(&hdev->power_on); if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); err = hci_dev_do_close(hdev); done: hci_dev_put(hdev); return err; } static int hci_dev_do_reset(struct hci_dev *hdev) { int ret; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); /* Drop queues */ skb_queue_purge(&hdev->rx_q); skb_queue_purge(&hdev->cmd_q); /* Cancel these to avoid queueing non-chained pending work */ hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); /* Wait for * * if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) * queue_delayed_work(&hdev->{cmd,ncmd}_timer) * * inside RCU section to see the flag or complete scheduling. */ synchronize_rcu(); /* Explicitly cancel works in case scheduled after setting the flag. */ cancel_delayed_work(&hdev->cmd_timer); cancel_delayed_work(&hdev->ncmd_timer); /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. */ drain_workqueue(hdev->workqueue); hci_dev_lock(hdev); hci_inquiry_cache_flush(hdev); hci_conn_hash_flush(hdev); hci_dev_unlock(hdev); if (hdev->flush) hdev->flush(hdev); hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); atomic_set(&hdev->cmd_cnt, 1); hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0; hdev->iso_cnt = 0; ret = hci_reset_sync(hdev); hci_req_sync_unlock(hdev); return ret; } int hci_dev_reset(__u16 dev) { struct hci_dev *hdev; int err, srcu_index; hdev = hci_dev_get_srcu(dev, &srcu_index); if (!hdev) return -ENODEV; if (!test_bit(HCI_UP, &hdev->flags)) { err = -ENETDOWN; goto done; } if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } err = hci_dev_do_reset(hdev); done: hci_dev_put_srcu(hdev, srcu_index); return err; } int hci_dev_reset_stat(__u16 dev) { struct hci_dev *hdev; int ret = 0; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { ret = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { ret = -EOPNOTSUPP; goto done; } memset(&hdev->stat, 0, sizeof(struct hci_dev_stats)); done: hci_dev_put(hdev); return ret; } static void hci_update_passive_scan_state(struct hci_dev *hdev, u8 scan) { bool conn_changed, discov_changed; BT_DBG("%s scan 0x%02x", hdev->name, scan); if ((scan & SCAN_PAGE)) conn_changed = !hci_dev_test_and_set_flag(hdev, HCI_CONNECTABLE); else conn_changed = hci_dev_test_and_clear_flag(hdev, HCI_CONNECTABLE); if ((scan & SCAN_INQUIRY)) { discov_changed = !hci_dev_test_and_set_flag(hdev, HCI_DISCOVERABLE); } else { hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); discov_changed = hci_dev_test_and_clear_flag(hdev, HCI_DISCOVERABLE); } if (!hci_dev_test_flag(hdev, HCI_MGMT)) return; if (conn_changed || discov_changed) { /* In case this was disabled through mgmt */ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) hci_update_adv_data(hdev, hdev->cur_adv_instance); mgmt_new_settings(hdev); } } int hci_dev_cmd(unsigned int cmd, void __user *arg) { struct hci_dev *hdev; struct hci_dev_req dr; __le16 policy; int err = 0; if (copy_from_user(&dr, arg, sizeof(dr))) return -EFAULT; hdev = hci_dev_get(dr.dev_id); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; } switch (cmd) { case HCISETAUTH: err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); break; case HCISETENCRYPT: if (!lmp_encrypt_capable(hdev)) { err = -EOPNOTSUPP; break; } if (!test_bit(HCI_AUTH, &hdev->flags)) { /* Auth must be enabled first */ err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); if (err) break; } err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_ENCRYPT_MODE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); break; case HCISETSCAN: err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); /* Ensure that the connectable and discoverable states * get correctly modified as this was a non-mgmt change. */ if (!err) hci_update_passive_scan_state(hdev, dr.dev_opt); break; case HCISETLINKPOL: policy = cpu_to_le16(dr.dev_opt); err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy, HCI_CMD_TIMEOUT); break; case HCISETLINKMODE: hdev->link_mode = ((__u16) dr.dev_opt) & (HCI_LM_MASTER | HCI_LM_ACCEPT); break; case HCISETPTYPE: if (hdev->pkt_type == (__u16) dr.dev_opt) break; hdev->pkt_type = (__u16) dr.dev_opt; mgmt_phy_configuration_changed(hdev, NULL); break; case HCISETACLMTU: hdev->acl_mtu = *((__u16 *) &dr.dev_opt + 1); hdev->acl_pkts = *((__u16 *) &dr.dev_opt + 0); break; case HCISETSCOMTU: hdev->sco_mtu = *((__u16 *) &dr.dev_opt + 1); hdev->sco_pkts = *((__u16 *) &dr.dev_opt + 0); break; default: err = -EINVAL; break; } done: hci_dev_put(hdev); return err; } int hci_get_dev_list(void __user *arg) { struct hci_dev *hdev; struct hci_dev_list_req *dl; struct hci_dev_req *dr; int n = 0, err; __u16 dev_num; if (get_user(dev_num, (__u16 __user *) arg)) return -EFAULT; if (!dev_num || dev_num > (PAGE_SIZE * 2) / sizeof(*dr)) return -EINVAL; dl = kzalloc(struct_size(dl, dev_req, dev_num), GFP_KERNEL); if (!dl) return -ENOMEM; dl->dev_num = dev_num; dr = dl->dev_req; read_lock(&hci_dev_list_lock); list_for_each_entry(hdev, &hci_dev_list, list) { unsigned long flags = hdev->flags; /* When the auto-off is configured it means the transport * is running, but in that case still indicate that the * device is actually down. */ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) flags &= ~BIT(HCI_UP); dr[n].dev_id = hdev->id; dr[n].dev_opt = flags; if (++n >= dev_num) break; } read_unlock(&hci_dev_list_lock); dl->dev_num = n; err = copy_to_user(arg, dl, struct_size(dl, dev_req, n)); kfree(dl); return err ? -EFAULT : 0; } int hci_get_dev_info(void __user *arg) { struct hci_dev *hdev; struct hci_dev_info di; unsigned long flags; int err = 0; if (copy_from_user(&di, arg, sizeof(di))) return -EFAULT; hdev = hci_dev_get(di.dev_id); if (!hdev) return -ENODEV; /* When the auto-off is configured it means the transport * is running, but in that case still indicate that the * device is actually down. */ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) flags = hdev->flags & ~BIT(HCI_UP); else flags = hdev->flags; strscpy(di.name, hdev->name, sizeof(di.name)); di.bdaddr = hdev->bdaddr; di.type = (hdev->bus & 0x0f); di.flags = flags; di.pkt_type = hdev->pkt_type; if (lmp_bredr_capable(hdev)) { di.acl_mtu = hdev->acl_mtu; di.acl_pkts = hdev->acl_pkts; di.sco_mtu = hdev->sco_mtu; di.sco_pkts = hdev->sco_pkts; } else { di.acl_mtu = hdev->le_mtu; di.acl_pkts = hdev->le_pkts; di.sco_mtu = 0; di.sco_pkts = 0; } di.link_policy = hdev->link_policy; di.link_mode = hdev->link_mode; memcpy(&di.stat, &hdev->stat, sizeof(di.stat)); memcpy(&di.features, &hdev->features, sizeof(di.features)); if (copy_to_user(arg, &di, sizeof(di))) err = -EFAULT; hci_dev_put(hdev); return err; } /* ---- Interface to HCI drivers ---- */ static int hci_dev_do_poweroff(struct hci_dev *hdev) { int err; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); err = hci_set_powered_sync(hdev, false); hci_req_sync_unlock(hdev); return err; } static int hci_rfkill_set_block(void *data, bool blocked) { struct hci_dev *hdev = data; int err; BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; if (blocked == hci_dev_test_flag(hdev, HCI_RFKILLED)) return 0; if (blocked) { hci_dev_set_flag(hdev, HCI_RFKILLED); if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { err = hci_dev_do_poweroff(hdev); if (err) { bt_dev_err(hdev, "Error when powering off device on rfkill (%d)", err); /* Make sure the device is still closed even if * anything during power off sequence (eg. * disconnecting devices) failed. */ hci_dev_do_close(hdev); } } } else { hci_dev_clear_flag(hdev, HCI_RFKILLED); } return 0; } static const struct rfkill_ops hci_rfkill_ops = { .set_block = hci_rfkill_set_block, }; static void hci_power_on(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, power_on); int err; BT_DBG("%s", hdev->name); if (test_bit(HCI_UP, &hdev->flags) && hci_dev_test_flag(hdev, HCI_MGMT) && hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { cancel_delayed_work(&hdev->power_off); err = hci_powered_update_sync(hdev); mgmt_power_on(hdev, err); return; } err = hci_dev_do_open(hdev); if (err < 0) { hci_dev_lock(hdev); mgmt_set_powered_failed(hdev, err); hci_dev_unlock(hdev); return; } /* During the HCI setup phase, a few error conditions are * ignored and they need to be checked now. If they are still * valid, it is important to turn the device back off. */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || (!bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); hci_dev_do_close(hdev); } else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) { queue_delayed_work(hdev->req_workqueue, &hdev->power_off, HCI_AUTO_OFF_TIMEOUT); } if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) { /* For unconfigured devices, set the HCI_RAW flag * so that userspace can easily identify them. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) set_bit(HCI_RAW, &hdev->flags); /* For fully configured devices, this will send * the Index Added event. For unconfigured devices, * it will send Unconfigued Index Added event. * * Devices with HCI_QUIRK_RAW_DEVICE are ignored * and no event will be send. */ mgmt_index_added(hdev); } else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) { /* When the controller is now configured, then it * is important to clear the HCI_RAW flag. */ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) clear_bit(HCI_RAW, &hdev->flags); /* Powering on the controller with HCI_CONFIG set only * happens with the transition from unconfigured to * configured. This will send the Index Added event. */ mgmt_index_added(hdev); } } static void hci_power_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, power_off.work); BT_DBG("%s", hdev->name); hci_dev_do_close(hdev); } static void hci_error_reset(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset); hci_dev_hold(hdev); BT_DBG("%s", hdev->name); if (hdev->hw_error) hdev->hw_error(hdev, hdev->hw_error_code); else bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code); if (!hci_dev_do_close(hdev)) hci_dev_do_open(hdev); hci_dev_put(hdev); } void hci_uuids_clear(struct hci_dev *hdev) { struct bt_uuid *uuid, *tmp; list_for_each_entry_safe(uuid, tmp, &hdev->uuids, list) { list_del(&uuid->list); kfree(uuid); } } void hci_link_keys_clear(struct hci_dev *hdev) { struct link_key *key, *tmp; list_for_each_entry_safe(key, tmp, &hdev->link_keys, list) { list_del_rcu(&key->list); kfree_rcu(key, rcu); } } void hci_smp_ltks_clear(struct hci_dev *hdev) { struct smp_ltk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } } void hci_smp_irks_clear(struct hci_dev *hdev) { struct smp_irk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } } void hci_blocked_keys_clear(struct hci_dev *hdev) { struct blocked_key *b, *tmp; list_for_each_entry_safe(b, tmp, &hdev->blocked_keys, list) { list_del_rcu(&b->list); kfree_rcu(b, rcu); } } bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16]) { bool blocked = false; struct blocked_key *b; rcu_read_lock(); list_for_each_entry_rcu(b, &hdev->blocked_keys, list) { if (b->type == type && !memcmp(b->val, val, sizeof(b->val))) { blocked = true; break; } } rcu_read_unlock(); return blocked; } struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct link_key *k; rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->link_keys, list) { if (bacmp(bdaddr, &k->bdaddr) == 0) { rcu_read_unlock(); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LINKKEY, k->val)) { bt_dev_warn_ratelimited(hdev, "Link key blocked for %pMR", &k->bdaddr); return NULL; } return k; } } rcu_read_unlock(); return NULL; } static bool hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn, u8 key_type, u8 old_key_type) { /* Legacy key */ if (key_type < 0x03) return true; /* Debug keys are insecure so don't store them persistently */ if (key_type == HCI_LK_DEBUG_COMBINATION) return false; /* Changed combination key and there's no previous one */ if (key_type == HCI_LK_CHANGED_COMBINATION && old_key_type == 0xff) return false; /* Security mode 3 case */ if (!conn) return true; /* BR/EDR key derived using SC from an LE link */ if (conn->type == LE_LINK) return true; /* Neither local nor remote side had no-bonding as requirement */ if (conn->auth_type > 0x01 && conn->remote_auth > 0x01) return true; /* Local side had dedicated bonding as requirement */ if (conn->auth_type == 0x02 || conn->auth_type == 0x03) return true; /* Remote side had dedicated bonding as requirement */ if (conn->remote_auth == 0x02 || conn->remote_auth == 0x03) return true; /* If none of the above criteria match, then don't store the key * persistently */ return false; } static u8 ltk_role(u8 type) { if (type == SMP_LTK) return HCI_ROLE_MASTER; return HCI_ROLE_SLAVE; } struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 role) { struct smp_ltk *k; rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { if (addr_type != k->bdaddr_type || bacmp(bdaddr, &k->bdaddr)) continue; if (smp_ltk_is_sc(k) || ltk_role(k->type) == role) { rcu_read_unlock(); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK, k->val)) { bt_dev_warn_ratelimited(hdev, "LTK blocked for %pMR", &k->bdaddr); return NULL; } return k; } } rcu_read_unlock(); return NULL; } struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa) { struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; rcu_read_lock(); list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (!bacmp(&irk->rpa, rpa)) { irk_to_return = irk; goto done; } } list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (smp_irk_matches(hdev, irk->val, rpa)) { bacpy(&irk->rpa, rpa); irk_to_return = irk; goto done; } } done: if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, irk_to_return->val)) { bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", &irk_to_return->bdaddr); irk_to_return = NULL; } rcu_read_unlock(); return irk_to_return; } struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; /* Identity Address must be public or static random */ if (addr_type == ADDR_LE_DEV_RANDOM && (bdaddr->b[5] & 0xc0) != 0xc0) return NULL; rcu_read_lock(); list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (addr_type == irk->addr_type && bacmp(bdaddr, &irk->bdaddr) == 0) { irk_to_return = irk; break; } } if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, irk_to_return->val)) { bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", &irk_to_return->bdaddr); irk_to_return = NULL; } rcu_read_unlock(); return irk_to_return; } struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, bdaddr_t *bdaddr, u8 *val, u8 type, u8 pin_len, bool *persistent) { struct link_key *key, *old_key; u8 old_key_type; old_key = hci_find_link_key(hdev, bdaddr); if (old_key) { old_key_type = old_key->type; key = old_key; } else { old_key_type = conn ? conn->key_type : 0xff; key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) return NULL; list_add_rcu(&key->list, &hdev->link_keys); } BT_DBG("%s key for %pMR type %u", hdev->name, bdaddr, type); /* Some buggy controller combinations generate a changed * combination key for legacy pairing even when there's no * previous key */ if (type == HCI_LK_CHANGED_COMBINATION && (!conn || conn->remote_auth == 0xff) && old_key_type == 0xff) { type = HCI_LK_COMBINATION; if (conn) conn->key_type = type; } bacpy(&key->bdaddr, bdaddr); memcpy(key->val, val, HCI_LINK_KEY_SIZE); key->pin_len = pin_len; if (type == HCI_LK_CHANGED_COMBINATION) key->type = old_key_type; else key->type = type; if (persistent) *persistent = hci_persistent_key(hdev, conn, type, old_key_type); return key; } struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 type, u8 authenticated, u8 tk[16], u8 enc_size, __le16 ediv, __le64 rand) { struct smp_ltk *key, *old_key; u8 role = ltk_role(type); old_key = hci_find_ltk(hdev, bdaddr, addr_type, role); if (old_key) key = old_key; else { key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) return NULL; list_add_rcu(&key->list, &hdev->long_term_keys); } bacpy(&key->bdaddr, bdaddr); key->bdaddr_type = addr_type; memcpy(key->val, tk, sizeof(key->val)); key->authenticated = authenticated; key->ediv = ediv; key->rand = rand; key->enc_size = enc_size; key->type = type; return key; } struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 val[16], bdaddr_t *rpa) { struct smp_irk *irk; irk = hci_find_irk_by_addr(hdev, bdaddr, addr_type); if (!irk) { irk = kzalloc(sizeof(*irk), GFP_KERNEL); if (!irk) return NULL; bacpy(&irk->bdaddr, bdaddr); irk->addr_type = addr_type; list_add_rcu(&irk->list, &hdev->identity_resolving_keys); } memcpy(irk->val, val, 16); bacpy(&irk->rpa, rpa); return irk; } int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct link_key *key; key = hci_find_link_key(hdev, bdaddr); if (!key) return -ENOENT; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&key->list); kfree_rcu(key, rcu); return 0; } int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct smp_ltk *k, *tmp; int removed = 0; list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { if (bacmp(bdaddr, &k->bdaddr) || k->bdaddr_type != bdaddr_type) continue; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&k->list); kfree_rcu(k, rcu); removed++; } return removed ? 0 : -ENOENT; } void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct smp_irk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { if (bacmp(bdaddr, &k->bdaddr) || k->addr_type != addr_type) continue; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&k->list); kfree_rcu(k, rcu); } } bool hci_bdaddr_is_paired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) { struct smp_ltk *k; struct smp_irk *irk; u8 addr_type; if (type == BDADDR_BREDR) { if (hci_find_link_key(hdev, bdaddr)) return true; return false; } /* Convert to HCI addr type which struct smp_ltk uses */ if (type == BDADDR_LE_PUBLIC) addr_type = ADDR_LE_DEV_PUBLIC; else addr_type = ADDR_LE_DEV_RANDOM; irk = hci_get_irk(hdev, bdaddr, addr_type); if (irk) { bdaddr = &irk->bdaddr; addr_type = irk->addr_type; } rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { if (k->bdaddr_type == addr_type && !bacmp(bdaddr, &k->bdaddr)) { rcu_read_unlock(); return true; } } rcu_read_unlock(); return false; } /* HCI command timer function */ static void hci_cmd_timeout(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_timer.work); if (hdev->req_skb) { u16 opcode = hci_skb_opcode(hdev->req_skb); bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode); hci_cmd_sync_cancel_sync(hdev, ETIMEDOUT); } else { bt_dev_err(hdev, "command tx timeout"); } if (hdev->reset) hdev->reset(hdev); atomic_set(&hdev->cmd_cnt, 1); queue_work(hdev->workqueue, &hdev->cmd_work); } /* HCI ncmd timer function */ static void hci_ncmd_timeout(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, ncmd_timer.work); bt_dev_err(hdev, "Controller not accepting commands anymore: ncmd = 0"); /* During HCI_INIT phase no events can be injected if the ncmd timer * triggers since the procedure has its own timeout handling. */ if (test_bit(HCI_INIT, &hdev->flags)) return; /* This is an irrecoverable state, inject hardware error event */ hci_reset_dev(hdev); } struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct oob_data *data; list_for_each_entry(data, &hdev->remote_oob_data, list) { if (bacmp(bdaddr, &data->bdaddr) != 0) continue; if (data->bdaddr_type != bdaddr_type) continue; return data; } return NULL; } int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct oob_data *data; data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type); if (!data) return -ENOENT; BT_DBG("%s removing %pMR (%u)", hdev->name, bdaddr, bdaddr_type); list_del(&data->list); kfree(data); return 0; } void hci_remote_oob_data_clear(struct hci_dev *hdev) { struct oob_data *data, *n; list_for_each_entry_safe(data, n, &hdev->remote_oob_data, list) { list_del(&data->list); kfree(data); } } int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type, u8 *hash192, u8 *rand192, u8 *hash256, u8 *rand256) { struct oob_data *data; data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type); if (!data) { data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; bacpy(&data->bdaddr, bdaddr); data->bdaddr_type = bdaddr_type; list_add(&data->list, &hdev->remote_oob_data); } if (hash192 && rand192) { memcpy(data->hash192, hash192, sizeof(data->hash192)); memcpy(data->rand192, rand192, sizeof(data->rand192)); if (hash256 && rand256) data->present = 0x03; } else { memset(data->hash192, 0, sizeof(data->hash192)); memset(data->rand192, 0, sizeof(data->rand192)); if (hash256 && rand256) data->present = 0x02; else data->present = 0x00; } if (hash256 && rand256) { memcpy(data->hash256, hash256, sizeof(data->hash256)); memcpy(data->rand256, rand256, sizeof(data->rand256)); } else { memset(data->hash256, 0, sizeof(data->hash256)); memset(data->rand256, 0, sizeof(data->rand256)); if (hash192 && rand192) data->present = 0x01; } BT_DBG("%s for %pMR", hdev->name, bdaddr); return 0; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; list_for_each_entry(adv_instance, &hdev->adv_instances, list) { if (adv_instance->instance == instance) return adv_instance; } return NULL; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid) { struct adv_info *adv; list_for_each_entry(adv, &hdev->adv_instances, list) { if (adv->sid == sid) return adv; } return NULL; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *cur_instance; cur_instance = hci_find_adv_instance(hdev, instance); if (!cur_instance) return NULL; if (cur_instance == list_last_entry(&hdev->adv_instances, struct adv_info, list)) return list_first_entry(&hdev->adv_instances, struct adv_info, list); else return list_next_entry(cur_instance, list); } /* This function requires the caller holds hdev->lock */ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) return -ENOENT; BT_DBG("%s removing %dMR", hdev->name, instance); if (hdev->cur_adv_instance == instance) { if (hdev->adv_instance_timeout) { cancel_delayed_work(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } hdev->cur_adv_instance = 0x00; } cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); hdev->adv_instance_cnt--; return 0; } void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired) { struct adv_info *adv_instance, *n; list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) adv_instance->rpa_expired = rpa_expired; } /* This function requires the caller holds hdev->lock */ void hci_adv_instances_clear(struct hci_dev *hdev) { struct adv_info *adv_instance, *n; if (hdev->adv_instance_timeout) { disable_delayed_work(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { disable_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); } hdev->adv_instance_cnt = 0; hdev->cur_adv_instance = 0x00; } static void adv_instance_rpa_expired(struct work_struct *work) { struct adv_info *adv_instance = container_of(work, struct adv_info, rpa_expired_cb.work); BT_DBG(""); adv_instance->rpa_expired = true; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data, u16 timeout, u16 duration, s8 tx_power, u32 min_interval, u32 max_interval, u8 mesh_handle) { struct adv_info *adv; adv = hci_find_adv_instance(hdev, instance); if (adv) { memset(adv->adv_data, 0, sizeof(adv->adv_data)); memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data)); memset(adv->per_adv_data, 0, sizeof(adv->per_adv_data)); } else { if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets || instance < 1 || instance > hdev->le_num_of_adv_sets + 1) return ERR_PTR(-EOVERFLOW); adv = kzalloc(sizeof(*adv), GFP_KERNEL); if (!adv) return ERR_PTR(-ENOMEM); adv->pending = true; adv->instance = instance; /* If controller support only one set and the instance is set to * 1 then there is no option other than using handle 0x00. */ if (hdev->le_num_of_adv_sets == 1 && instance == 1) adv->handle = 0x00; else adv->handle = instance; list_add(&adv->list, &hdev->adv_instances); hdev->adv_instance_cnt++; } adv->flags = flags; adv->min_interval = min_interval; adv->max_interval = max_interval; adv->tx_power = tx_power; /* Defining a mesh_handle changes the timing units to ms, * rather than seconds, and ties the instance to the requested * mesh_tx queue. */ adv->mesh = mesh_handle; hci_set_adv_instance_data(hdev, instance, adv_data_len, adv_data, scan_rsp_len, scan_rsp_data); adv->timeout = timeout; adv->remaining_time = timeout; if (duration == 0) adv->duration = hdev->def_multi_adv_rotation_duration; else adv->duration = duration; INIT_DELAYED_WORK(&adv->rpa_expired_cb, adv_instance_rpa_expired); BT_DBG("%s for %dMR", hdev->name, instance); return adv; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval) { struct adv_info *adv; adv = hci_add_adv_instance(hdev, instance, flags, 0, NULL, 0, NULL, 0, 0, HCI_ADV_TX_POWER_NO_PREFERENCE, min_interval, max_interval, 0); if (IS_ERR(adv)) return adv; adv->sid = sid; adv->periodic = true; adv->per_adv_data_len = data_len; if (data) memcpy(adv->per_adv_data, data, data_len); return adv; } /* This function requires the caller holds hdev->lock */ int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data) { struct adv_info *adv; adv = hci_find_adv_instance(hdev, instance); /* If advertisement doesn't exist, we can't modify its data */ if (!adv) return -ENOENT; if (adv_data_len && ADV_DATA_CMP(adv, adv_data, adv_data_len)) { memset(adv->adv_data, 0, sizeof(adv->adv_data)); memcpy(adv->adv_data, adv_data, adv_data_len); adv->adv_data_len = adv_data_len; adv->adv_data_changed = true; } if (scan_rsp_len && SCAN_RSP_CMP(adv, scan_rsp_data, scan_rsp_len)) { memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data)); memcpy(adv->scan_rsp_data, scan_rsp_data, scan_rsp_len); adv->scan_rsp_len = scan_rsp_len; adv->scan_rsp_changed = true; } /* Mark as changed if there are flags which would affect it */ if (((adv->flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) || adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) adv->scan_rsp_changed = true; return 0; } /* This function requires the caller holds hdev->lock */ u32 hci_adv_instance_flags(struct hci_dev *hdev, u8 instance) { u32 flags; struct adv_info *adv; if (instance == 0x00) { /* Instance 0 always manages the "Tx Power" and "Flags" * fields */ flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting * corresponds to the "connectable" instance flag. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) flags |= MGMT_ADV_FLAG_CONNECTABLE; if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) flags |= MGMT_ADV_FLAG_DISCOV; return flags; } adv = hci_find_adv_instance(hdev, instance); /* Return 0 when we got an invalid instance identifier. */ if (!adv) return 0; return adv->flags; } bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance) { struct adv_info *adv; /* Instance 0x00 always set local name */ if (instance == 0x00) return true; adv = hci_find_adv_instance(hdev, instance); if (!adv) return false; if (adv->flags & MGMT_ADV_FLAG_APPEARANCE || adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) return true; return adv->scan_rsp_len ? true : false; } /* This function requires the caller holds hdev->lock */ void hci_adv_monitors_clear(struct hci_dev *hdev) { struct adv_monitor *monitor; int handle; idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) hci_free_adv_monitor(hdev, monitor); idr_destroy(&hdev->adv_monitors_idr); } /* Frees the monitor structure and do some bookkeepings. * This function requires the caller holds hdev->lock. */ void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { struct adv_pattern *pattern; struct adv_pattern *tmp; if (!monitor) return; list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) { list_del(&pattern->list); kfree(pattern); } if (monitor->handle) idr_remove(&hdev->adv_monitors_idr, monitor->handle); if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED) hdev->adv_monitors_cnt--; kfree(monitor); } /* Assigns handle to a monitor, and if offloading is supported and power is on, * also attempts to forward the request to the controller. * This function requires the caller holds hci_req_sync_lock. */ int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { int min, max, handle; int status = 0; if (!monitor) return -EINVAL; hci_dev_lock(hdev); min = HCI_MIN_ADV_MONITOR_HANDLE; max = HCI_MIN_ADV_MONITOR_HANDLE + HCI_MAX_ADV_MONITOR_NUM_HANDLES; handle = idr_alloc(&hdev->adv_monitors_idr, monitor, min, max, GFP_KERNEL); hci_dev_unlock(hdev); if (handle < 0) return handle; monitor->handle = handle; if (!hdev_is_powered(hdev)) return status; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: bt_dev_dbg(hdev, "add monitor %d status %d", monitor->handle, status); /* Message was not forwarded to controller - not an error */ break; case HCI_ADV_MONITOR_EXT_MSFT: status = msft_add_monitor_pattern(hdev, monitor); bt_dev_dbg(hdev, "add monitor %d msft status %d", handle, status); break; } return status; } /* Attempts to tell the controller and free the monitor. If somehow the * controller doesn't have a corresponding handle, remove anyway. * This function requires the caller holds hci_req_sync_lock. */ static int hci_remove_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { int status = 0; int handle; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */ bt_dev_dbg(hdev, "remove monitor %d status %d", monitor->handle, status); goto free_monitor; case HCI_ADV_MONITOR_EXT_MSFT: handle = monitor->handle; status = msft_remove_monitor(hdev, monitor); bt_dev_dbg(hdev, "remove monitor %d msft status %d", handle, status); break; } /* In case no matching handle registered, just free the monitor */ if (status == -ENOENT) goto free_monitor; return status; free_monitor: if (status == -ENOENT) bt_dev_warn(hdev, "Removing monitor with no matching handle %d", monitor->handle); hci_free_adv_monitor(hdev, monitor); return status; } /* This function requires the caller holds hci_req_sync_lock */ int hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle) { struct adv_monitor *monitor = idr_find(&hdev->adv_monitors_idr, handle); if (!monitor) return -EINVAL; return hci_remove_adv_monitor(hdev, monitor); } /* This function requires the caller holds hci_req_sync_lock */ int hci_remove_all_adv_monitor(struct hci_dev *hdev) { struct adv_monitor *monitor; int idr_next_id = 0; int status = 0; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &idr_next_id); if (!monitor) break; status = hci_remove_adv_monitor(hdev, monitor); if (status) return status; idr_next_id++; } return status; } /* This function requires the caller holds hdev->lock */ bool hci_is_adv_monitoring(struct hci_dev *hdev) { return !idr_is_empty(&hdev->adv_monitors_idr); } int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev) { if (msft_monitor_supported(hdev)) return HCI_ADV_MONITOR_EXT_MSFT; return HCI_ADV_MONITOR_EXT_NONE; } struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk( struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_irk *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } struct bdaddr_list_with_flags * hci_bdaddr_list_lookup_with_flags(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_flags *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } void hci_bdaddr_list_clear(struct list_head *bdaddr_list) { struct bdaddr_list *b, *n; list_for_each_entry_safe(b, n, bdaddr_list, list) { list_del(&b->list); kfree(b); } } int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; list_add(&entry->list, list); return 0; } int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type, u8 *peer_irk, u8 *local_irk) { struct bdaddr_list_with_irk *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; if (peer_irk) memcpy(entry->peer_irk, peer_irk, 16); if (local_irk) memcpy(entry->local_irk, local_irk, 16); list_add(&entry->list, list); return 0; } int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, u8 type, u32 flags) { struct bdaddr_list_with_flags *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; entry->flags = flags; list_add(&entry->list, list); return 0; } int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; if (!bacmp(bdaddr, BDADDR_ANY)) { hci_bdaddr_list_clear(list); return 0; } entry = hci_bdaddr_list_lookup(list, bdaddr, type); if (!entry) return -ENOENT; list_del(&entry->list); kfree(entry); return 0; } int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_irk *entry; if (!bacmp(bdaddr, BDADDR_ANY)) { hci_bdaddr_list_clear(list); return 0; } entry = hci_bdaddr_list_lookup_with_irk(list, bdaddr, type); if (!entry) return -ENOENT; list_del(&entry->list); kfree(entry); return 0; } /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; list_for_each_entry(params, &hdev->le_conn_params, list) { if (bacmp(&params->addr, addr) == 0 && params->addr_type == addr_type) { return params; } } return NULL; } /* This function requires the caller holds hdev->lock or rcu_read_lock */ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *param; rcu_read_lock(); list_for_each_entry_rcu(param, list, action) { if (bacmp(&param->addr, addr) == 0 && param->addr_type == addr_type) { rcu_read_unlock(); return param; } } rcu_read_unlock(); return NULL; } /* This function requires the caller holds hdev->lock */ void hci_pend_le_list_del_init(struct hci_conn_params *param) { if (list_empty(&param->action)) return; list_del_rcu(&param->action); synchronize_rcu(); INIT_LIST_HEAD(&param->action); } /* This function requires the caller holds hdev->lock */ void hci_pend_le_list_add(struct hci_conn_params *param, struct list_head *list) { list_add_rcu(&param->action, list); } /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; params = hci_conn_params_lookup(hdev, addr, addr_type); if (params) return params; params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) { bt_dev_err(hdev, "out of memory"); return NULL; } bacpy(&params->addr, addr); params->addr_type = addr_type; list_add(&params->list, &hdev->le_conn_params); INIT_LIST_HEAD(&params->action); params->conn_min_interval = hdev->le_conn_min_interval; params->conn_max_interval = hdev->le_conn_max_interval; params->conn_latency = hdev->le_conn_latency; params->supervision_timeout = hdev->le_supv_timeout; params->auto_connect = HCI_AUTO_CONN_DISABLED; BT_DBG("addr %pMR (type %u)", addr, addr_type); return params; } void hci_conn_params_free(struct hci_conn_params *params) { hci_pend_le_list_del_init(params); if (params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); } list_del(&params->list); kfree(params); } /* This function requires the caller holds hdev->lock */ void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; params = hci_conn_params_lookup(hdev, addr, addr_type); if (!params) return; hci_conn_params_free(params); hci_update_passive_scan(hdev); BT_DBG("addr %pMR (type %u)", addr, addr_type); } /* This function requires the caller holds hdev->lock */ void hci_conn_params_clear_disabled(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) { if (params->auto_connect != HCI_AUTO_CONN_DISABLED) continue; /* If trying to establish one time connection to disabled * device, leave the params, but mark them as just once. */ if (params->explicit_connect) { params->auto_connect = HCI_AUTO_CONN_EXPLICIT; continue; } hci_conn_params_free(params); } BT_DBG("All LE disabled connection parameters were removed"); } /* This function requires the caller holds hdev->lock */ static void hci_conn_params_clear_all(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) hci_conn_params_free(params); BT_DBG("All LE connection parameters were removed"); } /* Copy the Identity Address of the controller. * * If the controller has a public BD_ADDR, then by default use that one. * If this is a LE only controller without a public address, default to * the static random address. * * For debugging purposes it is possible to force controllers with a * public address to use the static random address instead. * * In case BR/EDR has been disabled on a dual-mode controller and * userspace has configured a static address, then that address * becomes the identity address instead of the public BR/EDR address. */ void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 *bdaddr_type) { if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || !bacmp(&hdev->bdaddr, BDADDR_ANY) || (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && bacmp(&hdev->static_addr, BDADDR_ANY))) { bacpy(bdaddr, &hdev->static_addr); *bdaddr_type = ADDR_LE_DEV_RANDOM; } else { bacpy(bdaddr, &hdev->bdaddr); *bdaddr_type = ADDR_LE_DEV_PUBLIC; } } static void hci_clear_wake_reason(struct hci_dev *hdev) { hci_dev_lock(hdev); hdev->wake_reason = 0; bacpy(&hdev->wake_addr, BDADDR_ANY); hdev->wake_addr_type = 0; hci_dev_unlock(hdev); } static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct hci_dev *hdev = container_of(nb, struct hci_dev, suspend_notifier); int ret = 0; /* Userspace has full control of this device. Do nothing. */ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return NOTIFY_DONE; /* To avoid a potential race with hci_unregister_dev. */ hci_dev_hold(hdev); switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: ret = hci_suspend_dev(hdev); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: ret = hci_resume_dev(hdev); break; } if (ret) bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", action, ret); hci_dev_put(hdev); return NOTIFY_DONE; } /* Alloc HCI device */ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) { struct hci_dev *hdev; unsigned int alloc_size; alloc_size = sizeof(*hdev); if (sizeof_priv) { /* Fixme: May need ALIGN-ment? */ alloc_size += sizeof_priv; } hdev = kzalloc(alloc_size, GFP_KERNEL); if (!hdev) return NULL; if (init_srcu_struct(&hdev->srcu)) { kfree(hdev); return NULL; } hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); hdev->esco_type = (ESCO_HV1); hdev->link_mode = (HCI_LM_ACCEPT); hdev->num_iac = 0x01; /* One IAC support is mandatory */ hdev->io_capability = 0x03; /* No Input No Output */ hdev->manufacturer = 0xffff; /* Default to internal use */ hdev->inq_tx_power = HCI_TX_POWER_INVALID; hdev->adv_tx_power = HCI_TX_POWER_INVALID; hdev->adv_instance_cnt = 0; hdev->cur_adv_instance = 0x00; hdev->adv_instance_timeout = 0; hdev->advmon_allowlist_duration = 300; hdev->advmon_no_filter_duration = 500; hdev->enable_advmon_interleave_scan = 0x00; /* Default to disable */ hdev->sniff_max_interval = 800; hdev->sniff_min_interval = 80; hdev->le_adv_channel_map = 0x07; hdev->le_adv_min_interval = 0x0800; hdev->le_adv_max_interval = 0x0800; hdev->le_scan_interval = DISCOV_LE_SCAN_INT_FAST; hdev->le_scan_window = DISCOV_LE_SCAN_WIN_FAST; hdev->le_scan_int_suspend = DISCOV_LE_SCAN_INT_SLOW1; hdev->le_scan_window_suspend = DISCOV_LE_SCAN_WIN_SLOW1; hdev->le_scan_int_discovery = DISCOV_LE_SCAN_INT; hdev->le_scan_window_discovery = DISCOV_LE_SCAN_WIN; hdev->le_scan_int_adv_monitor = DISCOV_LE_SCAN_INT_FAST; hdev->le_scan_window_adv_monitor = DISCOV_LE_SCAN_WIN_FAST; hdev->le_scan_int_connect = DISCOV_LE_SCAN_INT_CONN; hdev->le_scan_window_connect = DISCOV_LE_SCAN_WIN_CONN; hdev->le_conn_min_interval = 0x0018; hdev->le_conn_max_interval = 0x0028; hdev->le_conn_latency = 0x0000; hdev->le_supv_timeout = 0x002a; hdev->le_def_tx_len = 0x001b; hdev->le_def_tx_time = 0x0148; hdev->le_max_tx_len = 0x001b; hdev->le_max_tx_time = 0x0148; hdev->le_max_rx_len = 0x001b; hdev->le_max_rx_time = 0x0148; hdev->le_max_key_size = SMP_MAX_ENC_KEY_SIZE; hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE; hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES; hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION; hdev->def_le_autoconnect_timeout = HCI_LE_CONN_TIMEOUT; hdev->min_le_tx_power = HCI_TX_POWER_INVALID; hdev->max_le_tx_power = HCI_TX_POWER_INVALID; hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE; hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE; hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; hdev->min_enc_key_size = HCI_MIN_ENC_KEY_SIZE; /* default 1.28 sec page scan */ hdev->def_page_scan_type = PAGE_SCAN_TYPE_STANDARD; hdev->def_page_scan_int = 0x0800; hdev->def_page_scan_window = 0x0012; mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); mutex_init(&hdev->mgmt_pending_lock); ida_init(&hdev->unset_handle_ida); INIT_LIST_HEAD(&hdev->mesh_pending); INIT_LIST_HEAD(&hdev->mgmt_pending); INIT_LIST_HEAD(&hdev->reject_list); INIT_LIST_HEAD(&hdev->accept_list); INIT_LIST_HEAD(&hdev->uuids); INIT_LIST_HEAD(&hdev->link_keys); INIT_LIST_HEAD(&hdev->long_term_keys); INIT_LIST_HEAD(&hdev->identity_resolving_keys); INIT_LIST_HEAD(&hdev->remote_oob_data); INIT_LIST_HEAD(&hdev->le_accept_list); INIT_LIST_HEAD(&hdev->le_resolv_list); INIT_LIST_HEAD(&hdev->le_conn_params); INIT_LIST_HEAD(&hdev->pend_le_conns); INIT_LIST_HEAD(&hdev->pend_le_reports); INIT_LIST_HEAD(&hdev->conn_hash.list); INIT_LIST_HEAD(&hdev->adv_instances); INIT_LIST_HEAD(&hdev->blocked_keys); INIT_LIST_HEAD(&hdev->monitored_devices); INIT_LIST_HEAD(&hdev->local_codecs); INIT_WORK(&hdev->rx_work, hci_rx_work); INIT_WORK(&hdev->cmd_work, hci_cmd_work); INIT_WORK(&hdev->tx_work, hci_tx_work); INIT_WORK(&hdev->power_on, hci_power_on); INIT_WORK(&hdev->error_reset, hci_error_reset); hci_cmd_sync_init(hdev); INIT_DELAYED_WORK(&hdev->power_off, hci_power_off); skb_queue_head_init(&hdev->rx_q); skb_queue_head_init(&hdev->cmd_q); skb_queue_head_init(&hdev->raw_q); init_waitqueue_head(&hdev->req_wait_q); INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout); INIT_DELAYED_WORK(&hdev->ncmd_timer, hci_ncmd_timeout); hci_devcd_setup(hdev); hci_init_sysfs(hdev); discovery_init(hdev); return hdev; } EXPORT_SYMBOL(hci_alloc_dev_priv); /* Free HCI device */ void hci_free_dev(struct hci_dev *hdev) { /* will free via device release */ put_device(&hdev->dev); } EXPORT_SYMBOL(hci_free_dev); /* Register HCI device */ int hci_register_dev(struct hci_dev *hdev) { int id, error; if (!hdev->open || !hdev->close || !hdev->send) return -EINVAL; id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL); if (id < 0) return id; error = dev_set_name(&hdev->dev, "hci%u", id); if (error) return error; hdev->name = dev_name(&hdev->dev); hdev->id = id; BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); hdev->workqueue = alloc_ordered_workqueue("%s", WQ_HIGHPRI, hdev->name); if (!hdev->workqueue) { error = -ENOMEM; goto err; } hdev->req_workqueue = alloc_ordered_workqueue("%s", WQ_HIGHPRI, hdev->name); if (!hdev->req_workqueue) { destroy_workqueue(hdev->workqueue); error = -ENOMEM; goto err; } if (!IS_ERR_OR_NULL(bt_debugfs)) hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs); error = device_add(&hdev->dev); if (error < 0) goto err_wqueue; hci_leds_init(hdev); hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev, RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, hdev); if (hdev->rfkill) { if (rfkill_register(hdev->rfkill) < 0) { rfkill_destroy(hdev->rfkill); hdev->rfkill = NULL; } } if (hdev->rfkill && rfkill_blocked(hdev->rfkill)) hci_dev_set_flag(hdev, HCI_RFKILLED); hci_dev_set_flag(hdev, HCI_SETUP); hci_dev_set_flag(hdev, HCI_AUTO_OFF); /* Assume BR/EDR support until proven otherwise (such as * through reading supported features during init. */ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); write_lock(&hci_dev_list_lock); list_add(&hdev->list, &hci_dev_list); write_unlock(&hci_dev_list_lock); /* Devices that are marked for raw-only usage are unconfigured * and should not be included in normal operation. */ if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE)) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* Mark Remote Wakeup connection flag as supported if driver has wakeup * callback. */ if (hdev->wakeup) hdev->conn_flags |= HCI_CONN_FLAG_REMOTE_WAKEUP; hci_sock_dev_event(hdev, HCI_DEV_REG); hci_dev_hold(hdev); error = hci_register_suspend_notifier(hdev); if (error) BT_WARN("register suspend notifier failed error:%d\n", error); queue_work(hdev->req_workqueue, &hdev->power_on); idr_init(&hdev->adv_monitors_idr); msft_register(hdev); return id; err_wqueue: debugfs_remove_recursive(hdev->debugfs); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); err: ida_free(&hci_index_ida, hdev->id); return error; } EXPORT_SYMBOL(hci_register_dev); /* Unregister HCI device */ void hci_unregister_dev(struct hci_dev *hdev) { BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); mutex_lock(&hdev->unregister_lock); hci_dev_set_flag(hdev, HCI_UNREGISTER); mutex_unlock(&hdev->unregister_lock); write_lock(&hci_dev_list_lock); list_del(&hdev->list); write_unlock(&hci_dev_list_lock); synchronize_srcu(&hdev->srcu); cleanup_srcu_struct(&hdev->srcu); disable_work_sync(&hdev->rx_work); disable_work_sync(&hdev->cmd_work); disable_work_sync(&hdev->tx_work); disable_work_sync(&hdev->power_on); disable_work_sync(&hdev->error_reset); hci_cmd_sync_clear(hdev); hci_unregister_suspend_notifier(hdev); hci_dev_do_close(hdev); if (!test_bit(HCI_INIT, &hdev->flags) && !hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { hci_dev_lock(hdev); mgmt_index_removed(hdev); hci_dev_unlock(hdev); } /* mgmt_index_removed should take care of emptying the * pending list */ BUG_ON(!list_empty(&hdev->mgmt_pending)); hci_sock_dev_event(hdev, HCI_DEV_UNREG); if (hdev->rfkill) { rfkill_unregister(hdev->rfkill); rfkill_destroy(hdev->rfkill); } device_del(&hdev->dev); /* Actual cleanup is deferred until hci_release_dev(). */ hci_dev_put(hdev); } EXPORT_SYMBOL(hci_unregister_dev); /* Release HCI device */ void hci_release_dev(struct hci_dev *hdev) { debugfs_remove_recursive(hdev->debugfs); kfree_const(hdev->hw_info); kfree_const(hdev->fw_info); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); hci_dev_lock(hdev); hci_bdaddr_list_clear(&hdev->reject_list); hci_bdaddr_list_clear(&hdev->accept_list); hci_uuids_clear(hdev); hci_link_keys_clear(hdev); hci_smp_ltks_clear(hdev); hci_smp_irks_clear(hdev); hci_remote_oob_data_clear(hdev); hci_adv_instances_clear(hdev); hci_adv_monitors_clear(hdev); hci_bdaddr_list_clear(&hdev->le_accept_list); hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); hci_blocked_keys_clear(hdev); hci_codec_list_clear(&hdev->local_codecs); msft_release(hdev); hci_dev_unlock(hdev); ida_destroy(&hdev->unset_handle_ida); ida_free(&hci_index_ida, hdev->id); kfree_skb(hdev->sent_cmd); kfree_skb(hdev->req_skb); kfree_skb(hdev->recv_event); kfree(hdev); } EXPORT_SYMBOL(hci_release_dev); int hci_register_suspend_notifier(struct hci_dev *hdev) { int ret = 0; if (!hdev->suspend_notifier.notifier_call && !hci_test_quirk(hdev, HCI_QUIRK_NO_SUSPEND_NOTIFIER)) { hdev->suspend_notifier.notifier_call = hci_suspend_notifier; ret = register_pm_notifier(&hdev->suspend_notifier); } return ret; } int hci_unregister_suspend_notifier(struct hci_dev *hdev) { int ret = 0; if (hdev->suspend_notifier.notifier_call) { ret = unregister_pm_notifier(&hdev->suspend_notifier); if (!ret) hdev->suspend_notifier.notifier_call = NULL; } return ret; } /* Cancel ongoing command synchronously: * * - Cancel command timer * - Reset command counter * - Cancel command request */ static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { disable_delayed_work_sync(&hdev->cmd_timer); disable_delayed_work_sync(&hdev->ncmd_timer); } else { cancel_delayed_work_sync(&hdev->cmd_timer); cancel_delayed_work_sync(&hdev->ncmd_timer); } atomic_set(&hdev->cmd_cnt, 1); hci_cmd_sync_cancel_sync(hdev, err); } /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); /* Suspend should only act on when powered. */ if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* If powering down don't attempt to suspend */ if (mgmt_powering_down(hdev)) return 0; /* Cancel potentially blocking sync operation before suspend */ hci_cancel_cmd_sync(hdev, EHOSTDOWN); hci_req_sync_lock(hdev); ret = hci_suspend_sync(hdev); hci_req_sync_unlock(hdev); hci_clear_wake_reason(hdev); mgmt_suspending(hdev, hdev->suspend_state); hci_sock_dev_event(hdev, HCI_DEV_SUSPEND); return ret; } EXPORT_SYMBOL(hci_suspend_dev); /* Resume HCI device */ int hci_resume_dev(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); /* Resume should only act on when powered. */ if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* If powering down don't attempt to resume */ if (mgmt_powering_down(hdev)) return 0; hci_req_sync_lock(hdev); ret = hci_resume_sync(hdev); hci_req_sync_unlock(hdev); mgmt_resuming(hdev, hdev->wake_reason, &hdev->wake_addr, hdev->wake_addr_type); hci_sock_dev_event(hdev, HCI_DEV_RESUME); return ret; } EXPORT_SYMBOL(hci_resume_dev); /* Reset HCI device */ int hci_reset_dev(struct hci_dev *hdev) { static const u8 hw_err[] = { HCI_EV_HARDWARE_ERROR, 0x01, 0x00 }; struct sk_buff *skb; skb = bt_skb_alloc(3, GFP_ATOMIC); if (!skb) return -ENOMEM; hci_skb_pkt_type(skb) = HCI_EVENT_PKT; skb_put_data(skb, hw_err, 3); bt_dev_err(hdev, "Injecting HCI hardware error event"); /* Send Hardware Error to upper stack */ return hci_recv_frame(hdev, skb); } EXPORT_SYMBOL(hci_reset_dev); static u8 hci_dev_classify_pkt_type(struct hci_dev *hdev, struct sk_buff *skb) { if (hdev->classify_pkt_type) return hdev->classify_pkt_type(hdev, skb); return hci_skb_pkt_type(skb); } /* Receive frame from HCI drivers */ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) { u8 dev_pkt_type; if (!hdev || (!test_bit(HCI_UP, &hdev->flags) && !test_bit(HCI_INIT, &hdev->flags))) { kfree_skb(skb); return -ENXIO; } /* Check if the driver agree with packet type classification */ dev_pkt_type = hci_dev_classify_pkt_type(hdev, skb); if (hci_skb_pkt_type(skb) != dev_pkt_type) { hci_skb_pkt_type(skb) = dev_pkt_type; } switch (hci_skb_pkt_type(skb)) { case HCI_EVENT_PKT: break; case HCI_ACLDATA_PKT: /* Detect if ISO packet has been sent as ACL */ if (hci_conn_num(hdev, CIS_LINK) || hci_conn_num(hdev, BIS_LINK) || hci_conn_num(hdev, PA_LINK)) { __u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle); __u8 type; type = hci_conn_lookup_type(hdev, hci_handle(handle)); if (type == CIS_LINK || type == BIS_LINK || type == PA_LINK) hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; } break; case HCI_SCODATA_PKT: break; case HCI_ISODATA_PKT: break; case HCI_DRV_PKT: break; default: kfree_skb(skb); return -EINVAL; } /* Incoming skb */ bt_cb(skb)->incoming = 1; /* Time stamp */ __net_timestamp(skb); skb_queue_tail(&hdev->rx_q, skb); queue_work(hdev->workqueue, &hdev->rx_work); return 0; } EXPORT_SYMBOL(hci_recv_frame); /* Receive diagnostic message from HCI drivers */ int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb) { /* Mark as diagnostic packet */ hci_skb_pkt_type(skb) = HCI_DIAG_PKT; /* Time stamp */ __net_timestamp(skb); skb_queue_tail(&hdev->rx_q, skb); queue_work(hdev->workqueue, &hdev->rx_work); return 0; } EXPORT_SYMBOL(hci_recv_diag); void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...) { va_list vargs; va_start(vargs, fmt); kfree_const(hdev->hw_info); hdev->hw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); va_end(vargs); } EXPORT_SYMBOL(hci_set_hw_info); void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...) { va_list vargs; va_start(vargs, fmt); kfree_const(hdev->fw_info); hdev->fw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); va_end(vargs); } EXPORT_SYMBOL(hci_set_fw_info); /* ---- Interface to upper protocols ---- */ int hci_register_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); mutex_lock(&hci_cb_list_lock); list_add_tail(&cb->list, &hci_cb_list); mutex_unlock(&hci_cb_list_lock); return 0; } EXPORT_SYMBOL(hci_register_cb); int hci_unregister_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); mutex_lock(&hci_cb_list_lock); list_del(&cb->list); mutex_unlock(&hci_cb_list_lock); return 0; } EXPORT_SYMBOL(hci_unregister_cb); static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) { int err; BT_DBG("%s type %d len %d", hdev->name, hci_skb_pkt_type(skb), skb->len); /* Time stamp */ __net_timestamp(skb); /* Send copy to monitor */ hci_send_to_monitor(hdev, skb); if (atomic_read(&hdev->promisc)) { /* Send copy to the sockets */ hci_send_to_sock(hdev, skb); } /* Get rid of skb owner, prior to sending to the driver. */ skb_orphan(skb); if (!test_bit(HCI_RUNNING, &hdev->flags)) { kfree_skb(skb); return -EINVAL; } if (hci_skb_pkt_type(skb) == HCI_DRV_PKT) { /* Intercept HCI Drv packet here and don't go with hdev->send * callback. */ err = hci_drv_process_cmd(hdev, skb); kfree_skb(skb); return err; } err = hdev->send(hdev, skb); if (err < 0) { bt_dev_err(hdev, "sending frame failed (%d)", err); kfree_skb(skb); return err; } return 0; } static int hci_send_conn_frame(struct hci_dev *hdev, struct hci_conn *conn, struct sk_buff *skb) { hci_conn_tx_queue(conn, skb); return hci_send_frame(hdev, skb); } /* Send HCI command */ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, const void *param) { struct sk_buff *skb; BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen); skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL); if (!skb) { bt_dev_err(hdev, "no memory for command"); return -ENOMEM; } /* Stand-alone HCI commands must be flagged as * single-command requests. */ bt_cb(skb)->hci.req_flags |= HCI_REQ_START; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); return 0; } int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param) { struct sk_buff *skb; if (hci_opcode_ogf(opcode) != 0x3f) { /* A controller receiving a command shall respond with either * a Command Status Event or a Command Complete Event. * Therefore, all standard HCI commands must be sent via the * standard API, using hci_send_cmd or hci_cmd_sync helpers. * Some vendors do not comply with this rule for vendor-specific * commands and do not return any event. We want to support * unresponded commands for such cases only. */ bt_dev_err(hdev, "unresponded command not supported"); return -EINVAL; } skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL); if (!skb) { bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)", opcode); return -ENOMEM; } hci_send_frame(hdev, skb); return 0; } EXPORT_SYMBOL(__hci_cmd_send); /* Get data from the previously sent command */ static void *hci_cmd_data(struct sk_buff *skb, __u16 opcode) { struct hci_command_hdr *hdr; if (!skb || skb->len < HCI_COMMAND_HDR_SIZE) return NULL; hdr = (void *)skb->data; if (hdr->opcode != cpu_to_le16(opcode)) return NULL; return skb->data + HCI_COMMAND_HDR_SIZE; } /* Get data from the previously sent command */ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) { void *data; /* Check if opcode matches last sent command */ data = hci_cmd_data(hdev->sent_cmd, opcode); if (!data) /* Check if opcode matches last request */ data = hci_cmd_data(hdev->req_skb, opcode); return data; } /* Get data from last received event */ void *hci_recv_event_data(struct hci_dev *hdev, __u8 event) { struct hci_event_hdr *hdr; int offset; if (!hdev->recv_event) return NULL; hdr = (void *)hdev->recv_event->data; offset = sizeof(*hdr); if (hdr->evt != event) { /* In case of LE metaevent check the subevent match */ if (hdr->evt == HCI_EV_LE_META) { struct hci_ev_le_meta *ev; ev = (void *)hdev->recv_event->data + offset; offset += sizeof(*ev); if (ev->subevent == event) goto found; } return NULL; } found: bt_dev_dbg(hdev, "event 0x%2.2x", event); return hdev->recv_event->data + offset; } /* Send ACL data */ static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags) { struct hci_acl_hdr *hdr; int len = skb->len; skb_push(skb, HCI_ACL_HDR_SIZE); skb_reset_transport_header(skb); hdr = (struct hci_acl_hdr *)skb_transport_header(skb); hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags)); hdr->dlen = cpu_to_le16(len); } static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, struct sk_buff *skb, __u16 flags) { struct hci_conn *conn = chan->conn; struct hci_dev *hdev = conn->hdev; struct sk_buff *list; skb->len = skb_headlen(skb); skb->data_len = 0; hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags); list = skb_shinfo(skb)->frag_list; if (!list) { /* Non fragmented */ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len); skb_queue_tail(queue, skb); } else { /* Fragmented */ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); skb_shinfo(skb)->frag_list = NULL; /* Queue all fragments atomically. We need to use spin_lock_bh * here because of 6LoWPAN links, as there this function is * called from softirq and using normal spin lock could cause * deadlocks. */ spin_lock_bh(&queue->lock); __skb_queue_tail(queue, skb); flags &= ~ACL_START; flags |= ACL_CONT; do { skb = list; list = list->next; hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); __skb_queue_tail(queue, skb); } while (list); spin_unlock_bh(&queue->lock); } bt_dev_dbg(hdev, "chan %p queued %d", chan, skb_queue_len(queue)); } void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags) { struct hci_dev *hdev = chan->conn->hdev; BT_DBG("%s chan %p flags 0x%4.4x", hdev->name, chan, flags); hci_queue_acl(chan, &chan->data_q, skb, flags); queue_work(hdev->workqueue, &hdev->tx_work); } /* Send SCO data */ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; struct hci_sco_hdr hdr; BT_DBG("%s len %d", hdev->name, skb->len); hdr.handle = cpu_to_le16(conn->handle); hdr.dlen = skb->len; skb_push(skb, HCI_SCO_HDR_SIZE); skb_reset_transport_header(skb); memcpy(skb_transport_header(skb), &hdr, HCI_SCO_HDR_SIZE); hci_skb_pkt_type(skb) = HCI_SCODATA_PKT; skb_queue_tail(&conn->data_q, skb); bt_dev_dbg(hdev, "hcon %p queued %d", conn, skb_queue_len(&conn->data_q)); queue_work(hdev->workqueue, &hdev->tx_work); } /* Send ISO data */ static void hci_add_iso_hdr(struct sk_buff *skb, __u16 handle, __u8 flags) { struct hci_iso_hdr *hdr; int len = skb->len; skb_push(skb, HCI_ISO_HDR_SIZE); skb_reset_transport_header(skb); hdr = (struct hci_iso_hdr *)skb_transport_header(skb); hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags)); hdr->dlen = cpu_to_le16(len); } static void hci_queue_iso(struct hci_conn *conn, struct sk_buff_head *queue, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; struct sk_buff *list; __u16 flags; skb->len = skb_headlen(skb); skb->data_len = 0; hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; list = skb_shinfo(skb)->frag_list; flags = hci_iso_flags_pack(list ? ISO_START : ISO_SINGLE, 0x00); hci_add_iso_hdr(skb, conn->handle, flags); if (!list) { /* Non fragmented */ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len); skb_queue_tail(queue, skb); } else { /* Fragmented */ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); skb_shinfo(skb)->frag_list = NULL; __skb_queue_tail(queue, skb); do { skb = list; list = list->next; hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; flags = hci_iso_flags_pack(list ? ISO_CONT : ISO_END, 0x00); hci_add_iso_hdr(skb, conn->handle, flags); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); __skb_queue_tail(queue, skb); } while (list); } bt_dev_dbg(hdev, "hcon %p queued %d", conn, skb_queue_len(queue)); } void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; BT_DBG("%s len %d", hdev->name, skb->len); hci_queue_iso(conn, &conn->data_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } /* ---- HCI TX task (outgoing data) ---- */ /* HCI Connection scheduler */ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) { struct hci_dev *hdev; int cnt, q; if (!conn) { *quote = 0; return; } hdev = conn->hdev; switch (conn->type) { case ACL_LINK: cnt = hdev->acl_cnt; break; case SCO_LINK: case ESCO_LINK: cnt = hdev->sco_cnt; break; case LE_LINK: cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; case CIS_LINK: case BIS_LINK: case PA_LINK: cnt = hdev->iso_cnt; break; default: cnt = 0; bt_dev_err(hdev, "unknown link type %d", conn->type); } q = cnt / num; *quote = q ? q : 1; } static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn = NULL, *c; unsigned int num = 0, min = ~0; /* We don't have to lock device here. Connections are always * added and removed with TX task disabled. */ rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type != type || skb_queue_empty(&c->data_q)) continue; bt_dev_dbg(hdev, "hcon %p state %s queued %d", c, state_to_string(c->state), skb_queue_len(&c->data_q)); if (c->state != BT_CONNECTED && c->state != BT_CONFIG) continue; num++; if (c->sent < min) { min = c->sent; conn = c; } if (hci_conn_num(hdev, type) == num) break; } rcu_read_unlock(); hci_quote_sent(conn, num, quote); BT_DBG("conn %p quote %d", conn, *quote); return conn; } static void hci_link_tx_to(struct hci_dev *hdev, __u8 type) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; bt_dev_err(hdev, "link tx timeout"); hci_dev_lock(hdev); /* Kill stalled connections */ list_for_each_entry(c, &h->list, list) { if (c->type == type && c->sent) { bt_dev_err(hdev, "killing stalled connection %pMR", &c->dst); hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM); } } hci_dev_unlock(hdev); } static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type, int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_chan *chan = NULL; unsigned int num = 0, min = ~0, cur_prio = 0; struct hci_conn *conn; int conn_num = 0; BT_DBG("%s", hdev->name); rcu_read_lock(); list_for_each_entry_rcu(conn, &h->list, list) { struct hci_chan *tmp; if (conn->type != type) continue; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) continue; conn_num++; list_for_each_entry_rcu(tmp, &conn->chan_list, list) { struct sk_buff *skb; if (skb_queue_empty(&tmp->data_q)) continue; skb = skb_peek(&tmp->data_q); if (skb->priority < cur_prio) continue; if (skb->priority > cur_prio) { num = 0; min = ~0; cur_prio = skb->priority; } num++; if (conn->sent < min) { min = conn->sent; chan = tmp; } } if (hci_conn_num(hdev, type) == conn_num) break; } rcu_read_unlock(); if (!chan) return NULL; hci_quote_sent(chan->conn, num, quote); BT_DBG("chan %p quote %d", chan, *quote); return chan; } static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn; int num = 0; BT_DBG("%s", hdev->name); rcu_read_lock(); list_for_each_entry_rcu(conn, &h->list, list) { struct hci_chan *chan; if (conn->type != type) continue; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) continue; num++; list_for_each_entry_rcu(chan, &conn->chan_list, list) { struct sk_buff *skb; if (chan->sent) { chan->sent = 0; continue; } if (skb_queue_empty(&chan->data_q)) continue; skb = skb_peek(&chan->data_q); if (skb->priority >= HCI_PRIO_MAX - 1) continue; skb->priority = HCI_PRIO_MAX - 1; BT_DBG("chan %p skb %p promoted to %d", chan, skb, skb->priority); } if (hci_conn_num(hdev, type) == num) break; } rcu_read_unlock(); } static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) { unsigned long timeout; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return; switch (type) { case ACL_LINK: /* tx timeout must be longer than maximum link supervision * timeout (40.9 seconds) */ timeout = hdev->acl_last_tx + HCI_ACL_TX_TIMEOUT; break; case LE_LINK: /* tx timeout must be longer than maximum link supervision * timeout (40.9 seconds) */ timeout = hdev->le_last_tx + HCI_ACL_TX_TIMEOUT; break; case CIS_LINK: case BIS_LINK: case PA_LINK: /* tx timeout must be longer than the maximum transport latency * (8.388607 seconds) */ timeout = hdev->iso_last_tx + HCI_ISO_TX_TIMEOUT; break; default: return; } if (!cnt && time_after(jiffies, timeout)) hci_link_tx_to(hdev, type); } /* Schedule SCO */ static void hci_sched_sco(struct hci_dev *hdev, __u8 type) { struct hci_conn *conn; struct sk_buff *skb; int quote, *cnt; unsigned int pkts = hdev->sco_pkts; bt_dev_dbg(hdev, "type %u", type); if (!hci_conn_num(hdev, type) || !pkts) return; /* Use sco_pkts if flow control has not been enabled which will limit * the amount of buffer sent in a row. */ if (!hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) cnt = &pkts; else cnt = &hdev->sco_cnt; while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); conn->sent++; if (conn->sent == ~0) conn->sent = 0; (*cnt)--; } } /* Rescheduled if all packets were sent and flow control is not enabled * as there could be more packets queued that could not be sent and * since no HCI_EV_NUM_COMP_PKTS event will be generated the reschedule * needs to be forced. */ if (!pkts && !hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) queue_work(hdev->workqueue, &hdev->tx_work); } static void hci_sched_acl_pkt(struct hci_dev *hdev) { unsigned int cnt = hdev->acl_cnt; struct hci_chan *chan; struct sk_buff *skb; int quote; __check_timeout(hdev, cnt, ACL_LINK); while (hdev->acl_cnt && (chan = hci_chan_sent(hdev, ACL_LINK, &quote))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote-- && (skb = skb_peek(&chan->data_q))) { BT_DBG("chan %p skb %p len %d priority %u", chan, skb, skb->len, skb->priority); /* Stop if priority has changed */ if (skb->priority < priority) break; skb = skb_dequeue(&chan->data_q); hci_conn_enter_active_mode(chan->conn, bt_cb(skb)->force_active); hci_send_conn_frame(hdev, chan->conn, skb); hdev->acl_last_tx = jiffies; hdev->acl_cnt--; chan->sent++; chan->conn->sent++; /* Send pending SCO packets right away */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); } } if (cnt != hdev->acl_cnt) hci_prio_recalculate(hdev, ACL_LINK); } static void hci_sched_acl(struct hci_dev *hdev) { BT_DBG("%s", hdev->name); /* No ACL link over BR/EDR controller */ if (!hci_conn_num(hdev, ACL_LINK)) return; hci_sched_acl_pkt(hdev); } static void hci_sched_le(struct hci_dev *hdev) { struct hci_chan *chan; struct sk_buff *skb; int quote, *cnt, tmp; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, LE_LINK)) return; cnt = hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt; __check_timeout(hdev, *cnt, LE_LINK); tmp = *cnt; while (*cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote-- && (skb = skb_peek(&chan->data_q))) { BT_DBG("chan %p skb %p len %d priority %u", chan, skb, skb->len, skb->priority); /* Stop if priority has changed */ if (skb->priority < priority) break; skb = skb_dequeue(&chan->data_q); hci_send_conn_frame(hdev, chan->conn, skb); hdev->le_last_tx = jiffies; (*cnt)--; chan->sent++; chan->conn->sent++; /* Send pending SCO packets right away */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); } } if (*cnt != tmp) hci_prio_recalculate(hdev, LE_LINK); } /* Schedule iso */ static void hci_sched_iso(struct hci_dev *hdev, __u8 type) { struct hci_conn *conn; struct sk_buff *skb; int quote, *cnt; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, type)) return; cnt = &hdev->iso_cnt; __check_timeout(hdev, *cnt, type); while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); hdev->iso_last_tx = jiffies; conn->sent++; if (conn->sent == ~0) conn->sent = 0; (*cnt)--; } } } static void hci_tx_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, tx_work); struct sk_buff *skb; BT_DBG("%s acl %d sco %d le %d iso %d", hdev->name, hdev->acl_cnt, hdev->sco_cnt, hdev->le_cnt, hdev->iso_cnt); if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { /* Schedule queues and send stuff to HCI driver */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); hci_sched_iso(hdev, CIS_LINK); hci_sched_iso(hdev, BIS_LINK); hci_sched_iso(hdev, PA_LINK); hci_sched_acl(hdev); hci_sched_le(hdev); } /* Send next queued raw (unknown type) packet */ while ((skb = skb_dequeue(&hdev->raw_q))) hci_send_frame(hdev, skb); } /* ----- HCI RX task (incoming data processing) ----- */ /* ACL data packet */ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_acl_hdr *hdr; __u16 handle, flags; int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ACL packet too small"); kfree_skb(skb); return; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); hdev->stat.acl_rx++; err = l2cap_recv_acldata(hdev, handle, skb, flags); if (err == -ENOENT) bt_dev_err(hdev, "ACL packet for unknown connection handle %d", handle); else if (err) bt_dev_dbg(hdev, "ACL packet recv for handle %d failed: %d", handle, err); } /* SCO data packet */ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_sco_hdr *hdr; __u16 handle, flags; int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "SCO packet too small"); kfree_skb(skb); return; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); hdev->stat.sco_rx++; hci_skb_pkt_status(skb) = flags & 0x03; err = sco_recv_scodata(hdev, handle, skb); if (err == -ENOENT) bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d", handle); else if (err) bt_dev_dbg(hdev, "SCO packet recv for handle %d failed: %d", handle, err); } static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_iso_hdr *hdr; __u16 handle, flags; int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ISO packet too small"); kfree_skb(skb); return; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); err = iso_recv(hdev, handle, skb, flags); if (err == -ENOENT) bt_dev_err(hdev, "ISO packet for unknown connection handle %d", handle); else if (err) bt_dev_dbg(hdev, "ISO packet recv for handle %d failed: %d", handle, err); } static bool hci_req_is_complete(struct hci_dev *hdev) { struct sk_buff *skb; skb = skb_peek(&hdev->cmd_q); if (!skb) return true; return (bt_cb(skb)->hci.req_flags & HCI_REQ_START); } static void hci_resend_last(struct hci_dev *hdev) { struct hci_command_hdr *sent; struct sk_buff *skb; u16 opcode; if (!hdev->sent_cmd) return; sent = (void *) hdev->sent_cmd->data; opcode = __le16_to_cpu(sent->opcode); if (opcode == HCI_OP_RESET) return; skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); if (!skb) return; skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); } void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, hci_req_complete_t *req_complete, hci_req_complete_skb_t *req_complete_skb) { struct sk_buff *skb; unsigned long flags; BT_DBG("opcode 0x%04x status 0x%02x", opcode, status); /* If the completed command doesn't match the last one that was * sent we need to do special handling of it. */ if (!hci_sent_cmd_data(hdev, opcode)) { /* Some CSR based controllers generate a spontaneous * reset complete event during init and any pending * command will never be completed. In such a case we * need to resend whatever was the last sent * command. */ if (test_bit(HCI_INIT, &hdev->flags) && opcode == HCI_OP_RESET) hci_resend_last(hdev); return; } /* If we reach this point this event matches the last command sent */ hci_dev_clear_flag(hdev, HCI_CMD_PENDING); /* If the command succeeded and there's still more commands in * this request the request is not yet complete. */ if (!status && !hci_req_is_complete(hdev)) return; skb = hdev->req_skb; /* If this was the last command in a request the complete * callback would be found in hdev->req_skb instead of the * command queue (hdev->cmd_q). */ if (skb && bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) { *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; return; } if (skb && bt_cb(skb)->hci.req_complete) { *req_complete = bt_cb(skb)->hci.req_complete; return; } /* Remove all pending commands belonging to this request */ spin_lock_irqsave(&hdev->cmd_q.lock, flags); while ((skb = __skb_dequeue(&hdev->cmd_q))) { if (bt_cb(skb)->hci.req_flags & HCI_REQ_START) { __skb_queue_head(&hdev->cmd_q, skb); break; } if (bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; else *req_complete = bt_cb(skb)->hci.req_complete; dev_kfree_skb_irq(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); } static void hci_rx_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, rx_work); struct sk_buff *skb; BT_DBG("%s", hdev->name); /* The kcov_remote functions used for collecting packet parsing * coverage information from this background thread and associate * the coverage with the syscall's thread which originally injected * the packet. This helps fuzzing the kernel. */ for (; (skb = skb_dequeue(&hdev->rx_q)); kcov_remote_stop()) { kcov_remote_start_common(skb_get_kcov_handle(skb)); /* Send copy to monitor */ hci_send_to_monitor(hdev, skb); if (atomic_read(&hdev->promisc)) { /* Send copy to the sockets */ hci_send_to_sock(hdev, skb); } /* If the device has been opened in HCI_USER_CHANNEL, * the userspace has exclusive access to device. * When device is HCI_INIT, we still need to process * the data packets to the driver in order * to complete its setup(). */ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !test_bit(HCI_INIT, &hdev->flags)) { kfree_skb(skb); continue; } if (test_bit(HCI_INIT, &hdev->flags)) { /* Don't process data packets in this states. */ switch (hci_skb_pkt_type(skb)) { case HCI_ACLDATA_PKT: case HCI_SCODATA_PKT: case HCI_ISODATA_PKT: kfree_skb(skb); continue; } } /* Process frame */ switch (hci_skb_pkt_type(skb)) { case HCI_EVENT_PKT: BT_DBG("%s Event packet", hdev->name); hci_event_packet(hdev, skb); break; case HCI_ACLDATA_PKT: BT_DBG("%s ACL data packet", hdev->name); hci_acldata_packet(hdev, skb); break; case HCI_SCODATA_PKT: BT_DBG("%s SCO data packet", hdev->name); hci_scodata_packet(hdev, skb); break; case HCI_ISODATA_PKT: BT_DBG("%s ISO data packet", hdev->name); hci_isodata_packet(hdev, skb); break; default: kfree_skb(skb); break; } } } static int hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) { int err; bt_dev_dbg(hdev, "skb %p", skb); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = skb_clone(skb, GFP_KERNEL); if (!hdev->sent_cmd) { skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); return -EINVAL; } if (hci_skb_opcode(skb) != HCI_OP_NOP) { err = hci_send_frame(hdev, skb); if (err < 0) { hci_cmd_sync_cancel_sync(hdev, -err); return err; } atomic_dec(&hdev->cmd_cnt); } else { err = -ENODATA; kfree_skb(skb); } if (hdev->req_status == HCI_REQ_PEND && !hci_dev_test_and_set_flag(hdev, HCI_CMD_PENDING)) { kfree_skb(hdev->req_skb); hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); } return err; } static void hci_cmd_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work); struct sk_buff *skb; int err; BT_DBG("%s cmd_cnt %d cmd queued %d", hdev->name, atomic_read(&hdev->cmd_cnt), skb_queue_len(&hdev->cmd_q)); /* Send queued commands */ if (atomic_read(&hdev->cmd_cnt)) { skb = skb_dequeue(&hdev->cmd_q); if (!skb) return; err = hci_send_cmd_sync(hdev, skb); if (err) return; rcu_read_lock(); if (test_bit(HCI_RESET, &hdev->flags) || hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) cancel_delayed_work(&hdev->cmd_timer); else queue_delayed_work(hdev->workqueue, &hdev->cmd_timer, HCI_CMD_TIMEOUT); rcu_read_unlock(); } }
44 23 21 4 4 4 40 21 19 19 5 21 1 1 1 1 1 6 6 6 6 6 20 3 1 1 1 3 3 3 1 3 3 27 27 26 7 5 12 12 67 7 8 42 61 2 61 61 62 1 61 61 61 61 61 11 2 8 1 2 71 3 2 1 66 1 7 62 62 2 1 1 4 1 3 4 8 16 25 3 16 3 154 1 73 2 40 1 10 18 1 93 1 62 29 2 40 39 1 1 27 1 1 1 15 15 3 18 17 18 16 1 1 2 1 1 3 1 2 15 4 4 1 3 1 2 1 6 3 5 4438 4441 1541 11 2 2 30 1 1 1 28 535 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 // SPDX-License-Identifier: GPL-2.0 /* XDP sockets * * AF_XDP sockets allows a channel between XDP programs and userspace * applications. * Copyright(c) 2018 Intel Corporation. * * Author(s): Björn Töpel <bjorn.topel@intel.com> * Magnus Karlsson <magnus.karlsson@intel.com> */ #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ #include <linux/if_xdp.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/socket.h> #include <linux/file.h> #include <linux/uaccess.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/vmalloc.h> #include <net/xdp_sock_drv.h> #include <net/busy_poll.h> #include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/xdp.h> #include "xsk_queue.h" #include "xdp_umem.h" #include "xsk.h" #define TX_BATCH_SIZE 32 #define MAX_PER_SOCKET_BUDGET 32 struct xsk_addrs { u32 num_descs; u64 addrs[MAX_SKB_FRAGS + 1]; }; static struct kmem_cache *xsk_tx_generic_cache; void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) return; pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP; pool->cached_need_wakeup |= XDP_WAKEUP_RX; } EXPORT_SYMBOL(xsk_set_rx_need_wakeup); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) { struct xdp_sock *xs; if (pool->cached_need_wakeup & XDP_WAKEUP_TX) return; rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); pool->cached_need_wakeup |= XDP_WAKEUP_TX; } EXPORT_SYMBOL(xsk_set_tx_need_wakeup); void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) { if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX)) return; pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; pool->cached_need_wakeup &= ~XDP_WAKEUP_RX; } EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) { struct xdp_sock *xs; if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX)) return; rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); pool->cached_need_wakeup &= ~XDP_WAKEUP_TX; } EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) { return pool->uses_need_wakeup; } EXPORT_SYMBOL(xsk_uses_need_wakeup); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { if (queue_id < dev->real_num_rx_queues) return dev->_rx[queue_id].pool; if (queue_id < dev->real_num_tx_queues) return dev->_tx[queue_id].pool; return NULL; } EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { if (queue_id < dev->num_rx_queues) dev->_rx[queue_id].pool = NULL; if (queue_id < dev->num_tx_queues) dev->_tx[queue_id].pool = NULL; } /* The buffer pool is stored both in the _rx struct and the _tx struct as we do * not know if the device has more tx queues than rx, or the opposite. * This might also change during run time. */ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id) { if (queue_id >= max_t(unsigned int, dev->real_num_rx_queues, dev->real_num_tx_queues)) return -EINVAL; if (queue_id < dev->real_num_rx_queues) dev->_rx[queue_id].pool = pool; if (queue_id < dev->real_num_tx_queues) dev->_tx[queue_id].pool = pool; return 0; } static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, u32 flags) { u64 addr; int err; addr = xp_get_handle(xskb, xskb->pool); err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; return err; } xp_release(xskb); return 0; } static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); u32 frags = xdp_buff_has_frags(xdp); struct xdp_buff_xsk *pos, *tmp; struct list_head *xskb_list; u32 contd = 0; int err; if (frags) contd = XDP_PKT_CONTD; err = __xsk_rcv_zc(xs, xskb, len, contd); if (err) goto err; if (likely(!frags)) return 0; xskb_list = &xskb->pool->xskb_list; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; list_del(&pos->list_node); } return 0; err: xsk_buff_free(xdp); return err; } static void *xsk_copy_xdp_start(struct xdp_buff *from) { if (unlikely(xdp_data_meta_unsupported(from))) return from->data; else return from->data_meta; } static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, u32 *from_len, skb_frag_t **frag, u32 rem) { u32 copied = 0; while (1) { u32 copy_len = min_t(u32, *from_len, to_len); memcpy(to, *from, copy_len); copied += copy_len; if (rem == copied) return copied; if (*from_len == copy_len) { *from = skb_frag_address(*frag); *from_len = skb_frag_size((*frag)++); } else { *from += copy_len; *from_len -= copy_len; } if (to_len == copy_len) return copied; to_len -= copy_len; to += copy_len; } } static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; u32 from_len, meta_len, rem, num_desc; struct xdp_buff_xsk *xskb; struct xdp_buff *xsk_xdp; skb_frag_t *frag; from_len = xdp->data_end - copy_from; meta_len = xdp->data - copy_from; rem = len + meta_len; if (len <= frame_size && !xdp_buff_has_frags(xdp)) { int err; xsk_xdp = xsk_buff_alloc(xs->pool); if (!xsk_xdp) { xs->rx_dropped++; return -ENOMEM; } memcpy(xsk_xdp->data - meta_len, copy_from, rem); xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); err = __xsk_rcv_zc(xs, xskb, len, 0); if (err) { xsk_buff_free(xsk_xdp); return err; } return 0; } num_desc = (len - 1) / frame_size + 1; if (!xsk_buff_can_alloc(xs->pool, num_desc)) { xs->rx_dropped++; return -ENOMEM; } if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { xs->rx_queue_full++; return -ENOBUFS; } if (xdp_buff_has_frags(xdp)) { struct skb_shared_info *sinfo; sinfo = xdp_get_shared_info_from_buff(xdp); frag = &sinfo->frags[0]; } do { u32 to_len = frame_size + meta_len; u32 copied; xsk_xdp = xsk_buff_alloc(xs->pool); copy_to = xsk_xdp->data - meta_len; copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem); rem -= copied; xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0); meta_len = 0; } while (rem); return 0; } static bool xsk_tx_writeable(struct xdp_sock *xs) { if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) return false; return true; } static void __xsk_tx_release(struct xdp_sock *xs) { __xskq_cons_release(xs->tx); if (xsk_tx_writeable(xs)) xs->sk.sk_write_space(&xs->sk); } static bool xsk_is_bound(struct xdp_sock *xs) { if (READ_ONCE(xs->state) == XSK_BOUND) { /* Matches smp_wmb() in bind(). */ smp_rmb(); return true; } return false; } static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { xs->rx_dropped++; return -ENOSPC; } return 0; } static void xsk_flush(struct xdp_sock *xs) { xskq_prod_submit(xs->rx); __xskq_cons_release(xs->pool->fq); sock_def_readable(&xs->sk); } int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 len = xdp_get_buff_len(xdp); int err; err = xsk_rcv_check(xs, xdp, len); if (!err) { spin_lock_bh(&xs->pool->rx_lock); err = __xsk_rcv(xs, xdp, len); xsk_flush(xs); spin_unlock_bh(&xs->pool->rx_lock); } return err; } static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 len = xdp_get_buff_len(xdp); int err; err = xsk_rcv_check(xs, xdp, len); if (err) return err; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { len = xdp->data_end - xdp->data; return xsk_rcv_zc(xs, xdp, len); } err = __xsk_rcv(xs, xdp, len); if (!err) xdp_return_buff(xdp); return err; } int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { int err; err = xsk_rcv(xs, xdp); if (err) return err; if (!xs->flush_node.prev) { struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list(); list_add(&xs->flush_node, flush_list); } return 0; } void __xsk_map_flush(struct list_head *flush_list) { struct xdp_sock *xs, *tmp; list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { xsk_flush(xs); __list_del_clearprev(&xs->flush_node); } } void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { xskq_prod_submit_n(pool->cq, nb_entries); } EXPORT_SYMBOL(xsk_tx_completed); void xsk_tx_release(struct xsk_buff_pool *pool) { struct xdp_sock *xs; rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) __xsk_tx_release(xs); rcu_read_unlock(); } EXPORT_SYMBOL(xsk_tx_release); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { bool budget_exhausted = false; struct xdp_sock *xs; rcu_read_lock(); again: list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) { budget_exhausted = true; continue; } if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { if (xskq_has_descs(xs->tx)) xskq_cons_release(xs->tx); continue; } xs->tx_budget_spent++; /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ if (xskq_prod_reserve_addr(pool->cq, desc->addr)) goto out; xskq_cons_release(xs->tx); rcu_read_unlock(); return true; } if (budget_exhausted) { list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) xs->tx_budget_spent = 0; budget_exhausted = false; goto again; } out: rcu_read_unlock(); return false; } EXPORT_SYMBOL(xsk_tx_peek_desc); static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries) { struct xdp_desc *descs = pool->tx_descs; u32 nb_pkts = 0; while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) nb_pkts++; xsk_tx_release(pool); return nb_pkts; } u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts) { struct xdp_sock *xs; rcu_read_lock(); if (!list_is_singular(&pool->xsk_tx_list)) { /* Fallback to the non-batched version */ rcu_read_unlock(); return xsk_tx_peek_release_fallback(pool, nb_pkts); } xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); if (!xs) { nb_pkts = 0; goto out; } nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts); /* This is the backpressure mechanism for the Tx path. Try to * reserve space in the completion queue for all packets, but * if there are fewer slots available, just process that many * packets. This avoids having to implement any buffering in * the Tx path. */ nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts); if (!nb_pkts) goto out; nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts); if (!nb_pkts) { xs->tx->queue_empty_descs++; goto out; } __xskq_cons_release(xs->tx); xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts); xs->sk.sk_write_space(&xs->sk); out: rcu_read_unlock(); return nb_pkts; } EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); static int xsk_wakeup(struct xdp_sock *xs, u8 flags) { struct net_device *dev = xs->dev; return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) { unsigned long flags; int ret; spin_lock_irqsave(&pool->cq_lock, flags); ret = xskq_prod_reserve(pool->cq); spin_unlock_irqrestore(&pool->cq_lock, flags); return ret; } static bool xsk_skb_destructor_is_addr(struct sk_buff *skb) { return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL; } static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb) { return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); } static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) { skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); } static void xsk_inc_num_desc(struct sk_buff *skb) { struct xsk_addrs *xsk_addr; if (!xsk_skb_destructor_is_addr(skb)) { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; xsk_addr->num_descs++; } } static u32 xsk_get_num_desc(struct sk_buff *skb) { struct xsk_addrs *xsk_addr; if (xsk_skb_destructor_is_addr(skb)) return 1; xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; return xsk_addr->num_descs; } static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, struct sk_buff *skb) { u32 num_descs = xsk_get_num_desc(skb); struct xsk_addrs *xsk_addr; u32 descs_processed = 0; unsigned long flags; u32 idx, i; spin_lock_irqsave(&pool->cq_lock, flags); idx = xskq_get_prod(pool->cq); if (unlikely(num_descs > 1)) { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; for (i = 0; i < num_descs; i++) { xskq_prod_write_addr(pool->cq, idx + descs_processed, xsk_addr->addrs[i]); descs_processed++; } kmem_cache_free(xsk_tx_generic_cache, xsk_addr); } else { xskq_prod_write_addr(pool->cq, idx, xsk_skb_destructor_get_addr(skb)); descs_processed++; } xskq_prod_submit_n(pool->cq, descs_processed); spin_unlock_irqrestore(&pool->cq_lock, flags); } static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) { unsigned long flags; spin_lock_irqsave(&pool->cq_lock, flags); xskq_prod_cancel_n(pool->cq, n); spin_unlock_irqrestore(&pool->cq_lock, flags); } static void xsk_destruct_skb(struct sk_buff *skb) { struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; if (compl->tx_timestamp) { /* sw completion timestamp, not a real one */ *compl->tx_timestamp = ktime_get_tai_fast_ns(); } xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb); sock_wfree(skb); } static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, u64 addr) { skb->dev = xs->dev; skb->priority = READ_ONCE(xs->sk.sk_priority); skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; xsk_skb_destructor_set_addr(skb, addr); } static void xsk_consume_skb(struct sk_buff *skb) { struct xdp_sock *xs = xdp_sk(skb->sk); u32 num_descs = xsk_get_num_desc(skb); struct xsk_addrs *xsk_addr; if (unlikely(num_descs > 1)) { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; kmem_cache_free(xsk_tx_generic_cache, xsk_addr); } skb->destructor = sock_wfree; xsk_cq_cancel_locked(xs->pool, num_descs); /* Free skb without triggering the perf drop trace */ consume_skb(skb); xs->skb = NULL; } static void xsk_drop_skb(struct sk_buff *skb) { xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); xsk_consume_skb(skb); } static int xsk_skb_metadata(struct sk_buff *skb, void *buffer, struct xdp_desc *desc, struct xsk_buff_pool *pool, u32 hr) { struct xsk_tx_metadata *meta = NULL; if (unlikely(pool->tx_metadata_len == 0)) return -EINVAL; meta = buffer - pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) return -EINVAL; if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { if (unlikely(meta->request.csum_start + meta->request.csum_offset + sizeof(__sum16) > desc->len)) return -EINVAL; skb->csum_start = hr + meta->request.csum_start; skb->csum_offset = meta->request.csum_offset; skb->ip_summed = CHECKSUM_PARTIAL; if (unlikely(pool->tx_sw_csum)) { int err; err = skb_checksum_help(skb); if (err) return err; } } if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) skb->skb_mstamp_ns = meta->request.launch_time; xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); return 0; } static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; struct sk_buff *skb = xs->skb; struct page *page; void *buffer; int err, i; u64 addr; addr = desc->addr; buffer = xsk_buff_raw_get_data(pool, addr); if (!skb) { hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); if (unlikely(!skb)) return ERR_PTR(err); skb_reserve(skb, hr); xsk_skb_init_misc(skb, xs, desc->addr); if (desc->options & XDP_TX_METADATA) { err = xsk_skb_metadata(skb, buffer, desc, pool, hr); if (unlikely(err)) return ERR_PTR(err); } } else { struct xsk_addrs *xsk_addr; if (xsk_skb_destructor_is_addr(skb)) { xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); if (!xsk_addr) return ERR_PTR(-ENOMEM); xsk_addr->num_descs = 1; xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; } else { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; } /* in case of -EOVERFLOW that could happen below, * xsk_consume_skb() will release this node as whole skb * would be dropped, which implies freeing all list elements */ xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; } len = desc->len; ts = pool->unaligned ? len : pool->chunk_size; offset = offset_in_page(buffer); addr = buffer - pool->addrs; for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { if (unlikely(i >= MAX_SKB_FRAGS)) return ERR_PTR(-EOVERFLOW); page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); copy = min_t(u32, PAGE_SIZE - offset, len - copied); skb_fill_page_desc(skb, i, page, offset, copy); copied += copy; addr += copy; offset = 0; } skb->len += len; skb->data_len += len; skb->truesize += ts; refcount_add(ts, &xs->sk.sk_wmem_alloc); return skb; } static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { struct net_device *dev = xs->dev; struct sk_buff *skb = xs->skb; int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); skb = NULL; goto free_err; } } else { u32 hr, tr, len; void *buffer; buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); len = desc->len; if (!skb) { hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); tr = dev->needed_tailroom; skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); if (unlikely(!skb)) goto free_err; skb_reserve(skb, hr); skb_put(skb, len); err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) goto free_err; xsk_skb_init_misc(skb, xs, desc->addr); if (desc->options & XDP_TX_METADATA) { err = xsk_skb_metadata(skb, buffer, desc, xs->pool, hr); if (unlikely(err)) goto free_err; } } else { int nr_frags = skb_shinfo(skb)->nr_frags; struct xsk_addrs *xsk_addr; struct page *page; u8 *vaddr; if (xsk_skb_destructor_is_addr(skb)) { xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); if (!xsk_addr) { err = -ENOMEM; goto free_err; } xsk_addr->num_descs = 1; xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; } else { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; } if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { err = -EOVERFLOW; goto free_err; } page = alloc_page(xs->sk.sk_allocation); if (unlikely(!page)) { err = -EAGAIN; goto free_err; } vaddr = kmap_local_page(page); memcpy(vaddr, buffer, len); kunmap_local(vaddr); skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; } } xsk_inc_num_desc(skb); return skb; free_err: if (skb && !skb_shinfo(skb)->nr_frags) kfree_skb(skb); if (err == -EOVERFLOW) { /* Drop the packet */ xsk_inc_num_desc(xs->skb); xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } else { /* Let application retry */ xsk_cq_cancel_locked(xs->pool, 1); } return ERR_PTR(err); } static int __xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; u32 max_batch; int err = 0; mutex_lock(&xs->mutex); /* Since we dropped the RCU read lock, the socket state might have changed. */ if (unlikely(!xsk_is_bound(xs))) { err = -ENXIO; goto out; } if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; max_batch = READ_ONCE(xs->max_tx_budget); while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { if (max_batch-- == 0) { err = -EAGAIN; goto out; } /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ err = xsk_cq_reserve_locked(xs->pool); if (err) { err = -EAGAIN; goto out; } skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); if (err != -EOVERFLOW) goto out; err = 0; continue; } xskq_cons_release(xs->tx); if (xp_mb_desc(&desc)) { xs->skb = skb; continue; } err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); xsk_consume_skb(skb); err = -EAGAIN; goto out; } /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ err = -EBUSY; xs->skb = NULL; goto out; } sent_frame = true; xs->skb = NULL; } if (xskq_has_descs(xs->tx)) { if (xs->skb) xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } out: if (sent_frame) __xsk_tx_release(xs); mutex_unlock(&xs->mutex); return err; } static int xsk_generic_xmit(struct sock *sk) { int ret; /* Drop the RCU lock since the SKB path might sleep. */ rcu_read_unlock(); ret = __xsk_generic_xmit(sk); /* Reaquire RCU lock before going into common code. */ rcu_read_lock(); return ret; } static bool xsk_no_wakeup(struct sock *sk) { #ifdef CONFIG_NET_RX_BUSY_POLL /* Prefer busy-polling, skip the wakeup. */ return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && napi_id_valid(READ_ONCE(sk->sk_napi_id)); #else return false; #endif } static int xsk_check_common(struct xdp_sock *xs) { if (unlikely(!xsk_is_bound(xs))) return -ENXIO; if (unlikely(!(xs->dev->flags & IFF_UP))) return -ENETDOWN; return 0; } static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { bool need_wait = !(m->msg_flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct xsk_buff_pool *pool; int err; err = xsk_check_common(xs); if (err) return err; if (unlikely(need_wait)) return -EOPNOTSUPP; if (unlikely(!xs->tx)) return -ENOBUFS; if (sk_can_busy_loop(sk)) sk_busy_loop(sk, 1); /* only support non-blocking sockets */ if (xs->zc && xsk_no_wakeup(sk)) return 0; pool = xs->pool; if (pool->cached_need_wakeup & XDP_WAKEUP_TX) { if (xs->zc) return xsk_wakeup(xs, XDP_WAKEUP_TX); return xsk_generic_xmit(sk); } return 0; } static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret; rcu_read_lock(); ret = __xsk_sendmsg(sock, m, total_len); rcu_read_unlock(); return ret; } static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { bool need_wait = !(flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); int err; err = xsk_check_common(xs); if (err) return err; if (unlikely(!xs->rx)) return -ENOBUFS; if (unlikely(need_wait)) return -EOPNOTSUPP; if (sk_can_busy_loop(sk)) sk_busy_loop(sk, 1); /* only support non-blocking sockets */ if (xsk_no_wakeup(sk)) return 0; if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) return xsk_wakeup(xs, XDP_WAKEUP_RX); return 0; } static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { int ret; rcu_read_lock(); ret = __xsk_recvmsg(sock, m, len, flags); rcu_read_unlock(); return ret; } static __poll_t xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { __poll_t mask = 0; struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct xsk_buff_pool *pool; sock_poll_wait(file, sock, wait); rcu_read_lock(); if (xsk_check_common(xs)) goto out; pool = xs->pool; if (pool->cached_need_wakeup) { if (xs->zc) xsk_wakeup(xs, pool->cached_need_wakeup); else if (xs->tx) /* Poll needs to drive Tx also in copy mode */ xsk_generic_xmit(sk); } if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; if (xs->tx && xsk_tx_writeable(xs)) mask |= EPOLLOUT | EPOLLWRNORM; out: rcu_read_unlock(); return mask; } static int xsk_init_queue(u32 entries, struct xsk_queue **queue, bool umem_queue) { struct xsk_queue *q; if (entries == 0 || *queue || !is_power_of_2(entries)) return -EINVAL; q = xskq_create(entries, umem_queue); if (!q) return -ENOMEM; /* Make sure queue is ready before it can be seen by others */ smp_wmb(); WRITE_ONCE(*queue, q); return 0; } static void xsk_unbind_dev(struct xdp_sock *xs) { struct net_device *dev = xs->dev; if (xs->state != XSK_BOUND) return; WRITE_ONCE(xs->state, XSK_UNBOUND); /* Wait for driver to stop using the xdp socket. */ xp_del_xsk(xs->pool, xs); synchronize_net(); dev_put(dev); } static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, struct xdp_sock __rcu ***map_entry) { struct xsk_map *map = NULL; struct xsk_map_node *node; *map_entry = NULL; spin_lock_bh(&xs->map_list_lock); node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, node); if (node) { bpf_map_inc(&node->map->map); map = node->map; *map_entry = node->map_entry; } spin_unlock_bh(&xs->map_list_lock); return map; } static void xsk_delete_from_maps(struct xdp_sock *xs) { /* This function removes the current XDP socket from all the * maps it resides in. We need to take extra care here, due to * the two locks involved. Each map has a lock synchronizing * updates to the entries, and each socket has a lock that * synchronizes access to the list of maps (map_list). For * deadlock avoidance the locks need to be taken in the order * "map lock"->"socket map list lock". We start off by * accessing the socket map list, and take a reference to the * map to guarantee existence between the * xsk_get_map_list_entry() and xsk_map_try_sock_delete() * calls. Then we ask the map to remove the socket, which * tries to remove the socket from the map. Note that there * might be updates to the map between * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). */ struct xdp_sock __rcu **map_entry = NULL; struct xsk_map *map; while ((map = xsk_get_map_list_entry(xs, &map_entry))) { xsk_map_try_sock_delete(map, xs, map_entry); bpf_map_put(&map->map); } } static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net *net; if (!sk) return 0; net = sock_net(sk); if (xs->skb) xsk_drop_skb(xs->skb); mutex_lock(&net->xdp.lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->xdp.lock); sock_prot_inuse_add(net, sk->sk_prot, -1); xsk_delete_from_maps(xs); mutex_lock(&xs->mutex); xsk_unbind_dev(xs); mutex_unlock(&xs->mutex); xskq_destroy(xs->rx); xskq_destroy(xs->tx); xskq_destroy(xs->fq_tmp); xskq_destroy(xs->cq_tmp); sock_orphan(sk); sock->sk = NULL; sock_put(sk); return 0; } static struct socket *xsk_lookup_xsk_from_fd(int fd) { struct socket *sock; int err; sock = sockfd_lookup(fd, &err); if (!sock) return ERR_PTR(-ENOTSOCK); if (sock->sk->sk_family != PF_XDP) { sockfd_put(sock); return ERR_PTR(-ENOPROTOOPT); } return sock; } static bool xsk_validate_queues(struct xdp_sock *xs) { return xs->fq_tmp && xs->cq_tmp; } static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; int bound_dev_if; u32 flags, qid; int err = 0; if (addr_len < sizeof(struct sockaddr_xdp)) return -EINVAL; if (sxdp->sxdp_family != AF_XDP) return -EINVAL; flags = sxdp->sxdp_flags; if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | XDP_USE_NEED_WAKEUP | XDP_USE_SG)) return -EINVAL; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex) return -EINVAL; rtnl_lock(); mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { err = -EBUSY; goto out_release; } dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); if (!dev) { err = -ENODEV; goto out_release; } netdev_lock_ops(dev); if (!xs->rx && !xs->tx) { err = -EINVAL; goto out_unlock; } qid = sxdp->sxdp_queue_id; if (flags & XDP_SHARED_UMEM) { struct xdp_sock *umem_xs; struct socket *sock; if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { /* Cannot specify flags for shared sockets. */ err = -EINVAL; goto out_unlock; } if (xs->umem) { /* We have already our own. */ err = -EINVAL; goto out_unlock; } sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); if (IS_ERR(sock)) { err = PTR_ERR(sock); goto out_unlock; } umem_xs = xdp_sk(sock->sk); if (!xsk_is_bound(umem_xs)) { err = -EBADF; sockfd_put(sock); goto out_unlock; } if (umem_xs->queue_id != qid || umem_xs->dev != dev) { /* Share the umem with another socket on another qid * and/or device. */ xs->pool = xp_create_and_assign_umem(xs, umem_xs->umem); if (!xs->pool) { err = -ENOMEM; sockfd_put(sock); goto out_unlock; } err = xp_assign_dev_shared(xs->pool, umem_xs, dev, qid); if (err) { xp_destroy(xs->pool); xs->pool = NULL; sockfd_put(sock); goto out_unlock; } } else { /* Share the buffer pool with the other socket. */ if (xs->fq_tmp || xs->cq_tmp) { /* Do not allow setting your own fq or cq. */ err = -EINVAL; sockfd_put(sock); goto out_unlock; } xp_get_pool(umem_xs->pool); xs->pool = umem_xs->pool; /* If underlying shared umem was created without Tx * ring, allocate Tx descs array that Tx batching API * utilizes */ if (xs->tx && !xs->pool->tx_descs) { err = xp_alloc_tx_descs(xs->pool, xs); if (err) { xp_put_pool(xs->pool); xs->pool = NULL; sockfd_put(sock); goto out_unlock; } } } xdp_get_umem(umem_xs->umem); WRITE_ONCE(xs->umem, umem_xs->umem); sockfd_put(sock); } else if (!xs->umem || !xsk_validate_queues(xs)) { err = -EINVAL; goto out_unlock; } else { /* This xsk has its own umem. */ xs->pool = xp_create_and_assign_umem(xs, xs->umem); if (!xs->pool) { err = -ENOMEM; goto out_unlock; } err = xp_assign_dev(xs->pool, dev, qid, flags); if (err) { xp_destroy(xs->pool); xs->pool = NULL; goto out_unlock; } } /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ xs->fq_tmp = NULL; xs->cq_tmp = NULL; xs->dev = dev; xs->zc = xs->umem->zc; xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG); xs->queue_id = qid; xp_add_xsk(xs->pool, xs); if (qid < dev->real_num_rx_queues) { struct netdev_rx_queue *rxq; rxq = __netif_get_rx_queue(dev, qid); if (rxq->napi) __sk_mark_napi_id_once(sk, rxq->napi->napi_id); } out_unlock: if (err) { dev_put(dev); } else { /* Matches smp_rmb() in bind() for shared umem * sockets, and xsk_is_bound(). */ smp_wmb(); WRITE_ONCE(xs->state, XSK_BOUND); } netdev_unlock_ops(dev); out_release: mutex_unlock(&xs->mutex); rtnl_unlock(); return err; } struct xdp_umem_reg_v1 { __u64 addr; /* Start of packet data area */ __u64 len; /* Length of packet data area */ __u32 chunk_size; __u32 headroom; }; static int xsk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); int err; if (level != SOL_XDP) return -ENOPROTOOPT; switch (optname) { case XDP_RX_RING: case XDP_TX_RING: { struct xsk_queue **q; int entries; if (optlen < sizeof(entries)) return -EINVAL; if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { mutex_unlock(&xs->mutex); return -EBUSY; } q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; err = xsk_init_queue(entries, q, false); if (!err && optname == XDP_TX_RING) /* Tx needs to be explicitly woken up the first time */ xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; mutex_unlock(&xs->mutex); return err; } case XDP_UMEM_REG: { size_t mr_size = sizeof(struct xdp_umem_reg); struct xdp_umem_reg mr = {}; struct xdp_umem *umem; if (optlen < sizeof(struct xdp_umem_reg_v1)) return -EINVAL; else if (optlen < sizeof(mr)) mr_size = sizeof(struct xdp_umem_reg_v1); BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg)); /* Make sure the last field of the struct doesn't have * uninitialized padding. All padding has to be explicit * and has to be set to zero by the userspace to make * struct xdp_umem_reg extensible in the future. */ BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) + sizeof_field(struct xdp_umem_reg, tx_metadata_len) != sizeof(struct xdp_umem_reg)); if (copy_from_sockptr(&mr, optval, mr_size)) return -EFAULT; mutex_lock(&xs->mutex); if (xs->state != XSK_READY || xs->umem) { mutex_unlock(&xs->mutex); return -EBUSY; } umem = xdp_umem_create(&mr); if (IS_ERR(umem)) { mutex_unlock(&xs->mutex); return PTR_ERR(umem); } /* Make sure umem is ready before it can be seen by others */ smp_wmb(); WRITE_ONCE(xs->umem, umem); mutex_unlock(&xs->mutex); return 0; } case XDP_UMEM_FILL_RING: case XDP_UMEM_COMPLETION_RING: { struct xsk_queue **q; int entries; if (optlen < sizeof(entries)) return -EINVAL; if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { mutex_unlock(&xs->mutex); return -EBUSY; } q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp : &xs->cq_tmp; err = xsk_init_queue(entries, q, true); mutex_unlock(&xs->mutex); return err; } case XDP_MAX_TX_SKB_BUDGET: { unsigned int budget; if (optlen != sizeof(budget)) return -EINVAL; if (copy_from_sockptr(&budget, optval, sizeof(budget))) return -EFAULT; if (!xs->tx || budget < TX_BATCH_SIZE || budget > xs->tx->nentries) return -EACCES; WRITE_ONCE(xs->max_tx_budget, budget); return 0; } default: break; } return -ENOPROTOOPT; } static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) { ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); ring->desc = offsetof(struct xdp_rxtx_ring, desc); } static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) { ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); ring->desc = offsetof(struct xdp_umem_ring, desc); } struct xdp_statistics_v1 { __u64 rx_dropped; __u64 rx_invalid_descs; __u64 tx_invalid_descs; }; static int xsk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); int len; if (level != SOL_XDP) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case XDP_STATISTICS: { struct xdp_statistics stats = {}; bool extra_stats = true; size_t stats_size; if (len < sizeof(struct xdp_statistics_v1)) { return -EINVAL; } else if (len < sizeof(stats)) { extra_stats = false; stats_size = sizeof(struct xdp_statistics_v1); } else { stats_size = sizeof(stats); } mutex_lock(&xs->mutex); stats.rx_dropped = xs->rx_dropped; if (extra_stats) { stats.rx_ring_full = xs->rx_queue_full; stats.rx_fill_ring_empty_descs = xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); } else { stats.rx_dropped += xs->rx_queue_full; } stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); mutex_unlock(&xs->mutex); if (copy_to_user(optval, &stats, stats_size)) return -EFAULT; if (put_user(stats_size, optlen)) return -EFAULT; return 0; } case XDP_MMAP_OFFSETS: { struct xdp_mmap_offsets off; struct xdp_mmap_offsets_v1 off_v1; bool flags_supported = true; void *to_copy; if (len < sizeof(off_v1)) return -EINVAL; else if (len < sizeof(off)) flags_supported = false; if (flags_supported) { /* xdp_ring_offset is identical to xdp_ring_offset_v1 * except for the flags field added to the end. */ xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) &off.rx); xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) &off.tx); xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) &off.fr); xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) &off.cr); off.rx.flags = offsetof(struct xdp_rxtx_ring, ptrs.flags); off.tx.flags = offsetof(struct xdp_rxtx_ring, ptrs.flags); off.fr.flags = offsetof(struct xdp_umem_ring, ptrs.flags); off.cr.flags = offsetof(struct xdp_umem_ring, ptrs.flags); len = sizeof(off); to_copy = &off; } else { xsk_enter_rxtx_offsets(&off_v1.rx); xsk_enter_rxtx_offsets(&off_v1.tx); xsk_enter_umem_offsets(&off_v1.fr); xsk_enter_umem_offsets(&off_v1.cr); len = sizeof(off_v1); to_copy = &off_v1; } if (copy_to_user(optval, to_copy, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } case XDP_OPTIONS: { struct xdp_options opts = {}; if (len < sizeof(opts)) return -EINVAL; mutex_lock(&xs->mutex); if (xs->zc) opts.flags |= XDP_OPTIONS_ZEROCOPY; mutex_unlock(&xs->mutex); len = sizeof(opts); if (copy_to_user(optval, &opts, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } default: break; } return -EOPNOTSUPP; } static int xsk_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); int state = READ_ONCE(xs->state); struct xsk_queue *q = NULL; if (state != XSK_READY && state != XSK_BOUND) return -EBUSY; if (offset == XDP_PGOFF_RX_RING) { q = READ_ONCE(xs->rx); } else if (offset == XDP_PGOFF_TX_RING) { q = READ_ONCE(xs->tx); } else { /* Matches the smp_wmb() in XDP_UMEM_REG */ smp_rmb(); if (offset == XDP_UMEM_PGOFF_FILL_RING) q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) : READ_ONCE(xs->pool->fq); else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) : READ_ONCE(xs->pool->cq); } if (!q) return -EINVAL; /* Matches the smp_wmb() in xsk_init_queue */ smp_rmb(); if (size > q->ring_vmalloc_size) return -EINVAL; return remap_vmalloc_range(vma, q->ring, 0); } static int xsk_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct sock *sk; switch (msg) { case NETDEV_UNREGISTER: mutex_lock(&net->xdp.lock); sk_for_each(sk, &net->xdp.list) { struct xdp_sock *xs = xdp_sk(sk); mutex_lock(&xs->mutex); if (xs->dev == dev) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); xsk_unbind_dev(xs); /* Clear device references. */ xp_clear_dev(xs->pool); } mutex_unlock(&xs->mutex); } mutex_unlock(&net->xdp.lock); break; } return NOTIFY_DONE; } static struct proto xsk_proto = { .name = "XDP", .owner = THIS_MODULE, .obj_size = sizeof(struct xdp_sock), }; static const struct proto_ops xsk_proto_ops = { .family = PF_XDP, .owner = THIS_MODULE, .release = xsk_release, .bind = xsk_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = xsk_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = xsk_setsockopt, .getsockopt = xsk_getsockopt, .sendmsg = xsk_sendmsg, .recvmsg = xsk_recvmsg, .mmap = xsk_mmap, }; static void xsk_destruct(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); if (!sock_flag(sk, SOCK_DEAD)) return; if (!xp_put_pool(xs->pool)) xdp_put_umem(xs->umem, !xs->pool); } static int xsk_create(struct net *net, struct socket *sock, int protocol, int kern) { struct xdp_sock *xs; struct sock *sk; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (protocol) return -EPROTONOSUPPORT; sock->state = SS_UNCONNECTED; sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); if (!sk) return -ENOBUFS; sock->ops = &xsk_proto_ops; sock_init_data(sock, sk); sk->sk_family = PF_XDP; sk->sk_destruct = xsk_destruct; sock_set_flag(sk, SOCK_RCU_FREE); xs = xdp_sk(sk); xs->state = XSK_READY; xs->max_tx_budget = TX_BATCH_SIZE; mutex_init(&xs->mutex); INIT_LIST_HEAD(&xs->map_list); spin_lock_init(&xs->map_list_lock); mutex_lock(&net->xdp.lock); sk_add_node_rcu(sk, &net->xdp.list); mutex_unlock(&net->xdp.lock); sock_prot_inuse_add(net, &xsk_proto, 1); return 0; } static const struct net_proto_family xsk_family_ops = { .family = PF_XDP, .create = xsk_create, .owner = THIS_MODULE, }; static struct notifier_block xsk_netdev_notifier = { .notifier_call = xsk_notifier, }; static int __net_init xsk_net_init(struct net *net) { mutex_init(&net->xdp.lock); INIT_HLIST_HEAD(&net->xdp.list); return 0; } static void __net_exit xsk_net_exit(struct net *net) { WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); } static struct pernet_operations xsk_net_ops = { .init = xsk_net_init, .exit = xsk_net_exit, }; static int __init xsk_init(void) { int err; err = proto_register(&xsk_proto, 0 /* no slab */); if (err) goto out; err = sock_register(&xsk_family_ops); if (err) goto out_proto; err = register_pernet_subsys(&xsk_net_ops); if (err) goto out_sk; err = register_netdevice_notifier(&xsk_netdev_notifier); if (err) goto out_pernet; xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", sizeof(struct xsk_addrs), 0, SLAB_HWCACHE_ALIGN, NULL); if (!xsk_tx_generic_cache) { err = -ENOMEM; goto out_unreg_notif; } return 0; out_unreg_notif: unregister_netdevice_notifier(&xsk_netdev_notifier); out_pernet: unregister_pernet_subsys(&xsk_net_ops); out_sk: sock_unregister(PF_XDP); out_proto: proto_unregister(&xsk_proto); out: return err; } fs_initcall(xsk_init);
142 294 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOU_NAPI_H #define IOU_NAPI_H #include <linux/kernel.h> #include <linux/io_uring.h> #include <net/busy_poll.h> #ifdef CONFIG_NET_RX_BUSY_POLL void io_napi_init(struct io_ring_ctx *ctx); void io_napi_free(struct io_ring_ctx *ctx); int io_register_napi(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); static inline bool io_napi(struct io_ring_ctx *ctx) { return !list_empty(&ctx->napi_list); } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { if (!io_napi(ctx)) return; __io_napi_busy_loop(ctx, iowq); } /* * io_napi_add() - Add napi id to the busy poll list * @req: pointer to io_kiocb request * * Add the napi id of the socket to the napi busy poll list and hash table. */ static inline void io_napi_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct socket *sock; if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC) return; sock = sock_from_file(req->file); if (sock && sock->sk) __io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id)); } #else static inline void io_napi_init(struct io_ring_ctx *ctx) { } static inline void io_napi_free(struct io_ring_ctx *ctx) { } static inline int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { return -EOPNOTSUPP; } static inline int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) { return -EOPNOTSUPP; } static inline bool io_napi(struct io_ring_ctx *ctx) { return false; } static inline void io_napi_add(struct io_kiocb *req) { } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { } static inline int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) { return 0; } #endif /* CONFIG_NET_RX_BUSY_POLL */ #endif
2 2 38 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_GUE_H #define __NET_GUE_H /* Definitions for the GUE header, standard and private flags, lengths * of optional fields are below. * * Diagram of GUE header: * * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |Ver|C| Hlen | Proto/ctype | Standard flags |P| * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * ~ Fields (optional) ~ * | | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Private flags (optional, P bit is set) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * ~ Private fields (optional) ~ * | | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * C bit indicates control message when set, data message when unset. * For a control message, proto/ctype is interpreted as a type of * control message. For data messages, proto/ctype is the IP protocol * of the next header. * * P bit indicates private flags field is present. The private flags * may refer to options placed after this field. */ #include <asm/byteorder.h> #include <linux/types.h> struct guehdr { union { struct { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 hlen:5, control:1, version:2; #elif defined (__BIG_ENDIAN_BITFIELD) __u8 version:2, control:1, hlen:5; #else #error "Please fix <asm/byteorder.h>" #endif __u8 proto_ctype; __be16 flags; }; __be32 word; }; }; /* Standard flags in GUE header */ #define GUE_FLAG_PRIV htons(1<<0) /* Private flags are in options */ #define GUE_LEN_PRIV 4 #define GUE_FLAGS_ALL (GUE_FLAG_PRIV) /* Private flags in the private option extension */ #define GUE_PFLAG_REMCSUM htonl(1U << 31) #define GUE_PLEN_REMCSUM 4 #define GUE_PFLAGS_ALL (GUE_PFLAG_REMCSUM) /* Functions to compute options length corresponding to flags. * If we ever have a lot of flags this can be potentially be * converted to a more optimized algorithm (table lookup * for instance). */ static inline size_t guehdr_flags_len(__be16 flags) { return ((flags & GUE_FLAG_PRIV) ? GUE_LEN_PRIV : 0); } static inline size_t guehdr_priv_flags_len(__be32 flags) { return 0; } /* Validate standard and private flags. Returns non-zero (meaning invalid) * if there is an unknown standard or private flags, or the options length for * the flags exceeds the options length specific in hlen of the GUE header. */ static inline int validate_gue_flags(struct guehdr *guehdr, size_t optlen) { __be16 flags = guehdr->flags; size_t len; if (flags & ~GUE_FLAGS_ALL) return 1; len = guehdr_flags_len(flags); if (len > optlen) return 1; if (flags & GUE_FLAG_PRIV) { /* Private flags are last four bytes accounted in * guehdr_flags_len */ __be32 pflags = *(__be32 *)((void *)&guehdr[1] + len - GUE_LEN_PRIV); if (pflags & ~GUE_PFLAGS_ALL) return 1; len += guehdr_priv_flags_len(pflags); if (len > optlen) return 1; } return 0; } #endif
553 118 469 99 88 88 88 79 328 68 470 118 191 591 44 429 428 429 428 710 239 4320 9 17 40 149 150 23 4293 4292 4293 17 4291 553 469 552 4236 199 243 243 2 2 74 22 56 2 3763 6 132 3703 74 88 429 414 18 429 13 4 17 56 464 143 278 4202 4201 14 17 947 943 518 4187 4184 4190 597 598 103 51 51 32 14 42 3 3 112 44 84 106 105 3 103 106 106 106 101 5 2 154 42 112 112 112 37 106 19 598 3 3 3 3 17 6 6 30 30 30 12 12 12 214 11 9 214 216 214 170 44 9 213 214 17 17 17 17 88 88 88 88 99 99 99 99 99 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> */ #include <linux/mm.h> #include <linux/swap.h> #include <linux/bio-integrity.h> #include <linux/blkdev.h> #include <linux/uio.h> #include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/highmem.h> #include <linux/blk-crypto.h> #include <linux/xarray.h> #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" #define ALLOC_CACHE_THRESHOLD 16 #define ALLOC_CACHE_MAX 256 struct bio_alloc_cache { struct bio *free_list; struct bio *free_list_irq; unsigned int nr; unsigned int nr_irq; }; static struct biovec_slab { int nr_vecs; char *name; struct kmem_cache *slab; } bvec_slabs[] __read_mostly = { { .nr_vecs = 16, .name = "biovec-16" }, { .nr_vecs = 64, .name = "biovec-64" }, { .nr_vecs = 128, .name = "biovec-128" }, { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" }, }; static struct biovec_slab *biovec_slab(unsigned short nr_vecs) { switch (nr_vecs) { /* smaller bios use inline vecs */ case 5 ... 16: return &bvec_slabs[0]; case 17 ... 64: return &bvec_slabs[1]; case 65 ... 128: return &bvec_slabs[2]; case 129 ... BIO_MAX_VECS: return &bvec_slabs[3]; default: BUG(); return NULL; } } /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. */ struct bio_set fs_bio_set; EXPORT_SYMBOL(fs_bio_set); /* * Our slab pool management */ struct bio_slab { struct kmem_cache *slab; unsigned int slab_ref; unsigned int slab_size; char name[12]; }; static DEFINE_MUTEX(bio_slab_lock); static DEFINE_XARRAY(bio_slabs); static struct bio_slab *create_bio_slab(unsigned int size) { struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL); if (!bslab) return NULL; snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); bslab->slab = kmem_cache_create(bslab->name, size, ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL); if (!bslab->slab) goto fail_alloc_slab; bslab->slab_ref = 1; bslab->slab_size = size; if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) return bslab; kmem_cache_destroy(bslab->slab); fail_alloc_slab: kfree(bslab); return NULL; } static inline unsigned int bs_bio_slab_size(struct bio_set *bs) { return bs->front_pad + sizeof(struct bio) + bs->back_pad; } static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) { unsigned int size = bs_bio_slab_size(bs); struct bio_slab *bslab; mutex_lock(&bio_slab_lock); bslab = xa_load(&bio_slabs, size); if (bslab) bslab->slab_ref++; else bslab = create_bio_slab(size); mutex_unlock(&bio_slab_lock); if (bslab) return bslab->slab; return NULL; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; unsigned int slab_size = bs_bio_slab_size(bs); mutex_lock(&bio_slab_lock); bslab = xa_load(&bio_slabs, slab_size); if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; WARN_ON_ONCE(bslab->slab != bs->bio_slab); WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; xa_erase(&bio_slabs, slab_size); kmem_cache_destroy(bslab->slab); kfree(bslab); out: mutex_unlock(&bio_slab_lock); } void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { BUG_ON(nr_vecs > BIO_MAX_VECS); if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); else if (nr_vecs > BIO_INLINE_VECS) kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); } /* * Make the first allocation restricted and don't dump info on allocation * failures, since we'll fall back to the mempool in case of failure. */ static inline gfp_t bvec_alloc_gfp(gfp_t gfp) { return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask) { struct biovec_slab *bvs = biovec_slab(*nr_vecs); if (WARN_ON_ONCE(!bvs)) return NULL; /* * Upgrade the nr_vecs request to take full advantage of the allocation. * We also rely on this in the bvec_free path. */ *nr_vecs = bvs->nr_vecs; /* * Try a slab allocation first for all smaller allocations. If that * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. * The mempool is sized to handle up to BIO_MAX_VECS entries. */ if (*nr_vecs < BIO_MAX_VECS) { struct bio_vec *bvl; bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) return bvl; *nr_vecs = BIO_MAX_VECS; } return mempool_alloc(pool, gfp_mask); } void bio_uninit(struct bio *bio) { #ifdef CONFIG_BLK_CGROUP if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } #endif if (bio_integrity(bio)) bio_integrity_free(bio); bio_crypt_free_ctx(bio); } EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; void *p = bio; WARN_ON_ONCE(!bs); bio_uninit(bio); bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); mempool_free(p - bs->front_pad, &bs->bio_pool); } /* * Users of this function have their own bio allocation. Subsequently, * they must remember to pair any call to bio_init() with bio_uninit() * when IO has completed, or when the bio is released. */ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf) { bio->bi_next = NULL; bio->bi_bdev = bdev; bio->bi_opf = opf; bio->bi_flags = 0; bio->bi_ioprio = 0; bio->bi_write_hint = 0; bio->bi_write_stream = 0; bio->bi_status = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; bio->bi_iter.bi_idx = 0; bio->bi_iter.bi_bvec_done = 0; bio->bi_end_io = NULL; bio->bi_private = NULL; #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; bio->issue_time_ns = 0; if (bdev) bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST bio->bi_iocost_cost = 0; #endif #endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION bio->bi_crypt_context = NULL; #endif #ifdef CONFIG_BLK_DEV_INTEGRITY bio->bi_integrity = NULL; #endif bio->bi_vcnt = 0; atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); bio->bi_cookie = BLK_QC_T_NONE; bio->bi_max_vecs = max_vecs; bio->bi_io_vec = table; bio->bi_pool = NULL; } EXPORT_SYMBOL(bio_init); /** * bio_reset - reinitialize a bio * @bio: bio to reset * @bdev: block device to use the bio for * @opf: operation and flags for bio * * Description: * After calling bio_reset(), @bio will be in the same state as a freshly * allocated bio returned bio bio_alloc_bioset() - the only fields that are * preserved are the ones that are initialized by bio_alloc_bioset(). See * comment in struct bio. */ void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf) { bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); bio->bi_bdev = bdev; if (bio->bi_bdev) bio_associate_blkg(bio); bio->bi_opf = opf; } EXPORT_SYMBOL(bio_reset); static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; if (bio->bi_status && !parent->bi_status) parent->bi_status = bio->bi_status; bio_put(bio); return parent; } static void bio_chain_endio(struct bio *bio) { bio_endio(__bio_chain_endio(bio)); } /** * bio_chain - chain bio completions * @bio: the target bio * @parent: the parent bio of @bio * * The caller won't have a bi_end_io called when @bio completes - instead, * @parent's bi_end_io won't be called until both @parent and @bio have * completed; the chained bio will also be freed when it completes. * * The caller must not set bi_private or bi_end_io in @bio. */ void bio_chain(struct bio *bio, struct bio *parent) { BUG_ON(bio->bi_private || bio->bi_end_io); bio->bi_private = parent; bio->bi_end_io = bio_chain_endio; bio_inc_remaining(parent); } EXPORT_SYMBOL(bio_chain); /** * bio_chain_and_submit - submit a bio after chaining it to another one * @prev: bio to chain and submit * @new: bio to chain to * * If @prev is non-NULL, chain it to @new and submit it. * * Return: @new. */ struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new) { if (prev) { bio_chain(prev, new); submit_bio(prev); } return new; } struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) { return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp)); } EXPORT_SYMBOL_GPL(blk_next_bio); static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); struct bio *bio; while (1) { spin_lock(&bs->rescue_lock); bio = bio_list_pop(&bs->rescue_list); spin_unlock(&bs->rescue_lock); if (!bio) break; submit_bio_noacct(bio); } } static void punt_bios_to_rescuer(struct bio_set *bs) { struct bio_list punt, nopunt; struct bio *bio; if (WARN_ON_ONCE(!bs->rescue_workqueue)) return; /* * In order to guarantee forward progress we must punt only bios that * were allocated from this bio_set; otherwise, if there was a bio on * there for a stacking driver higher up in the stack, processing it * could require allocating bios from this bio_set, and doing that from * our own rescuer would be bad. * * Since bio lists are singly linked, pop them all instead of trying to * remove from the middle of the list: */ bio_list_init(&punt); bio_list_init(&nopunt); while ((bio = bio_list_pop(&current->bio_list[0]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); current->bio_list[0] = nopunt; bio_list_init(&nopunt); while ((bio = bio_list_pop(&current->bio_list[1]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); current->bio_list[1] = nopunt; spin_lock(&bs->rescue_lock); bio_list_merge(&bs->rescue_list, &punt); spin_unlock(&bs->rescue_lock); queue_work(bs->rescue_workqueue, &bs->rescue_work); } static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) { unsigned long flags; /* cache->free_list must be empty */ if (WARN_ON_ONCE(cache->free_list)) return; local_irq_save(flags); cache->free_list = cache->free_list_irq; cache->free_list_irq = NULL; cache->nr += cache->nr_irq; cache->nr_irq = 0; local_irq_restore(flags); } static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, struct bio_set *bs) { struct bio_alloc_cache *cache; struct bio *bio; cache = per_cpu_ptr(bs->cache, get_cpu()); if (!cache->free_list) { if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD) bio_alloc_irq_cache_splice(cache); if (!cache->free_list) { put_cpu(); return NULL; } } bio = cache->free_list; cache->free_list = bio->bi_next; cache->nr--; put_cpu(); if (nr_vecs) bio_init_inline(bio, bdev, nr_vecs, opf); else bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } /** * bio_alloc_bioset - allocate a bio for I/O * @bdev: block device to allocate the bio for (can be %NULL) * @nr_vecs: number of bvecs to pre-allocate * @opf: operation and flags for bio * @gfp_mask: the GFP_* mask given to the slab allocator * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. * * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to * allocate a bio. This is due to the mempool guarantees. To make this work, * callers must never allocate more than 1 bio at a time from the general pool. * Callers that need to allocate more than 1 bio must always submit the * previously allocated bio for IO before attempting to allocate a new one. * Failure to do so can cause deadlocks under memory pressure. * * Note that when running under submit_bio_noacct() (i.e. any block driver), * bios are not submitted until after you return - see the code in * submit_bio_noacct() that converts recursion into iteration, to prevent * stack overflows. * * This would normally mean allocating multiple bios under submit_bio_noacct() * would be susceptible to deadlocks, but we have * deadlock avoidance code that resubmits any blocked bios from a rescuer * thread. * * However, we do not guarantee forward progress for allocations from other * mempools. Doing multiple allocations from the same mempool under * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp_mask, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; struct bio *bio; void *p; /* should not use nobvec bioset for nr_vecs > 0 */ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; if (opf & REQ_ALLOC_CACHE) { if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, gfp_mask, bs); if (bio) return bio; /* * No cached bio available, bio returned below marked with * REQ_ALLOC_CACHE to particpate in per-cpu alloc cache. */ } else { opf &= ~REQ_ALLOC_CACHE; } } /* * submit_bio_noacct() converts recursion to iteration; this means if * we're running beneath it, any bios we allocate and submit will not be * submitted (and thus freed) until after we return. * * This exposes us to a potential deadlock if we allocate multiple bios * from the same bio_set() while running underneath submit_bio_noacct(). * If we were to allocate multiple bios (say a stacking block driver * that was splitting bios), we would deadlock if we exhausted the * mempool's reserve. * * We solve this, and guarantee forward progress, with a rescuer * workqueue per bio_set. If we go to allocate and there are bios on * current->bio_list, we first try the allocation without * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be * blocking to the rescuer workqueue before we retry with the original * gfp_flags. */ if (current->bio_list && (!bio_list_empty(&current->bio_list[0]) || !bio_list_empty(&current->bio_list[1])) && bs->rescue_workqueue) gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(&bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; p = mempool_alloc(&bs->bio_pool, gfp_mask); } if (unlikely(!p)) return NULL; if (!mempool_is_saturated(&bs->bio_pool)) opf &= ~REQ_ALLOC_CACHE; bio = p + bs->front_pad; if (nr_vecs > BIO_INLINE_VECS) { struct bio_vec *bvl = NULL; bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); } if (unlikely(!bvl)) goto err_free; bio_init(bio, bdev, bvl, nr_vecs, opf); } else if (nr_vecs) { bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); } else { bio_init(bio, bdev, NULL, 0, opf); } bio->bi_pool = bs; return bio; err_free: mempool_free(p, &bs->bio_pool); return NULL; } EXPORT_SYMBOL(bio_alloc_bioset); /** * bio_kmalloc - kmalloc a bio * @nr_vecs: number of bio_vecs to allocate * @gfp_mask: the GFP_* mask given to the slab allocator * * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized * using bio_init() before use. To free a bio returned from this function use * kfree() after calling bio_uninit(). A bio returned from this function can * be reused by calling bio_uninit() before calling bio_init() again. * * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this * function are not backed by a mempool can fail. Do not use this function * for allocations in the file system I/O path. * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec), gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { struct bio_vec bv; struct bvec_iter iter; __bio_for_each_segment(bv, bio, iter, start) memzero_bvec(&bv); } EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to small size of @new_size * @bio: the bio to be truncated * @new_size: new size for truncating the bio * * Description: * Truncate the bio to new size of @new_size. If bio_op(bio) is * REQ_OP_READ, zero the truncated part. This function should only * be used for handling corner cases, such as bio eod. */ static void bio_truncate(struct bio *bio, unsigned new_size) { struct bio_vec bv; struct bvec_iter iter; unsigned int done = 0; bool truncated = false; if (new_size >= bio->bi_iter.bi_size) return; if (bio_op(bio) != REQ_OP_READ) goto exit; bio_for_each_segment(bv, bio, iter) { if (done + bv.bv_len > new_size) { size_t offset; if (!truncated) offset = new_size - done; else offset = 0; memzero_page(bv.bv_page, bv.bv_offset + offset, bv.bv_len - offset); truncated = true; } done += bv.bv_len; } exit: /* * Don't touch bvec table here and make it really immutable, since * fs bio user has to retrieve all pages via bio_for_each_segment_all * in its .end_bio() callback. * * It is enough to truncate bio by updating .bi_size since we can make * correct bvec with the updated .bi_size for drivers. */ bio->bi_iter.bi_size = new_size; } /** * guard_bio_eod - truncate a BIO to fit the block device * @bio: bio to truncate * * This allows us to do IO even on the odd last sectors of a device, even if the * block size is some multiple of the physical sector size. * * We'll just truncate the bio to the size of the device, and clear the end of * the buffer head manually. Truly out-of-range accesses will turn into actual * I/O errors, this only handles the "we need to be able to do I/O at the final * sector" case. */ void guard_bio_eod(struct bio *bio) { sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; /* * If the *whole* IO is past the end of the device, * let it through, and the IO layer will turn it into * an EIO. */ if (unlikely(bio->bi_iter.bi_sector >= maxsector)) return; maxsector -= bio->bi_iter.bi_sector; if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; bio_truncate(bio, maxsector << 9); } static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int nr) { unsigned int i = 0; struct bio *bio; while ((bio = cache->free_list) != NULL) { cache->free_list = bio->bi_next; cache->nr--; bio_free(bio); if (++i == nr) break; } return i; } static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int nr) { nr -= __bio_alloc_cache_prune(cache, nr); if (!READ_ONCE(cache->free_list)) { bio_alloc_irq_cache_splice(cache); __bio_alloc_cache_prune(cache, nr); } } static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node) { struct bio_set *bs; bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead); if (bs->cache) { struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu); bio_alloc_cache_prune(cache, -1U); } return 0; } static void bio_alloc_cache_destroy(struct bio_set *bs) { int cpu; if (!bs->cache) return; cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); for_each_possible_cpu(cpu) { struct bio_alloc_cache *cache; cache = per_cpu_ptr(bs->cache, cpu); bio_alloc_cache_prune(cache, -1U); } free_percpu(bs->cache); bs->cache = NULL; } static inline void bio_put_percpu_cache(struct bio *bio) { struct bio_alloc_cache *cache; cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) goto out_free; if (in_task()) { bio_uninit(bio); bio->bi_next = cache->free_list; /* Not necessary but helps not to iopoll already freed bios */ bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; } else if (in_hardirq()) { lockdep_assert_irqs_disabled(); bio_uninit(bio); bio->bi_next = cache->free_list_irq; cache->free_list_irq = bio; cache->nr_irq++; } else { goto out_free; } put_cpu(); return; out_free: put_cpu(); bio_free(bio); } /** * bio_put - release a reference to a bio * @bio: bio to release reference to * * Description: * Put a reference to a &struct bio, either one you have gotten with * bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it. **/ void bio_put(struct bio *bio) { if (unlikely(bio_flagged(bio, BIO_REFFED))) { BUG_ON(!atomic_read(&bio->__bi_cnt)); if (!atomic_dec_and_test(&bio->__bi_cnt)) return; } if (bio->bi_opf & REQ_ALLOC_CACHE) bio_put_percpu_cache(bio); else bio_free(bio); } EXPORT_SYMBOL(bio_put); static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter = bio_src->bi_iter; if (bio->bi_bdev) { if (bio->bi_bdev == bio_src->bi_bdev && bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio_clone_blkg_association(bio, bio_src); } if (bio_crypt_clone(bio, bio_src, gfp) < 0) return -ENOMEM; if (bio_integrity(bio_src) && bio_integrity_clone(bio, bio_src, gfp) < 0) return -ENOMEM; return 0; } /** * bio_alloc_clone - clone a bio that shares the original bio's biovec * @bdev: block_device to clone onto * @bio_src: bio to clone from * @gfp: allocation priority * @bs: bio_set to allocate from * * Allocate a new bio that is a clone of @bio_src. The caller owns the returned * bio, but not the actual data it points to. * * The caller must ensure that the return bio is not freed before @bio_src. */ struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, gfp_t gfp, struct bio_set *bs) { struct bio *bio; bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs); if (!bio) return NULL; if (__bio_clone(bio, bio_src, gfp) < 0) { bio_put(bio); return NULL; } bio->bi_io_vec = bio_src->bi_io_vec; return bio; } EXPORT_SYMBOL(bio_alloc_clone); /** * bio_init_clone - clone a bio that shares the original bio's biovec * @bdev: block_device to clone onto * @bio: bio to clone into * @bio_src: bio to clone from * @gfp: allocation priority * * Initialize a new bio in caller provided memory that is a clone of @bio_src. * The caller owns the returned bio, but not the actual data it points to. * * The caller must ensure that @bio_src is not freed before @bio. */ int bio_init_clone(struct block_device *bdev, struct bio *bio, struct bio *bio_src, gfp_t gfp) { int ret; bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf); ret = __bio_clone(bio, bio_src, gfp); if (ret) bio_uninit(bio); return ret; } EXPORT_SYMBOL(bio_init_clone); /** * bio_full - check if the bio is full * @bio: bio to check * @len: length of one segment to be added * * Return true if @bio is full and one segment with @len bytes can't be * added to the bio, otherwise return false */ static inline bool bio_full(struct bio *bio, unsigned len) { if (bio->bi_vcnt >= bio->bi_max_vecs) return true; if (bio->bi_iter.bi_size > UINT_MAX - len) return true; return false; } static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, unsigned int len, unsigned int off) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; phys_addr_t page_addr = page_to_phys(page); if (vec_end_addr + 1 != page_addr + off) return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) { if (IS_ENABLED(CONFIG_KMSAN)) return false; if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) return false; } bv->bv_len += len; return true; } /* * Try to merge a page into a segment, while obeying the hardware segment * size limit. * * This is kept around for the integrity metadata, which is still tries * to build the initial bio to the hardware limit and doesn't have proper * helpers to split. Hopefully this will go away soon. */ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = bvec_phys(bv); phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) return false; if (len > queue_max_segment_size(q) - bv->bv_len) return false; return bvec_try_merge_page(bv, page, len, offset); } /** * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio * @page: start page to add * @len: length of the data to add, may cross pages * @off: offset of the data relative to @page, may cross pages * * Add the data at @page + @off to @bio as a new bvec. The caller must ensure * that @bio has space for another bvec. */ void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); if (is_pci_p2pdma_page(page)) bio->bi_opf |= REQ_NOMERGE; bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; } EXPORT_SYMBOL_GPL(__bio_add_page); /** * bio_add_virt_nofail - add data in the direct kernel mapping to a bio * @bio: destination bio * @vaddr: data to add * @len: length of the data to add, may cross pages * * Add the data at @vaddr to @bio. The caller must have ensure a segment * is available for the added data. No merging into an existing segment * will be performed. */ void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len) { __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr)); } EXPORT_SYMBOL_GPL(bio_add_virt_nofail); /** * bio_add_page - attempt to add page(s) to bio * @bio: destination bio * @page: start page to add * @len: vec entry length, may cross pages * @offset: vec entry offset relative to @page, may cross pages * * Attempt to add page(s) to the bio_vec maplist. This will only fail * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. */ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; if (bio->bi_iter.bi_size > UINT_MAX - len) return 0; if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) return 0; if (bvec_try_merge_page(bv, page, len, offset)) { bio->bi_iter.bi_size += len; return len; } } if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off) { unsigned long nr = off / PAGE_SIZE; WARN_ON_ONCE(len > UINT_MAX); __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } EXPORT_SYMBOL_GPL(bio_add_folio_nofail); /** * bio_add_folio - Attempt to add part of a folio to a bio. * @bio: BIO to add to. * @folio: Folio to add. * @len: How many bytes from the folio to add. * @off: First byte in this folio to add. * * Filesystems that use folios can call this function instead of calling * bio_add_page() for each page in the folio. If @off is bigger than * PAGE_SIZE, this function can create a bio_vec that starts in a page * after the bv_page. BIOs do not support folios that are 4GiB or larger. * * Return: Whether the addition was successful. */ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { unsigned long nr = off / PAGE_SIZE; if (len > UINT_MAX) return false; return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } EXPORT_SYMBOL(bio_add_folio); /** * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio * @bio: destination bio * @vaddr: vmalloc address to add * @len: total length in bytes of the data to add * * Add data starting at @vaddr to @bio and return how many bytes were added. * This may be less than the amount originally asked. Returns 0 if no data * could be added to @bio. * * This helper calls flush_kernel_vmap_range() for the range added. For reads * the caller still needs to manually call invalidate_kernel_vmap_range() in * the completion handler. */ unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len) { unsigned int offset = offset_in_page(vaddr); len = min(len, PAGE_SIZE - offset); if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len) return 0; if (op_is_write(bio_op(bio))) flush_kernel_vmap_range(vaddr, len); return len; } EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk); /** * bio_add_vmalloc - add a vmalloc region to a bio * @bio: destination bio * @vaddr: vmalloc address to add * @len: total length in bytes of the data to add * * Add data starting at @vaddr to @bio. Return %true on success or %false if * @bio does not have enough space for the payload. * * This helper calls flush_kernel_vmap_range() for the range added. For reads * the caller still needs to manually call invalidate_kernel_vmap_range() in * the completion handler. */ bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len) { do { unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len); if (!added) return false; vaddr += added; len -= added; } while (len); return true; } EXPORT_SYMBOL_GPL(bio_add_vmalloc); void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { size_t nr_pages; if (mark_dirty) { folio_lock(fi.folio); folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - fi.offset / PAGE_SIZE + 1; unpin_user_folio(fi.folio, nr_pages); } } EXPORT_SYMBOL_GPL(__bio_release_pages); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = iov_iter_count(iter); bio_set_flag(bio, BIO_CLONED); } static unsigned int get_contig_folio_len(unsigned int *num_pages, struct page **pages, unsigned int i, struct folio *folio, size_t left, size_t offset) { size_t bytes = left; size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes); unsigned int j; /* * We might COW a single page in the middle of * a large folio, so we have to check that all * pages belong to the same folio. */ bytes -= contig_sz; for (j = i + 1; j < i + *num_pages; j++) { size_t next = min_t(size_t, PAGE_SIZE, bytes); if (page_folio(pages[j]) != folio || pages[j] != pages[j - 1] + 1) { break; } contig_sz += next; bytes -= next; } *num_pages = j - i; return contig_sz; } #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be mapped * * Extracts pages from *iter and appends them to @bio's bvec array. The pages * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag. * For a multi-segment *iter, this function only adds pages from the next * non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { iov_iter_extraction_t extraction_flags = 0; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; ssize_t size; unsigned int num_pages, i = 0; size_t offset, folio_offset, left, len; int ret = 0; /* * Move page array up in the allocated memory for the bio vecs as far as * possible so that we can start filling biovecs from the beginning * without overwriting the temporary page array. */ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) extraction_flags |= ITER_ALLOW_P2PDMA; size = iov_iter_extract_pages(iter, &pages, UINT_MAX - bio->bi_iter.bi_size, nr_pages, extraction_flags, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); for (left = size, i = 0; left > 0; left -= len, i += num_pages) { struct page *page = pages[i]; struct folio *folio = page_folio(page); unsigned int old_vcnt = bio->bi_vcnt; folio_offset = ((size_t)folio_page_idx(folio, page) << PAGE_SHIFT) + offset; len = min(folio_size(folio) - folio_offset, left); num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); if (num_pages > 1) len = get_contig_folio_len(&num_pages, pages, i, folio, left, offset); if (!bio_add_folio(bio, folio, len, folio_offset)) { WARN_ON_ONCE(1); ret = -EINVAL; goto out; } if (bio_flagged(bio, BIO_PAGE_PINNED)) { /* * We're adding another fragment of a page that already * was part of the last segment. Undo our pin as the * page was pinned when an earlier fragment of it was * added to the bio and __bio_release_pages expects a * single pin per page. */ if (offset && bio->bi_vcnt == old_vcnt) unpin_user_folio(folio, 1); } offset = 0; } iov_iter_revert(iter, left); out: while (i < nr_pages) bio_release_page(bio, pages[i++]); return ret; } /* * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length * for the next iteration. */ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { size_t nbytes = bio->bi_iter.bi_size & len_align_mask; if (!nbytes) return 0; iov_iter_revert(iter, nbytes); bio->bi_iter.bi_size -= nbytes; do { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (nbytes < bv->bv_len) { bv->bv_len -= nbytes; break; } bio_release_page(bio, bv->bv_page); bio->bi_vcnt--; nbytes -= bv->bv_len; } while (nbytes); if (!bio->bi_vcnt) return -EFAULT; return 0; } /** * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added * @len_align_mask: the mask to align the total size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs * to ensure the bvecs and pages stay referenced until the submitted I/O is * completed by a call to ->ki_complete() or returns with an error other than * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF * on IO completion. If it isn't, then pages should be released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { int ret = 0; if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return -EIO; if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size); return 0; } if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); do { ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); if (bio->bi_vcnt) return bio_iov_iter_align_down(bio, iter, len_align_mask); return ret; } static void submit_bio_wait_endio(struct bio *bio) { complete(bio->bi_private); } /** * submit_bio_wait - submit a bio, and wait until it completes * @bio: The &struct bio which describes the I/O * * Simple wrapper around submit_bio(). Returns 0 on success, or the error from * bio_endio() on failure. * * WARNING: Unlike to how submit_bio() is usually used, this function does not * result in bio reference to be consumed. The caller must drop the reference * on his own. */ int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); blk_wait_io(&done); return blk_status_to_errno(bio->bi_status); } EXPORT_SYMBOL(submit_bio_wait); /** * bdev_rw_virt - synchronously read into / write from kernel mapping * @bdev: block device to access * @sector: sector to access * @data: data to read/write * @len: length in byte to read/write * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE) * * Performs synchronous I/O to @bdev for @data/@len. @data must be in * the kernel direct mapping and not a vmalloc address. */ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op) { struct bio_vec bv; struct bio bio; int error; if (WARN_ON_ONCE(is_vmalloc_addr(data))) return -EIO; bio_init(&bio, bdev, &bv, 1, op); bio.bi_iter.bi_sector = sector; bio_add_virt_nofail(&bio, data, len); error = submit_bio_wait(&bio); bio_uninit(&bio); return error; } EXPORT_SYMBOL_GPL(bdev_rw_virt); static void bio_wait_end_io(struct bio *bio) { complete(bio->bi_private); bio_put(bio); } /* * bio_await_chain - ends @bio and waits for every chained bio to complete */ void bio_await_chain(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = bio_wait_end_io; bio_endio(bio); blk_wait_io(&done); } void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); bio_crypt_advance(bio, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes); } EXPORT_SYMBOL(__bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { while (src_iter->bi_size && dst_iter->bi_size) { struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); void *src_buf = bvec_kmap_local(&src_bv); void *dst_buf = bvec_kmap_local(&dst_bv); memcpy(dst_buf, src_buf, bytes); kunmap_local(dst_buf); kunmap_local(src_buf); bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data_iter); /** * bio_copy_data - copy contents of data buffers from one bio to another * @src: source bio * @dst: destination bio * * Stops when it reaches the end of either @src or @dst - that is, copies * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). */ void bio_copy_data(struct bio *dst, struct bio *src) { struct bvec_iter src_iter = src->bi_iter; struct bvec_iter dst_iter = dst->bi_iter; bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } EXPORT_SYMBOL(bio_copy_data); void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) __free_page(bvec->bv_page); } EXPORT_SYMBOL(bio_free_pages); /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. * * The problem is that we cannot run folio_mark_dirty() from interrupt context * because the required locks are not interrupt-safe. So what we can do is to * mark the pages dirty _before_ performing IO. And in interrupt context, * check that the pages are still dirty. If so, fine. If not, redirty them * in process context. * * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. * But other code (eg, flusher threads) could clean the pages if they are mapped * pagecache. * * Simply disabling the call to bio_set_pages_dirty() is a good way to test the * deferred bio dirtying paths. */ /* * bio_set_pages_dirty() will mark all the bio's pages as dirty. */ void bio_set_pages_dirty(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { folio_lock(fi.folio); folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } } EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. * If they are, then fine. If, however, some pages are clean then they must * have been written out during the direct-IO read. So we take another ref on * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from * here on. It will unpin each page and will run one bio_put() against the * BIO. */ static void bio_dirty_fn(struct work_struct *work); static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); static DEFINE_SPINLOCK(bio_dirty_lock); static struct bio *bio_dirty_list; /* * This runs in process context */ static void bio_dirty_fn(struct work_struct *work) { struct bio *bio, *next; spin_lock_irq(&bio_dirty_lock); next = bio_dirty_list; bio_dirty_list = NULL; spin_unlock_irq(&bio_dirty_lock); while ((bio = next) != NULL) { next = bio->bi_private; bio_release_pages(bio, true); bio_put(bio); } } void bio_check_pages_dirty(struct bio *bio) { struct folio_iter fi; unsigned long flags; bio_for_each_folio_all(fi, bio) { if (!folio_test_dirty(fi.folio)) goto defer; } bio_release_pages(bio, false); bio_put(bio); return; defer: spin_lock_irqsave(&bio_dirty_lock, flags); bio->bi_private = bio_dirty_list; bio_dirty_list = bio; spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { /* * If we're not chaining, then ->__bi_remaining is always 1 and * we always end io on the first invocation. */ if (!bio_flagged(bio, BIO_CHAIN)) return true; BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); if (atomic_dec_and_test(&bio->__bi_remaining)) { bio_clear_flag(bio, BIO_CHAIN); return true; } return false; } /** * bio_endio - end I/O on a bio * @bio: bio * * Description: * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred * way to end I/O on a bio. No one should call bi_end_io() directly on a * bio unless they own it and thus know that it has an end_io function. * * bio_endio() can be called several times on a bio that has been chained * using bio_chain(). The ->bi_end_io() function will only be called the * last time. **/ void bio_endio(struct bio *bio) { again: if (!bio_remaining_done(bio)) return; if (!bio_integrity_endio(bio)) return; blk_zone_bio_endio(bio); rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that * save/restore bi_end_io) - however, we want to avoid unbounded * recursion and blowing the stack. Tail call optimization would * handle this, but compiling with frame pointers also disables * gcc's sibling call optimization. */ if (bio->bi_end_io == bio_chain_endio) { bio = __bio_chain_endio(bio); goto again; } #ifdef CONFIG_BLK_CGROUP /* * Release cgroup info. We shouldn't have to do this here, but quite * a few callers of bio_init fail to call bio_uninit, so we cover up * for that here at least for now. */ if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } #endif if (bio->bi_end_io) bio->bi_end_io(bio); } EXPORT_SYMBOL(bio_endio); /** * bio_split - split a bio * @bio: bio to split * @sectors: number of sectors to split from the front of @bio * @gfp: gfp mask * @bs: bio set to allocate from * * Allocates and returns a new bio which represents @sectors from the start of * @bio, and updates @bio to represent the remaining sectors. * * Unless this is a discard request the newly allocated bio will point * to @bio's bi_io_vec. It is the caller's responsibility to ensure that * neither @bio nor @bs are freed before the split bio. */ struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) { struct bio *split; if (WARN_ON_ONCE(sectors <= 0)) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(sectors >= bio_sectors(bio))) return ERR_PTR(-EINVAL); /* Zone append commands cannot be split */ if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) return ERR_PTR(-EINVAL); /* atomic writes cannot be split */ if (bio->bi_opf & REQ_ATOMIC) return ERR_PTR(-EINVAL); split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); if (!split) return ERR_PTR(-ENOMEM); split->bi_iter.bi_size = sectors << 9; if (bio_integrity(split)) bio_integrity_trim(split); bio_advance(bio, split->bi_iter.bi_size); if (bio_flagged(bio, BIO_TRACE_COMPLETION)) bio_set_flag(split, BIO_TRACE_COMPLETION); return split; } EXPORT_SYMBOL(bio_split); /** * bio_trim - trim a bio * @bio: bio to trim * @offset: number of sectors to trim from the front of @bio * @size: size we want to trim @bio to, in sectors * * This function is typically used for bios that are cloned and submitted * to the underlying device in parts. */ void bio_trim(struct bio *bio, sector_t offset, sector_t size) { /* We should never trim an atomic write */ if (WARN_ON_ONCE(bio->bi_opf & REQ_ATOMIC && size)) return; if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS || offset + size > bio_sectors(bio))) return; size <<= 9; if (offset == 0 && size == bio->bi_iter.bi_size) return; bio_advance(bio, offset << 9); bio->bi_iter.bi_size = size; if (bio_integrity(bio)) bio_integrity_trim(bio); } EXPORT_SYMBOL_GPL(bio_trim); /* * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. */ int biovec_init_pool(mempool_t *pool, int pool_entries) { struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } /* * bioset_exit - exit a bioset initialized with bioset_init() * * May be called on a zeroed but uninitialized bioset (i.e. allocated with * kzalloc()). */ void bioset_exit(struct bio_set *bs) { bio_alloc_cache_destroy(bs); if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); bs->rescue_workqueue = NULL; mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; } EXPORT_SYMBOL(bioset_exit); /** * bioset_init - Initialize a bio_set * @bs: pool to initialize * @pool_size: Number of bio and bio_vecs to cache in the mempool * @front_pad: Number of bytes to allocate in front of the returned bio * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS * and %BIOSET_NEED_RESCUER * * Description: * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller * to ask for a number of bytes to be allocated in front of the bio. * Front pad allocation is useful for embedding the bio inside * another structure, to avoid allocating extra data to go with the bio. * Note that the bio must be embedded at the END of that structure always, * or things will break badly. * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated * for allocating iovecs. This pool is not needed e.g. for bio_init_clone(). * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used * to dispatch queued requests when the mempool runs out of space. * */ int bioset_init(struct bio_set *bs, unsigned int pool_size, unsigned int front_pad, int flags) { bs->front_pad = front_pad; if (flags & BIOSET_NEED_BVECS) bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); else bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); bs->bio_slab = bio_find_or_create_slab(bs); if (!bs->bio_slab) return -ENOMEM; if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab)) goto bad; if ((flags & BIOSET_NEED_BVECS) && biovec_init_pool(&bs->bvec_pool, pool_size)) goto bad; if (flags & BIOSET_NEED_RESCUER) { bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); if (!bs->rescue_workqueue) goto bad; } if (flags & BIOSET_PERCPU_CACHE) { bs->cache = alloc_percpu(struct bio_alloc_cache); if (!bs->cache) goto bad; cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); } return 0; bad: bioset_exit(bs); return -ENOMEM; } EXPORT_SYMBOL(bioset_init); static int __init init_bio(void) { int i; BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; bvs->slab = kmem_cache_create(bvs->name, bvs->nr_vecs * sizeof(struct bio_vec), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, bio_cpu_dead); if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); return 0; } subsys_initcall(init_bio);
30 29 47 42 5 5 5 6 16 16 11 7 10 1 1 13 7 36 36 16 30 13 19 16 27 27 20 4 1 1 2 72 2 49 72 72 72 72 165 10 78 96 16 30 28 42 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Neil Brown 2002 * Copyright (C) Christoph Hellwig 2007 * * This file contains the code mapping from inodes to NFS file handles, * and for mapping back from file handles to dentries. * * For details on why we do all the strange and hairy things in here * take a look at Documentation/filesystems/nfs/exporting.rst. */ #include <linux/exportfs.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/cred.h> #define dprintk(fmt, args...) pr_debug(fmt, ##args) static int get_name(const struct path *path, char *name, struct dentry *child); static int exportfs_get_name(struct vfsmount *mnt, struct dentry *dir, char *name, struct dentry *child) { const struct export_operations *nop = dir->d_sb->s_export_op; struct path path = {.mnt = mnt, .dentry = dir}; if (nop->get_name) return nop->get_name(dir, name, child); else return get_name(&path, name, child); } /* * Check if the dentry or any of it's aliases is acceptable. */ static struct dentry * find_acceptable_alias(struct dentry *result, int (*acceptable)(void *context, struct dentry *dentry), void *context) { struct dentry *dentry, *toput = NULL; struct inode *inode; if (acceptable(context, result)) return result; inode = result->d_inode; spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { dget(dentry); spin_unlock(&inode->i_lock); if (toput) dput(toput); if (dentry != result && acceptable(context, dentry)) { dput(result); return dentry; } spin_lock(&inode->i_lock); toput = dentry; } spin_unlock(&inode->i_lock); if (toput) dput(toput); return NULL; } static bool dentry_connected(struct dentry *dentry) { dget(dentry); while (dentry->d_flags & DCACHE_DISCONNECTED) { struct dentry *parent = dget_parent(dentry); dput(dentry); if (dentry == parent) { dput(parent); return false; } dentry = parent; } dput(dentry); return true; } static void clear_disconnected(struct dentry *dentry) { dget(dentry); while (dentry->d_flags & DCACHE_DISCONNECTED) { struct dentry *parent = dget_parent(dentry); WARN_ON_ONCE(IS_ROOT(dentry)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_DISCONNECTED; spin_unlock(&dentry->d_lock); dput(dentry); dentry = parent; } dput(dentry); } /* * Reconnect a directory dentry with its parent. * * This can return a dentry, or NULL, or an error. * * In the first case the returned dentry is the parent of the given * dentry, and may itself need to be reconnected to its parent. * * In the NULL case, a concurrent VFS operation has either renamed or * removed this directory. The concurrent operation has reconnected our * dentry, so we no longer need to. */ static struct dentry *reconnect_one(struct vfsmount *mnt, struct dentry *dentry, char *nbuf) { struct dentry *parent; struct dentry *tmp; int err; parent = ERR_PTR(-EACCES); if (mnt->mnt_sb->s_export_op->get_parent) parent = mnt->mnt_sb->s_export_op->get_parent(dentry); if (IS_ERR(parent)) { dprintk("get_parent of %lu failed, err %ld\n", dentry->d_inode->i_ino, PTR_ERR(parent)); return parent; } dprintk("%s: find name of %lu in %lu\n", __func__, dentry->d_inode->i_ino, parent->d_inode->i_ino); err = exportfs_get_name(mnt, parent, nbuf, dentry); if (err == -ENOENT) goto out_reconnected; if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); tmp = lookup_one_unlocked(mnt_idmap(mnt), &QSTR(nbuf), parent); if (IS_ERR(tmp)) { dprintk("lookup failed: %ld\n", PTR_ERR(tmp)); err = PTR_ERR(tmp); goto out_err; } if (tmp != dentry) { /* * Somebody has renamed it since exportfs_get_name(); * great, since it could've only been renamed if it * got looked up and thus connected, and it would * remain connected afterwards. We are done. */ dput(tmp); goto out_reconnected; } dput(tmp); if (IS_ROOT(dentry)) { err = -ESTALE; goto out_err; } return parent; out_err: dput(parent); return ERR_PTR(err); out_reconnected: dput(parent); /* * Someone must have renamed our entry into another parent, in * which case it has been reconnected by the rename. * * Or someone removed it entirely, in which case filehandle * lookup will succeed but the directory is now IS_DEAD and * subsequent operations on it will fail. * * Alternatively, maybe there was no race at all, and the * filesystem is just corrupt and gave us a parent that doesn't * actually contain any entry pointing to this inode. So, * double check that this worked and return -ESTALE if not: */ if (!dentry_connected(dentry)) return ERR_PTR(-ESTALE); return NULL; } /* * Make sure target_dir is fully connected to the dentry tree. * * On successful return, DCACHE_DISCONNECTED will be cleared on * target_dir, and target_dir->d_parent->...->d_parent will reach the * root of the filesystem. * * Whenever DCACHE_DISCONNECTED is unset, target_dir is fully connected. * But the converse is not true: target_dir may have DCACHE_DISCONNECTED * set but already be connected. In that case we'll verify the * connection to root and then clear the flag. * * Note that target_dir could be removed by a concurrent operation. In * that case reconnect_path may still succeed with target_dir fully * connected, but further operations using the filehandle will fail when * necessary (due to S_DEAD being set on the directory). */ static int reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf) { struct dentry *dentry, *parent; dentry = dget(target_dir); while (dentry->d_flags & DCACHE_DISCONNECTED) { BUG_ON(dentry == mnt->mnt_sb->s_root); if (IS_ROOT(dentry)) parent = reconnect_one(mnt, dentry, nbuf); else parent = dget_parent(dentry); if (!parent) break; dput(dentry); if (IS_ERR(parent)) return PTR_ERR(parent); dentry = parent; } dput(dentry); clear_disconnected(target_dir); return 0; } struct getdents_callback { struct dir_context ctx; char *name; /* name that was found. It already points to a buffer NAME_MAX+1 is size */ u64 ino; /* the inum we are looking for */ int found; /* inode matched? */ int sequence; /* sequence counter */ }; /* * A rather strange filldir function to capture * the name matching the specified inode number. */ static bool filldir_one(struct dir_context *ctx, const char *name, int len, loff_t pos, u64 ino, unsigned int d_type) { struct getdents_callback *buf = container_of(ctx, struct getdents_callback, ctx); buf->sequence++; if (buf->ino == ino && len <= NAME_MAX && !is_dot_dotdot(name, len)) { memcpy(buf->name, name, len); buf->name[len] = '\0'; buf->found = 1; return false; // no more } return true; } /** * get_name - default export_operations->get_name function * @path: the directory in which to find a name * @name: a pointer to a %NAME_MAX+1 char buffer to store the name * @child: the dentry for the child directory. * * calls readdir on the parent until it finds an entry with * the same inode number as the child, and returns that. */ static int get_name(const struct path *path, char *name, struct dentry *child) { const struct cred *cred = current_cred(); struct inode *dir = path->dentry->d_inode; int error; struct file *file; struct kstat stat; struct path child_path = { .mnt = path->mnt, .dentry = child, }; struct getdents_callback buffer = { .ctx.actor = filldir_one, .ctx.count = INT_MAX, .name = name, }; error = -ENOTDIR; if (!dir || !S_ISDIR(dir->i_mode)) goto out; error = -EINVAL; if (!dir->i_fop) goto out; /* * inode->i_ino is unsigned long, kstat->ino is u64, so the * former would be insufficient on 32-bit hosts when the * filesystem supports 64-bit inode numbers. So we need to * actually call ->getattr, not just read i_ino: */ error = vfs_getattr_nosec(&child_path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (error) return error; buffer.ino = stat.ino; /* * Open the directory ... */ file = dentry_open(path, O_RDONLY, cred); error = PTR_ERR(file); if (IS_ERR(file)) goto out; error = -EINVAL; if (!file->f_op->iterate_shared) goto out_close; buffer.sequence = 0; while (1) { int old_seq = buffer.sequence; error = iterate_dir(file, &buffer.ctx); if (buffer.found) { error = 0; break; } if (error < 0) break; error = -ENOENT; if (old_seq == buffer.sequence) break; } out_close: fput(file); out: return error; } #define FILEID_INO64_GEN_LEN 3 /** * exportfs_encode_ino64_fid - encode non-decodeable 64bit ino file id * @inode: the object to encode * @fid: where to store the file handle fragment * @max_len: maximum length to store there (in 4 byte units) * * This generic function is used to encode a non-decodeable file id for * fanotify for filesystems that do not support NFS export. */ static int exportfs_encode_ino64_fid(struct inode *inode, struct fid *fid, int *max_len) { if (*max_len < FILEID_INO64_GEN_LEN) { *max_len = FILEID_INO64_GEN_LEN; return FILEID_INVALID; } fid->i64.ino = inode->i_ino; fid->i64.gen = inode->i_generation; *max_len = FILEID_INO64_GEN_LEN; return FILEID_INO64_GEN; } /** * exportfs_encode_inode_fh - encode a file handle from inode * @inode: the object to encode * @fid: where to store the file handle fragment * @max_len: maximum length to store there * @parent: parent directory inode, if wanted * @flags: properties of the requested file handle * * Returns an enum fid_type or a negative errno. */ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, int *max_len, struct inode *parent, int flags) { const struct export_operations *nop = inode->i_sb->s_export_op; enum fid_type type; if (!exportfs_can_encode_fh(nop, flags)) return -EOPNOTSUPP; if (!nop && (flags & EXPORT_FH_FID)) type = exportfs_encode_ino64_fid(inode, fid, max_len); else type = nop->encode_fh(inode, fid->raw, max_len, parent); if (type > 0 && FILEID_USER_FLAGS(type)) { pr_warn_once("%s: unexpected fh type value 0x%x from fstype %s.\n", __func__, type, inode->i_sb->s_type->name); return -EINVAL; } return type; } EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh); /** * exportfs_encode_fh - encode a file handle from dentry * @dentry: the object to encode * @fid: where to store the file handle fragment * @max_len: maximum length to store there * @flags: properties of the requested file handle * * Returns an enum fid_type or a negative errno. */ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, int flags) { int error; struct dentry *p = NULL; struct inode *inode = dentry->d_inode, *parent = NULL; if ((flags & EXPORT_FH_CONNECTABLE) && !S_ISDIR(inode->i_mode)) { p = dget_parent(dentry); /* * note that while p might've ceased to be our parent already, * it's still pinned by and still positive. */ parent = p->d_inode; } error = exportfs_encode_inode_fh(inode, fid, max_len, parent, flags); dput(p); return error; } EXPORT_SYMBOL_GPL(exportfs_encode_fh); struct dentry * exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, unsigned int flags, int (*acceptable)(void *, struct dentry *), void *context) { const struct export_operations *nop = mnt->mnt_sb->s_export_op; struct dentry *result, *alias; char nbuf[NAME_MAX+1]; int err; if (fileid_type < 0 || FILEID_USER_FLAGS(fileid_type)) return ERR_PTR(-EINVAL); /* * Try to get any dentry for the given file handle from the filesystem. */ if (!exportfs_can_decode_fh(nop)) return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); if (IS_ERR_OR_NULL(result)) return result; if ((flags & EXPORT_FH_DIR_ONLY) && !d_is_dir(result)) { err = -ENOTDIR; goto err_result; } /* * If no acceptance criteria was specified by caller, a disconnected * dentry is also accepatable. Callers may use this mode to query if * file handle is stale or to get a reference to an inode without * risking the high overhead caused by directory reconnect. */ if (!acceptable) return result; if (d_is_dir(result)) { /* * This request is for a directory. * * On the positive side there is only one dentry for each * directory inode. On the negative side this implies that we * to ensure our dentry is connected all the way up to the * filesystem root. */ if (result->d_flags & DCACHE_DISCONNECTED) { err = reconnect_path(mnt, result, nbuf); if (err) goto err_result; } if (!acceptable(context, result)) { err = -EACCES; goto err_result; } return result; } else { /* * It's not a directory. Life is a little more complicated. */ struct dentry *target_dir, *nresult; /* * See if either the dentry we just got from the filesystem * or any alias for it is acceptable. This is always true * if this filesystem is exported without the subtreecheck * option. If the filesystem is exported with the subtree * check option there's a fair chance we need to look at * the parent directory in the file handle and make sure * it's connected to the filesystem root. */ alias = find_acceptable_alias(result, acceptable, context); if (alias) return alias; /* * Try to extract a dentry for the parent directory from the * file handle. If this fails we'll have to give up. */ err = -ESTALE; if (!nop->fh_to_parent) goto err_result; target_dir = nop->fh_to_parent(mnt->mnt_sb, fid, fh_len, fileid_type); if (!target_dir) goto err_result; err = PTR_ERR(target_dir); if (IS_ERR(target_dir)) goto err_result; /* * And as usual we need to make sure the parent directory is * connected to the filesystem root. The VFS really doesn't * like disconnected directories.. */ err = reconnect_path(mnt, target_dir, nbuf); if (err) { dput(target_dir); goto err_result; } /* * Now that we've got both a well-connected parent and a * dentry for the inode we're after, make sure that our * inode is actually connected to the parent. */ err = exportfs_get_name(mnt, target_dir, nbuf, result); if (err) { dput(target_dir); goto err_result; } nresult = lookup_one_unlocked(mnt_idmap(mnt), &QSTR(nbuf), target_dir); if (!IS_ERR(nresult)) { if (unlikely(nresult->d_inode != result->d_inode)) { dput(nresult); nresult = ERR_PTR(-ESTALE); } } /* * At this point we are done with the parent, but it's pinned * by the child dentry anyway. */ dput(target_dir); if (IS_ERR(nresult)) { err = PTR_ERR(nresult); goto err_result; } dput(result); result = nresult; /* * And finally make sure the dentry is actually acceptable * to NFSD. */ alias = find_acceptable_alias(result, acceptable, context); if (!alias) { err = -EACCES; goto err_result; } return alias; } err_result: dput(result); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(exportfs_decode_fh_raw); struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *), void *context) { struct dentry *ret; ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, 0, acceptable, context); if (IS_ERR_OR_NULL(ret)) { if (ret == ERR_PTR(-ENOMEM)) return ret; return ERR_PTR(-ESTALE); } return ret; } EXPORT_SYMBOL_GPL(exportfs_decode_fh); MODULE_DESCRIPTION("Code mapping from inodes to file handles"); MODULE_LICENSE("GPL");
3 2 19 5 14 14 113 113 28 180 1 28 194 1 6 35 153 71 1 1 36 18 28 213 3 1 10 24 200 4 1 10 31 327 1 207 7 41 89 359 16 266 9 61 15 15 12 253 3 3 325 15 247 94 1 253 9 2 54 10 1 4 2 8 10 328 316 8 6 4 341 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 // SPDX-License-Identifier: GPL-2.0-or-later /* Filesystem access-by-fd. * * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/anon_inodes.h> #include <linux/namei.h> #include <linux/file.h> #include <uapi/linux/mount.h> #include "internal.h" #include "mount.h" static inline const char *fetch_message_locked(struct fc_log *log, size_t len, bool *need_free) { const char *p; int index; if (unlikely(log->head == log->tail)) return ERR_PTR(-ENODATA); index = log->tail & (ARRAY_SIZE(log->buffer) - 1); p = log->buffer[index]; if (unlikely(strlen(p) > len)) return ERR_PTR(-EMSGSIZE); log->buffer[index] = NULL; *need_free = log->need_free & (1 << index); log->need_free &= ~(1 << index); log->tail++; return p; } /* * Allow the user to read back any error, warning or informational messages. * Only one message is returned for each read(2) call. */ static ssize_t fscontext_read(struct file *file, char __user *_buf, size_t len, loff_t *pos) { struct fs_context *fc = file->private_data; ssize_t err; const char *p __free(kfree) = NULL, *message; bool need_free; int n; err = mutex_lock_interruptible(&fc->uapi_mutex); if (err < 0) return err; message = fetch_message_locked(fc->log.log, len, &need_free); mutex_unlock(&fc->uapi_mutex); if (IS_ERR(message)) return PTR_ERR(message); if (need_free) p = message; n = strlen(message); if (copy_to_user(_buf, message, n)) return -EFAULT; return n; } static int fscontext_release(struct inode *inode, struct file *file) { struct fs_context *fc = file->private_data; if (fc) { file->private_data = NULL; put_fs_context(fc); } return 0; } const struct file_operations fscontext_fops = { .read = fscontext_read, .release = fscontext_release, }; /* * Attach a filesystem context to a file and an fd. */ static int fscontext_create_fd(struct fs_context *fc, unsigned int o_flags) { int fd; fd = anon_inode_getfd("[fscontext]", &fscontext_fops, fc, O_RDWR | o_flags); if (fd < 0) put_fs_context(fc); return fd; } static int fscontext_alloc_log(struct fs_context *fc) { fc->log.log = kzalloc(sizeof(*fc->log.log), GFP_KERNEL); if (!fc->log.log) return -ENOMEM; refcount_set(&fc->log.log->usage, 1); fc->log.log->owner = fc->fs_type->owner; return 0; } /* * Open a filesystem by name so that it can be configured for mounting. * * We are allowed to specify a container in which the filesystem will be * opened, thereby indicating which namespaces will be used (notably, which * network namespace will be used for network filesystems). */ SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags) { struct file_system_type *fs_type; struct fs_context *fc; const char *fs_name; int ret; if (!may_mount()) return -EPERM; if (flags & ~FSOPEN_CLOEXEC) return -EINVAL; fs_name = strndup_user(_fs_name, PAGE_SIZE); if (IS_ERR(fs_name)) return PTR_ERR(fs_name); fs_type = get_fs_type(fs_name); kfree(fs_name); if (!fs_type) return -ENODEV; fc = fs_context_for_mount(fs_type, 0); put_filesystem(fs_type); if (IS_ERR(fc)) return PTR_ERR(fc); fc->phase = FS_CONTEXT_CREATE_PARAMS; ret = fscontext_alloc_log(fc); if (ret < 0) goto err_fc; return fscontext_create_fd(fc, flags & FSOPEN_CLOEXEC ? O_CLOEXEC : 0); err_fc: put_fs_context(fc); return ret; } /* * Pick a superblock into a context for reconfiguration. */ SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags) { struct fs_context *fc; struct path target; unsigned int lookup_flags; int ret; if (!may_mount()) return -EPERM; if ((flags & ~(FSPICK_CLOEXEC | FSPICK_SYMLINK_NOFOLLOW | FSPICK_NO_AUTOMOUNT | FSPICK_EMPTY_PATH)) != 0) return -EINVAL; lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; if (flags & FSPICK_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & FSPICK_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & FSPICK_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; ret = user_path_at(dfd, path, lookup_flags, &target); if (ret < 0) goto err; ret = -EINVAL; if (target.mnt->mnt_root != target.dentry) goto err_path; fc = fs_context_for_reconfigure(target.dentry, 0, 0); if (IS_ERR(fc)) { ret = PTR_ERR(fc); goto err_path; } fc->phase = FS_CONTEXT_RECONF_PARAMS; ret = fscontext_alloc_log(fc); if (ret < 0) goto err_fc; path_put(&target); return fscontext_create_fd(fc, flags & FSPICK_CLOEXEC ? O_CLOEXEC : 0); err_fc: put_fs_context(fc); err_path: path_put(&target); err: return ret; } static int vfs_cmd_create(struct fs_context *fc, bool exclusive) { struct super_block *sb; int ret; if (fc->phase != FS_CONTEXT_CREATE_PARAMS) return -EBUSY; if (!mount_capable(fc)) return -EPERM; fc->phase = FS_CONTEXT_CREATING; fc->exclusive = exclusive; ret = vfs_get_tree(fc); if (ret) { fc->phase = FS_CONTEXT_FAILED; return ret; } sb = fc->root->d_sb; ret = security_sb_kern_mount(sb); if (unlikely(ret)) { fc_drop_locked(fc); fc->phase = FS_CONTEXT_FAILED; return ret; } /* vfs_get_tree() callchains will have grabbed @s_umount */ up_write(&sb->s_umount); fc->phase = FS_CONTEXT_AWAITING_MOUNT; return 0; } static int vfs_cmd_reconfigure(struct fs_context *fc) { struct super_block *sb; int ret; if (fc->phase != FS_CONTEXT_RECONF_PARAMS) return -EBUSY; fc->phase = FS_CONTEXT_RECONFIGURING; sb = fc->root->d_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { fc->phase = FS_CONTEXT_FAILED; return -EPERM; } down_write(&sb->s_umount); ret = reconfigure_super(fc); up_write(&sb->s_umount); if (ret) { fc->phase = FS_CONTEXT_FAILED; return ret; } vfs_clean_context(fc); return 0; } /* * Check the state and apply the configuration. Note that this function is * allowed to 'steal' the value by setting param->xxx to NULL before returning. */ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, struct fs_parameter *param) { int ret; ret = finish_clean_context(fc); if (ret) return ret; switch (cmd) { case FSCONFIG_CMD_CREATE: return vfs_cmd_create(fc, false); case FSCONFIG_CMD_CREATE_EXCL: return vfs_cmd_create(fc, true); case FSCONFIG_CMD_RECONFIGURE: return vfs_cmd_reconfigure(fc); default: if (fc->phase != FS_CONTEXT_CREATE_PARAMS && fc->phase != FS_CONTEXT_RECONF_PARAMS) return -EBUSY; return vfs_parse_fs_param(fc, param); } } /** * sys_fsconfig - Set parameters and trigger actions on a context * @fd: The filesystem context to act upon * @cmd: The action to take * @_key: Where appropriate, the parameter key to set * @_value: Where appropriate, the parameter value to set * @aux: Additional information for the value * * This system call is used to set parameters on a context, including * superblock settings, data source and security labelling. * * Actions include triggering the creation of a superblock and the * reconfiguration of the superblock attached to the specified context. * * When setting a parameter, @cmd indicates the type of value being proposed * and @_key indicates the parameter to be altered. * * @_value and @aux are used to specify the value, should a value be required: * * (*) fsconfig_set_flag: No value is specified. The parameter must be boolean * in nature. The key may be prefixed with "no" to invert the * setting. @_value must be NULL and @aux must be 0. * * (*) fsconfig_set_string: A string value is specified. The parameter can be * expecting boolean, integer, string or take a path. A conversion to an * appropriate type will be attempted (which may include looking up as a * path). @_value points to a NUL-terminated string and @aux must be 0. * * (*) fsconfig_set_binary: A binary blob is specified. @_value points to the * blob and @aux indicates its size. The parameter must be expecting a * blob. * * (*) fsconfig_set_path: A non-empty path is specified. The parameter must be * expecting a path object. @_value points to a NUL-terminated string that * is the path and @aux is a file descriptor at which to start a relative * lookup or AT_FDCWD. * * (*) fsconfig_set_path_empty: As fsconfig_set_path, but with AT_EMPTY_PATH * implied. * * (*) fsconfig_set_fd: An open file descriptor is specified. @_value must be * NULL and @aux indicates the file descriptor. */ SYSCALL_DEFINE5(fsconfig, int, fd, unsigned int, cmd, const char __user *, _key, const void __user *, _value, int, aux) { struct fs_context *fc; int ret; int lookup_flags = 0; struct fs_parameter param = { .type = fs_value_is_undefined, }; if (fd < 0) return -EINVAL; switch (cmd) { case FSCONFIG_SET_FLAG: if (!_key || _value || aux) return -EINVAL; break; case FSCONFIG_SET_STRING: if (!_key || !_value || aux) return -EINVAL; break; case FSCONFIG_SET_BINARY: if (!_key || !_value || aux <= 0 || aux > 1024 * 1024) return -EINVAL; break; case FSCONFIG_SET_PATH: case FSCONFIG_SET_PATH_EMPTY: if (!_key || !_value || (aux != AT_FDCWD && aux < 0)) return -EINVAL; break; case FSCONFIG_SET_FD: if (!_key || _value || aux < 0) return -EINVAL; break; case FSCONFIG_CMD_CREATE: case FSCONFIG_CMD_CREATE_EXCL: case FSCONFIG_CMD_RECONFIGURE: if (_key || _value || aux) return -EINVAL; break; default: return -EOPNOTSUPP; } CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_op != &fscontext_fops) return -EINVAL; fc = fd_file(f)->private_data; if (fc->ops == &legacy_fs_context_ops) { switch (cmd) { case FSCONFIG_SET_BINARY: case FSCONFIG_SET_PATH: case FSCONFIG_SET_PATH_EMPTY: case FSCONFIG_SET_FD: case FSCONFIG_CMD_CREATE_EXCL: return -EOPNOTSUPP; } } if (_key) { param.key = strndup_user(_key, 256); if (IS_ERR(param.key)) return PTR_ERR(param.key); } switch (cmd) { case FSCONFIG_SET_FLAG: param.type = fs_value_is_flag; break; case FSCONFIG_SET_STRING: param.type = fs_value_is_string; param.string = strndup_user(_value, 256); if (IS_ERR(param.string)) { ret = PTR_ERR(param.string); goto out_key; } param.size = strlen(param.string); break; case FSCONFIG_SET_BINARY: param.type = fs_value_is_blob; param.size = aux; param.blob = memdup_user_nul(_value, aux); if (IS_ERR(param.blob)) { ret = PTR_ERR(param.blob); goto out_key; } break; case FSCONFIG_SET_PATH_EMPTY: lookup_flags = LOOKUP_EMPTY; fallthrough; case FSCONFIG_SET_PATH: param.type = fs_value_is_filename; param.name = getname_flags(_value, lookup_flags); if (IS_ERR(param.name)) { ret = PTR_ERR(param.name); goto out_key; } param.dirfd = aux; param.size = strlen(param.name->name); break; case FSCONFIG_SET_FD: param.type = fs_value_is_file; ret = -EBADF; param.file = fget_raw(aux); if (!param.file) goto out_key; param.dirfd = aux; break; default: break; } ret = mutex_lock_interruptible(&fc->uapi_mutex); if (ret == 0) { ret = vfs_fsconfig_locked(fc, cmd, &param); mutex_unlock(&fc->uapi_mutex); } /* Clean up the our record of any value that we obtained from * userspace. Note that the value may have been stolen by the LSM or * filesystem, in which case the value pointer will have been cleared. */ switch (cmd) { case FSCONFIG_SET_STRING: case FSCONFIG_SET_BINARY: kfree(param.string); break; case FSCONFIG_SET_PATH: case FSCONFIG_SET_PATH_EMPTY: if (param.name) putname(param.name); break; case FSCONFIG_SET_FD: if (param.file) fput(param.file); break; default: break; } out_key: kfree(param.key); return ret; }
9 2 3 2 2 3 3 2 1 188 186 14 2 1 31 4 3 2 7 29 7 4 4 3 3 3 1 1 1 11 11 41 41 41 41 44 1 10 1 1 29 1 1 41 41 2 1 1 1 22 29 22 8 237 17 11 2 5 4 3 4 48 1 29 18 5 13 17 17 17 1 2 3 11 12 2 32 32 1 3 20 13 29 178 234 3 232 235 4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 // SPDX-License-Identifier: GPL-2.0 /* * XFRM virtual interface * * Copyright (C) 2018 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_link.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/gso.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip_tunnels.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/dst_metadata.h> #include <net/netns/generic.h> #include <linux/etherdevice.h> static int xfrmi_dev_init(struct net_device *dev); static void xfrmi_dev_setup(struct net_device *dev); static struct rtnl_link_ops xfrmi_link_ops __read_mostly; static unsigned int xfrmi_net_id __read_mostly; static const struct net_device_ops xfrmi_netdev_ops; #define XFRMI_HASH_BITS 8 #define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) struct xfrmi_net { /* lists for storing interfaces in use */ struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; struct xfrm_if __rcu *collect_md_xfrmi; }; static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), }; static void xfrmi_destroy_state(struct lwtunnel_state *lwt) { } static int xfrmi_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_XFRM_MAX + 1]; struct lwtunnel_state *new_state; struct xfrm_md_info *info; int ret; ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); if (ret < 0) return ret; if (!tb[LWT_XFRM_IF_ID]) { NL_SET_ERR_MSG(extack, "if_id must be set"); return -EINVAL; } new_state = lwtunnel_state_alloc(sizeof(*info)); if (!new_state) { NL_SET_ERR_MSG(extack, "failed to create encap info"); return -ENOMEM; } new_state->type = LWTUNNEL_ENCAP_XFRM; info = lwt_xfrm_info(new_state); info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); if (tb[LWT_XFRM_LINK]) info->link = nla_get_u32(tb[LWT_XFRM_LINK]); *ts = new_state; return 0; } static int xfrmi_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) { struct xfrm_md_info *info = lwt_xfrm_info(lwt); if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) return -EMSGSIZE; return 0; } static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ } static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct xfrm_md_info *a_info = lwt_xfrm_info(a); struct xfrm_md_info *b_info = lwt_xfrm_info(b); return memcmp(a_info, b_info, sizeof(*a_info)); } static const struct lwtunnel_encap_ops xfrmi_encap_ops = { .build_state = xfrmi_build_state, .destroy_state = xfrmi_destroy_state, .fill_encap = xfrmi_fill_encap_info, .get_encap_size = xfrmi_encap_nlsize, .cmp_encap = xfrmi_encap_cmp, .owner = THIS_MODULE, }; #define for_each_xfrmi_rcu(start, xi) \ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) static u32 xfrmi_hash(u32 if_id) { return hash_32(if_id, XFRMI_HASH_BITS); } static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if *xi; for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { if (x->if_id == xi->p.if_id && (xi->dev->flags & IFF_UP)) return xi; } xi = rcu_dereference(xfrmn->collect_md_xfrmi); if (xi && (xi->dev->flags & IFF_UP)) return xi; return NULL; } static bool xfrmi_decode_session(struct sk_buff *skb, unsigned short family, struct xfrm_if_decode_session_result *res) { struct net_device *dev; struct xfrm_if *xi; int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) return false; switch (family) { case AF_INET6: ifindex = inet6_sdif(skb); break; case AF_INET: ifindex = inet_sdif(skb); break; } if (ifindex) { struct net *net = xs_net(xfrm_input_state(skb)); dev = dev_get_by_index_rcu(net, ifindex); } else { dev = skb->dev; } if (!dev || !(dev->flags & IFF_UP)) return false; if (dev->netdev_ops != &xfrmi_netdev_ops) return false; xi = netdev_priv(dev); res->net = xi->net; if (xi->p.collect_md) res->if_id = xfrm_input_state(skb)->if_id; else res->if_id = xi->p.if_id; return true; } static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); rcu_assign_pointer(*xip, xi); } static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip; struct xfrm_if *iter; for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; (iter = rtnl_dereference(*xip)) != NULL; xip = &iter->next) { if (xi == iter) { rcu_assign_pointer(*xip, xi->next); break; } } } static void xfrmi_dev_free(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); gro_cells_destroy(&xi->gro_cells); } static int xfrmi_create(struct net *net, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; dev->rtnl_link_ops = &xfrmi_link_ops; err = register_netdevice(dev); if (err < 0) goto out; if (xi->p.collect_md) rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); else xfrmi_link(xfrmn, xi); return 0; out: return err; } static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) { struct xfrm_if __rcu **xip; struct xfrm_if *xi; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) if (xi->p.if_id == p->if_id) return xi; return NULL; } static void xfrmi_dev_uninit(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); if (xi->p.collect_md) RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); else xfrmi_unlink(xfrmn, xi); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) { skb_clear_tstamp(skb); skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) return; ipvs_reset(skb); secpath_reset(skb); skb_orphan(skb); skb->mark = 0; } static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type, unsigned short family) { struct sec_path *sp; sp = skb_sec_path(skb); if (sp && (sp->len || sp->olen) && !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) goto discard; XFRM_SPI_SKB_CB(skb)->family = family; if (family == AF_INET) { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; } else { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; } return xfrm_input(skb, nexthdr, spi, encap_type); discard: kfree_skb(skb); return 0; } static int xfrmi4_rcv(struct sk_buff *skb) { return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); } static int xfrmi6_rcv(struct sk_buff *skb) { return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0, 0, AF_INET6); } static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); } static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); } static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; struct net_device *dev; struct xfrm_state *x; struct xfrm_if *xi; bool xnet; int link; if (err && !secpath_exists(skb)) return 0; x = xfrm_input_state(skb); xi = xfrmi_lookup(xs_net(x), x); if (!xi) return 1; link = skb->dev->ifindex; dev = xi->dev; skb->dev = dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } xnet = !net_eq(xi->net, dev_net(skb->dev)); if (xnet) { inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, inner_mode->family)) return -EPERM; } xfrmi_scrub_packet(skb, xnet); if (xi->p.collect_md) { struct metadata_dst *md_dst; md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); if (!md_dst) return -ENOMEM; md_dst->u.xfrm_info.if_id = x->if_id; md_dst->u.xfrm_info.link = link; skb_dst_set(skb, (struct dst_entry *)md_dst); } dev_sw_netstats_rx_add(dev, skb->len); return 0; } static int xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); unsigned int length = skb->len; struct net_device *tdev; struct xfrm_state *x; int err = -1; u32 if_id; int mtu; if (xi->p.collect_md) { struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); if (unlikely(!md_info)) return -EINVAL; if_id = md_info->if_id; fl->flowi_oif = md_info->link; if (md_info->dst_orig) { struct dst_entry *tmp_dst = dst; dst = md_info->dst_orig; skb_dst_set(skb, dst); md_info->dst_orig = NULL; dst_release(tmp_dst); } } else { if_id = xi->p.if_id; } dst_hold(dst); dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } x = dst->xfrm; if (!x) goto tx_err_link_failure; if (x->if_id != if_id) goto tx_err_link_failure; tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", dev->name); goto tx_err_dst_release; } mtu = dst_mtu(dst); if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (skb->len > 1280) icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); else goto xmit; } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } dst_release(dst); return -EMSGSIZE; } xmit: xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = tdev; err = dst_output(xi->net, skb_to_full_sk(skb), skb); if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); if (!dst) { fl.u.ip6.flowi6_oif = dev->ifindex; fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); if (dst->error) { dst_release(dst); DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, dst); } break; case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); if (!dst) { struct rtable *rt; fl.u.ip4.flowi4_oif = dev->ifindex; fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, &rt->dst); } break; default: goto tx_err; } fl.flowi_oif = xi->p.link; ret = xfrmi_xmit2(skb, dev, &fl); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static int xfrmi4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->protocol; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah ; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, protocol); else ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; } static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->nexthdr; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data + offset); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data + offset); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data + offset); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET6); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) { if (xi->p.link != p->link) return -EINVAL; xi->p.if_id = p->if_id; return 0; } static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) { struct net *net = xi->net; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; xfrmi_unlink(xfrmn, xi); synchronize_net(); err = xfrmi_change(xi, p); xfrmi_link(xfrmn, xi); netdev_state_change(xi->dev); return err; } static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return READ_ONCE(xi->p.link); } static const struct net_device_ops xfrmi_netdev_ops = { .ndo_init = xfrmi_dev_init, .ndo_uninit = xfrmi_dev_uninit, .ndo_start_xmit = xfrmi_xmit, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = xfrmi_get_iflink, }; static void xfrmi_dev_setup(struct net_device *dev) { dev->netdev_ops = &xfrmi_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_NONE; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU; dev->flags = IFF_NOARP; dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); eth_broadcast_addr(dev->broadcast); } #define XFRMI_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static int xfrmi_dev_init(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); int err; err = gro_cells_init(&xi->gro_cells, dev); if (err) return err; dev->lltx = true; dev->features |= XFRMI_FEATURES; dev->hw_features |= XFRMI_FEATURES; if (phydev) { dev->needed_headroom = phydev->needed_headroom; dev->needed_tailroom = phydev->needed_tailroom; if (is_zero_ether_addr(dev->dev_addr)) eth_hw_addr_inherit(dev, phydev); if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); } else { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); } return 0; } static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void xfrmi_netlink_parms(struct nlattr *data[], struct xfrm_if_parms *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_XFRM_LINK]) parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); if (data[IFLA_XFRM_IF_ID]) parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); if (data[IFLA_XFRM_COLLECT_METADATA]) parms->collect_md = true; } static int xfrmi_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct xfrm_if_parms p = {}; struct xfrm_if *xi; struct net *net; int err; net = params->link_net ? : dev_net(dev); xfrmi_netlink_parms(data, &p); if (p.collect_md) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); if (p.link || p.if_id) { NL_SET_ERR_MSG(extack, "link and if_id must be zero"); return -EINVAL; } if (rtnl_dereference(xfrmn->collect_md_xfrmi)) return -EEXIST; } else { if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; } xi = netdev_priv(dev); xi->p = p; xi->net = net; xi->dev = dev; err = xfrmi_create(net, dev); return err; } static void xfrmi_dellink(struct net_device *dev, struct list_head *head) { unregister_netdevice_queue(dev, head); } static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = xi->net; struct xfrm_if_parms p = {}; xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } if (p.collect_md || xi->p.collect_md) { NL_SET_ERR_MSG(extack, "collect_md can't be changed"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); } else { if (xi->dev != dev) return -EEXIST; } return xfrmi_update(xi, &p); } static size_t xfrmi_get_size(const struct net_device *dev) { return /* IFLA_XFRM_LINK */ nla_total_size(4) + /* IFLA_XFRM_IF_ID */ nla_total_size(4) + /* IFLA_XFRM_COLLECT_METADATA */ nla_total_size(0) + 0; } static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrm_if_parms *parm = &xi->p; if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct net *xfrmi_get_link_net(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return READ_ONCE(xi->net); } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, [IFLA_XFRM_LINK] = { .type = NLA_U32 }, [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .kind = "xfrm", .maxtype = IFLA_XFRM_MAX, .policy = xfrmi_policy, .priv_size = sizeof(struct xfrm_if), .setup = xfrmi_dev_setup, .validate = xfrmi_validate, .newlink = xfrmi_newlink, .dellink = xfrmi_dellink, .changelink = xfrmi_changelink, .get_size = xfrmi_get_size, .fill_info = xfrmi_fill_info, .get_link_net = xfrmi_get_link_net, }; static void __net_exit xfrmi_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if __rcu **xip; struct xfrm_if *xi; int i; for (i = 0; i < XFRMI_HASH_SIZE; i++) { for (xip = &xfrmn->xfrmi[i]; (xi = rtnl_net_dereference(net, *xip)) != NULL; xip = &xi->next) unregister_netdevice_queue(xi->dev, dev_to_kill); } xi = rtnl_net_dereference(net, xfrmn->collect_md_xfrmi); if (xi) unregister_netdevice_queue(xi->dev, dev_to_kill); } static struct pernet_operations xfrmi_net_ops = { .exit_rtnl = xfrmi_exit_rtnl, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { .handler = xfrmi6_rcv, .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) static int xfrmi6_rcv_tunnel(struct sk_buff *skb) { const xfrm_address_t *saddr; __be32 spi; saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { .handler = xfrmi4_rcv, .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) static int xfrmi4_rcv_tunnel(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); } static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 3, }; static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 2, }; #endif static int __init xfrmi4_init(void) { int err; err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm_tunnel_ipip6_failed: xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi4_fini(void) { #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); #endif xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); } static int __init xfrmi6_init(void) { int err; err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipv6_failed; err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ip6ip_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm_tunnel_ip6ip_failed: xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); xfrm_tunnel_ipv6_failed: xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi6_fini(void) { #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); #endif xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); } static const struct xfrm_if_cb xfrm_if_cb = { .decode_session = xfrmi_decode_session, }; static int __init xfrmi_init(void) { const char *msg; int err; pr_info("IPsec XFRM device driver\n"); msg = "tunnel device"; err = register_pernet_device(&xfrmi_net_ops); if (err < 0) goto pernet_dev_failed; msg = "xfrm4 protocols"; err = xfrmi4_init(); if (err < 0) goto xfrmi4_failed; msg = "xfrm6 protocols"; err = xfrmi6_init(); if (err < 0) goto xfrmi6_failed; msg = "netlink interface"; err = rtnl_link_register(&xfrmi_link_ops); if (err < 0) goto rtnl_link_failed; err = register_xfrm_interface_bpf(); if (err < 0) goto kfunc_failed; lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); xfrm_if_register_cb(&xfrm_if_cb); return err; kfunc_failed: rtnl_link_unregister(&xfrmi_link_ops); rtnl_link_failed: xfrmi6_fini(); xfrmi6_failed: xfrmi4_fini(); xfrmi4_failed: unregister_pernet_device(&xfrmi_net_ops); pernet_dev_failed: pr_err("xfrmi init: failed to register %s\n", msg); return err; } static void __exit xfrmi_fini(void) { xfrm_if_unregister_cb(); lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); rtnl_link_unregister(&xfrmi_link_ops); xfrmi4_fini(); xfrmi6_fini(); unregister_pernet_device(&xfrmi_net_ops); } module_init(xfrmi_init); module_exit(xfrmi_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("xfrm"); MODULE_ALIAS_NETDEV("xfrm0"); MODULE_AUTHOR("Steffen Klassert"); MODULE_DESCRIPTION("XFRM virtual interface");
119 2 226 2 111 111 84 43 111 111 111 44 44 44 308 7 3 308 1 308 84 83 84 227 227 195 196 231 20 224 211 199 1 3 1 2 42 42 42 93 9 90 10 6 19 52 53 53 165 133 53 4 7 2 2 4 4 2 51 30 21 32 18 11 21 24 20 20 20 28 28 28 28 28 28 38 6 32 2 28 26 1 4 21 78 13 31 21 22 23 95 39 81 6 6 6 126 32 2 119 11 118 1 26 183 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 /* * linux/drivers/video/vgacon.c -- Low level VGA based console driver * * Created 28 Sep 1997 by Geert Uytterhoeven * * Rewritten by Martin Mares <mj@ucw.cz>, July 1998 * * This file is based on the old console.c, vga.c and vesa_blank.c drivers. * * Copyright (C) 1991, 1992 Linus Torvalds * 1995 Jay Estabrook * * User definable mapping table and font loading by Eugene G. Crosser, * <crosser@average.org> * * Improved loadable font/UTF-8 support by H. Peter Anvin * Feb-Sep 1995 <peter.anvin@linux.org> * * Colour palette handling, by Simon Tatham * 17-Jun-95 <sgt20@cam.ac.uk> * * if 512 char mode is already enabled don't re-enable it, * because it causes screen to flicker, by Mitja Horvat * 5-May-96 <mitja.horvat@guest.arnes.si> * * Use 2 outw instead of 4 outb_p to reduce erroneous text * flashing on RHS of screen during heavy console scrolling . * Oct 1996, Paul Gortmaker. * * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive for * more details. */ #include <linux/module.h> #include <linux/types.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/console.h> #include <linux/string.h> #include <linux/kd.h> #include <linux/slab.h> #include <linux/vt_kern.h> #include <linux/sched.h> #include <linux/selection.h> #include <linux/spinlock.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/screen_info.h> #include <video/vga.h> #include <asm/io.h> static DEFINE_RAW_SPINLOCK(vga_lock); static int cursor_size_lastfrom; static int cursor_size_lastto; static u32 vgacon_xres; static u32 vgacon_yres; static struct vgastate vgastate; #define BLANK 0x0020 #define VGA_FONTWIDTH 8 /* VGA does not support fontwidths != 8 */ /* * Interface used by the world */ static bool vgacon_set_origin(struct vc_data *c); static struct uni_pagedict *vgacon_uni_pagedir; static int vgacon_refcount; /* Description of the hardware situation */ static unsigned long vga_vram_base __read_mostly; /* Base of video memory */ static unsigned long vga_vram_end __read_mostly; /* End of video memory */ static unsigned int vga_vram_size __read_mostly; /* Size of video memory */ static u16 vga_video_port_reg __read_mostly; /* Video register select port */ static u16 vga_video_port_val __read_mostly; /* Video register value port */ static unsigned int vga_video_num_columns; /* Number of text columns */ static unsigned int vga_video_num_lines; /* Number of text lines */ static bool vga_can_do_color; /* Do we support colors? */ static unsigned int vga_default_font_height __read_mostly; /* Height of default screen font */ static unsigned char vga_video_type __read_mostly; /* Card type */ static enum vesa_blank_mode vga_vesa_blanked; static bool vga_palette_blanked; static bool vga_is_gfx; static bool vga_512_chars; static int vga_video_font_height; static int vga_scan_lines __read_mostly; static unsigned int vga_rolled_over; /* last vc_origin offset before wrap */ static struct screen_info *vga_si; static bool vga_hardscroll_enabled; static bool vga_hardscroll_user_enable = true; static int __init no_scroll(char *str) { /* * Disabling scrollback is required for the Braillex ib80-piezo * Braille reader made by F.H. Papenmeier (Germany). * Use the "no-scroll" bootflag. */ vga_hardscroll_user_enable = vga_hardscroll_enabled = false; return 1; } __setup("no-scroll", no_scroll); /* * By replacing the four outb_p with two back to back outw, we can reduce * the window of opportunity to see text mislocated to the RHS of the * console during heavy scrolling activity. However there is the remote * possibility that some pre-dinosaur hardware won't like the back to back * I/O. Since the Xservers get away with it, we should be able to as well. */ static inline void write_vga(unsigned char reg, unsigned int val) { unsigned int v1, v2; unsigned long flags; /* * ddprintk might set the console position from interrupt * handlers, thus the write has to be IRQ-atomic. */ raw_spin_lock_irqsave(&vga_lock, flags); v1 = reg + (val & 0xff00); v2 = reg + 1 + ((val << 8) & 0xff00); outw(v1, vga_video_port_reg); outw(v2, vga_video_port_reg); raw_spin_unlock_irqrestore(&vga_lock, flags); } static inline void vga_set_mem_top(struct vc_data *c) { write_vga(12, (c->vc_visible_origin - vga_vram_base) / 2); } static void vgacon_scrolldelta(struct vc_data *c, int lines) { unsigned long scr_end = c->vc_scr_end - vga_vram_base; unsigned long vorigin = c->vc_visible_origin - vga_vram_base; unsigned long origin = c->vc_origin - vga_vram_base; int margin = c->vc_size_row * 4; int from, wrap, from_off, avail; /* Turn scrollback off */ if (!lines) { c->vc_visible_origin = c->vc_origin; return; } /* Do we have already enough to allow jumping from 0 to the end? */ if (vga_rolled_over > scr_end + margin) { from = scr_end; wrap = vga_rolled_over + c->vc_size_row; } else { from = 0; wrap = vga_vram_size; } from_off = (vorigin - from + wrap) % wrap + lines * c->vc_size_row; avail = (origin - from + wrap) % wrap; /* Only a little piece would be left? Show all incl. the piece! */ if (avail < 2 * margin) margin = 0; if (from_off < margin) from_off = 0; if (from_off > avail - margin) from_off = avail; c->vc_visible_origin = vga_vram_base + (from + from_off) % wrap; vga_set_mem_top(c); } static void vgacon_restore_screen(struct vc_data *c) { if (c->vc_origin != c->vc_visible_origin) vgacon_scrolldelta(c, 0); } static const char *vgacon_startup(void) { const char *display_desc = NULL; u16 saved1, saved2; volatile u16 *p; if (!vga_si || vga_si->orig_video_isVGA == VIDEO_TYPE_VLFB || vga_si->orig_video_isVGA == VIDEO_TYPE_EFI) { no_vga: #ifdef CONFIG_DUMMY_CONSOLE conswitchp = &dummy_con; return conswitchp->con_startup(); #else return NULL; #endif } /* vga_si reasonably initialized? */ if ((vga_si->orig_video_lines == 0) || (vga_si->orig_video_cols == 0)) goto no_vga; /* VGA16 modes are not handled by VGACON */ if ((vga_si->orig_video_mode == 0x0D) || /* 320x200/4 */ (vga_si->orig_video_mode == 0x0E) || /* 640x200/4 */ (vga_si->orig_video_mode == 0x10) || /* 640x350/4 */ (vga_si->orig_video_mode == 0x12) || /* 640x480/4 */ (vga_si->orig_video_mode == 0x6A)) /* 800x600/4 (VESA) */ goto no_vga; vga_video_num_lines = vga_si->orig_video_lines; vga_video_num_columns = vga_si->orig_video_cols; vgastate.vgabase = NULL; if (vga_si->orig_video_mode == 7) { /* Monochrome display */ vga_vram_base = 0xb0000; vga_video_port_reg = VGA_CRT_IM; vga_video_port_val = VGA_CRT_DM; if ((vga_si->orig_video_ega_bx & 0xff) != 0x10) { static struct resource ega_console_resource = { .name = "ega", .flags = IORESOURCE_IO, .start = 0x3B0, .end = 0x3BF }; vga_video_type = VIDEO_TYPE_EGAM; vga_vram_size = 0x8000; display_desc = "EGA+"; request_resource(&ioport_resource, &ega_console_resource); } else { static struct resource mda1_console_resource = { .name = "mda", .flags = IORESOURCE_IO, .start = 0x3B0, .end = 0x3BB }; static struct resource mda2_console_resource = { .name = "mda", .flags = IORESOURCE_IO, .start = 0x3BF, .end = 0x3BF }; vga_video_type = VIDEO_TYPE_MDA; vga_vram_size = 0x2000; display_desc = "*MDA"; request_resource(&ioport_resource, &mda1_console_resource); request_resource(&ioport_resource, &mda2_console_resource); vga_video_font_height = 14; } } else { /* If not, it is color. */ vga_can_do_color = true; vga_vram_base = 0xb8000; vga_video_port_reg = VGA_CRT_IC; vga_video_port_val = VGA_CRT_DC; if ((vga_si->orig_video_ega_bx & 0xff) != 0x10) { int i; vga_vram_size = 0x8000; if (!vga_si->orig_video_isVGA) { static struct resource ega_console_resource = { .name = "ega", .flags = IORESOURCE_IO, .start = 0x3C0, .end = 0x3DF }; vga_video_type = VIDEO_TYPE_EGAC; display_desc = "EGA"; request_resource(&ioport_resource, &ega_console_resource); } else { static struct resource vga_console_resource = { .name = "vga+", .flags = IORESOURCE_IO, .start = 0x3C0, .end = 0x3DF }; vga_video_type = VIDEO_TYPE_VGAC; display_desc = "VGA+"; request_resource(&ioport_resource, &vga_console_resource); /* * Normalise the palette registers, to point * the 16 screen colours to the first 16 * DAC entries. */ for (i = 0; i < 16; i++) { inb_p(VGA_IS1_RC); outb_p(i, VGA_ATT_W); outb_p(i, VGA_ATT_W); } outb_p(0x20, VGA_ATT_W); /* * Now set the DAC registers back to their * default values */ for (i = 0; i < 16; i++) { outb_p(color_table[i], VGA_PEL_IW); outb_p(default_red[i], VGA_PEL_D); outb_p(default_grn[i], VGA_PEL_D); outb_p(default_blu[i], VGA_PEL_D); } } } else { static struct resource cga_console_resource = { .name = "cga", .flags = IORESOURCE_IO, .start = 0x3D4, .end = 0x3D5 }; vga_video_type = VIDEO_TYPE_CGA; vga_vram_size = 0x2000; display_desc = "*CGA"; request_resource(&ioport_resource, &cga_console_resource); vga_video_font_height = 8; } } vga_vram_base = VGA_MAP_MEM(vga_vram_base, vga_vram_size); vga_vram_end = vga_vram_base + vga_vram_size; /* * Find out if there is a graphics card present. * Are there smarter methods around? */ p = (volatile u16 *) vga_vram_base; saved1 = scr_readw(p); saved2 = scr_readw(p + 1); scr_writew(0xAA55, p); scr_writew(0x55AA, p + 1); if (scr_readw(p) != 0xAA55 || scr_readw(p + 1) != 0x55AA) { scr_writew(saved1, p); scr_writew(saved2, p + 1); goto no_vga; } scr_writew(0x55AA, p); scr_writew(0xAA55, p + 1); if (scr_readw(p) != 0x55AA || scr_readw(p + 1) != 0xAA55) { scr_writew(saved1, p); scr_writew(saved2, p + 1); goto no_vga; } scr_writew(saved1, p); scr_writew(saved2, p + 1); if (vga_video_type == VIDEO_TYPE_EGAC || vga_video_type == VIDEO_TYPE_VGAC || vga_video_type == VIDEO_TYPE_EGAM) { vga_hardscroll_enabled = vga_hardscroll_user_enable; vga_default_font_height = vga_si->orig_video_points; vga_video_font_height = vga_si->orig_video_points; /* This may be suboptimal but is a safe bet - go with it */ vga_scan_lines = vga_video_font_height * vga_video_num_lines; } vgacon_xres = vga_si->orig_video_cols * VGA_FONTWIDTH; vgacon_yres = vga_scan_lines; return display_desc; } static void vgacon_init(struct vc_data *c, bool init) { struct uni_pagedict *p; /* * We cannot be loaded as a module, therefore init will be 1 * if we are the default console, however if we are a fallback * console, for example if fbcon has failed registration, then * init will be 0, so we need to make sure our boot parameters * have been copied to the console structure for vgacon_resize * ultimately called by vc_resize. Any subsequent calls to * vgacon_init init will have init set to 0 too. */ c->vc_can_do_color = vga_can_do_color; c->vc_scan_lines = vga_scan_lines; c->vc_font.height = c->vc_cell_height = vga_video_font_height; /* set dimensions manually if init is true since vc_resize() will fail */ if (init) { c->vc_cols = vga_video_num_columns; c->vc_rows = vga_video_num_lines; } else vc_resize(c, vga_video_num_columns, vga_video_num_lines); c->vc_complement_mask = 0x7700; if (vga_512_chars) c->vc_hi_font_mask = 0x0800; p = *c->uni_pagedict_loc; if (c->uni_pagedict_loc != &vgacon_uni_pagedir) { con_free_unimap(c); c->uni_pagedict_loc = &vgacon_uni_pagedir; vgacon_refcount++; } if (!vgacon_uni_pagedir && p) con_set_default_unimap(c); /* Only set the default if the user didn't deliberately override it */ if (global_cursor_default == -1) global_cursor_default = !(vga_si->flags & VIDEO_FLAGS_NOCURSOR); } static void vgacon_deinit(struct vc_data *c) { /* When closing the active console, reset video origin */ if (con_is_visible(c)) { c->vc_visible_origin = vga_vram_base; vga_set_mem_top(c); } if (!--vgacon_refcount) con_free_unimap(c); c->uni_pagedict_loc = &c->uni_pagedict; con_set_default_unimap(c); } static u8 vgacon_build_attr(struct vc_data *c, u8 color, enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic) { u8 attr = color; if (vga_can_do_color) { if (italic) attr = (attr & 0xF0) | c->vc_itcolor; else if (underline) attr = (attr & 0xf0) | c->vc_ulcolor; else if (intensity == VCI_HALF_BRIGHT) attr = (attr & 0xf0) | c->vc_halfcolor; } if (reverse) attr = ((attr) & 0x88) | ((((attr) >> 4) | ((attr) << 4)) & 0x77); if (blink) attr ^= 0x80; if (intensity == VCI_BOLD) attr ^= 0x08; if (!vga_can_do_color) { if (italic) attr = (attr & 0xF8) | 0x02; else if (underline) attr = (attr & 0xf8) | 0x01; else if (intensity == VCI_HALF_BRIGHT) attr = (attr & 0xf0) | 0x08; } return attr; } static void vgacon_invert_region(struct vc_data *c, u16 * p, int count) { const bool col = vga_can_do_color; while (count--) { u16 a = scr_readw(p); if (col) a = ((a) & 0x88ff) | (((a) & 0x7000) >> 4) | (((a) & 0x0700) << 4); else a ^= ((a & 0x0700) == 0x0100) ? 0x7000 : 0x7700; scr_writew(a, p++); } } static void vgacon_set_cursor_size(int from, int to) { unsigned long flags; int curs, cure; if ((from == cursor_size_lastfrom) && (to == cursor_size_lastto)) return; cursor_size_lastfrom = from; cursor_size_lastto = to; raw_spin_lock_irqsave(&vga_lock, flags); if (vga_video_type >= VIDEO_TYPE_VGAC) { outb_p(VGA_CRTC_CURSOR_START, vga_video_port_reg); curs = inb_p(vga_video_port_val); outb_p(VGA_CRTC_CURSOR_END, vga_video_port_reg); cure = inb_p(vga_video_port_val); } else { curs = 0; cure = 0; } curs = (curs & 0xc0) | from; cure = (cure & 0xe0) | to; outb_p(VGA_CRTC_CURSOR_START, vga_video_port_reg); outb_p(curs, vga_video_port_val); outb_p(VGA_CRTC_CURSOR_END, vga_video_port_reg); outb_p(cure, vga_video_port_val); raw_spin_unlock_irqrestore(&vga_lock, flags); } static void vgacon_cursor(struct vc_data *c, bool enable) { unsigned int c_height; if (c->vc_mode != KD_TEXT) return; vgacon_restore_screen(c); c_height = c->vc_cell_height; write_vga(14, (c->vc_pos - vga_vram_base) / 2); if (!enable) { if (vga_video_type >= VIDEO_TYPE_VGAC) vgacon_set_cursor_size(31, 30); else vgacon_set_cursor_size(31, 31); return; } switch (CUR_SIZE(c->vc_cursor_type)) { case CUR_UNDERLINE: vgacon_set_cursor_size(c_height - (c_height < 10 ? 2 : 3), c_height - (c_height < 10 ? 1 : 2)); break; case CUR_TWO_THIRDS: vgacon_set_cursor_size(c_height / 3, c_height - (c_height < 10 ? 1 : 2)); break; case CUR_LOWER_THIRD: vgacon_set_cursor_size(c_height * 2 / 3, c_height - (c_height < 10 ? 1 : 2)); break; case CUR_LOWER_HALF: vgacon_set_cursor_size(c_height / 2, c_height - (c_height < 10 ? 1 : 2)); break; case CUR_NONE: if (vga_video_type >= VIDEO_TYPE_VGAC) vgacon_set_cursor_size(31, 30); else vgacon_set_cursor_size(31, 31); break; default: vgacon_set_cursor_size(1, c_height); break; } } static void vgacon_doresize(struct vc_data *c, unsigned int width, unsigned int height) { unsigned long flags; unsigned int scanlines = height * c->vc_cell_height; u8 scanlines_lo = 0, r7 = 0, vsync_end = 0, mode, max_scan; raw_spin_lock_irqsave(&vga_lock, flags); vgacon_xres = width * VGA_FONTWIDTH; vgacon_yres = height * c->vc_cell_height; if (vga_video_type >= VIDEO_TYPE_VGAC) { outb_p(VGA_CRTC_MAX_SCAN, vga_video_port_reg); max_scan = inb_p(vga_video_port_val); if (max_scan & 0x80) scanlines <<= 1; outb_p(VGA_CRTC_MODE, vga_video_port_reg); mode = inb_p(vga_video_port_val); if (mode & 0x04) scanlines >>= 1; scanlines -= 1; scanlines_lo = scanlines & 0xff; outb_p(VGA_CRTC_OVERFLOW, vga_video_port_reg); r7 = inb_p(vga_video_port_val) & ~0x42; if (scanlines & 0x100) r7 |= 0x02; if (scanlines & 0x200) r7 |= 0x40; /* deprotect registers */ outb_p(VGA_CRTC_V_SYNC_END, vga_video_port_reg); vsync_end = inb_p(vga_video_port_val); outb_p(VGA_CRTC_V_SYNC_END, vga_video_port_reg); outb_p(vsync_end & ~0x80, vga_video_port_val); } outb_p(VGA_CRTC_H_DISP, vga_video_port_reg); outb_p(width - 1, vga_video_port_val); outb_p(VGA_CRTC_OFFSET, vga_video_port_reg); outb_p(width >> 1, vga_video_port_val); if (vga_video_type >= VIDEO_TYPE_VGAC) { outb_p(VGA_CRTC_V_DISP_END, vga_video_port_reg); outb_p(scanlines_lo, vga_video_port_val); outb_p(VGA_CRTC_OVERFLOW, vga_video_port_reg); outb_p(r7,vga_video_port_val); /* reprotect registers */ outb_p(VGA_CRTC_V_SYNC_END, vga_video_port_reg); outb_p(vsync_end, vga_video_port_val); } raw_spin_unlock_irqrestore(&vga_lock, flags); } static bool vgacon_switch(struct vc_data *c) { int x = c->vc_cols * VGA_FONTWIDTH; int y = c->vc_rows * c->vc_cell_height; int rows = vga_si->orig_video_lines * vga_default_font_height/ c->vc_cell_height; /* * We need to save screen size here as it's the only way * we can spot the screen has been resized and we need to * set size of freshly allocated screens ourselves. */ vga_video_num_columns = c->vc_cols; vga_video_num_lines = c->vc_rows; /* We can only copy out the size of the video buffer here, * otherwise we get into VGA BIOS */ if (!vga_is_gfx) { scr_memcpyw((u16 *) c->vc_origin, (u16 *) c->vc_screenbuf, c->vc_screenbuf_size > vga_vram_size ? vga_vram_size : c->vc_screenbuf_size); if ((vgacon_xres != x || vgacon_yres != y) && (!(vga_video_num_columns % 2) && vga_video_num_columns <= vga_si->orig_video_cols && vga_video_num_lines <= rows)) vgacon_doresize(c, c->vc_cols, c->vc_rows); } return false; /* Redrawing not needed */ } static void vga_set_palette(struct vc_data *vc, const unsigned char *table) { int i, j; vga_w(vgastate.vgabase, VGA_PEL_MSK, 0xff); for (i = j = 0; i < 16; i++) { vga_w(vgastate.vgabase, VGA_PEL_IW, table[i]); vga_w(vgastate.vgabase, VGA_PEL_D, vc->vc_palette[j++] >> 2); vga_w(vgastate.vgabase, VGA_PEL_D, vc->vc_palette[j++] >> 2); vga_w(vgastate.vgabase, VGA_PEL_D, vc->vc_palette[j++] >> 2); } } static void vgacon_set_palette(struct vc_data *vc, const unsigned char *table) { if (vga_video_type != VIDEO_TYPE_VGAC || vga_palette_blanked || !con_is_visible(vc)) return; vga_set_palette(vc, table); } /* structure holding original VGA register settings */ static struct { unsigned char SeqCtrlIndex; /* Sequencer Index reg. */ unsigned char CrtCtrlIndex; /* CRT-Contr. Index reg. */ unsigned char CrtMiscIO; /* Miscellaneous register */ unsigned char HorizontalTotal; /* CRT-Controller:00h */ unsigned char HorizDisplayEnd; /* CRT-Controller:01h */ unsigned char StartHorizRetrace; /* CRT-Controller:04h */ unsigned char EndHorizRetrace; /* CRT-Controller:05h */ unsigned char Overflow; /* CRT-Controller:07h */ unsigned char StartVertRetrace; /* CRT-Controller:10h */ unsigned char EndVertRetrace; /* CRT-Controller:11h */ unsigned char ModeControl; /* CRT-Controller:17h */ unsigned char ClockingMode; /* Seq-Controller:01h */ } vga_state; static void vga_vesa_blank(struct vgastate *state, enum vesa_blank_mode mode) { /* save original values of VGA controller registers */ if (!vga_vesa_blanked) { raw_spin_lock_irq(&vga_lock); vga_state.SeqCtrlIndex = vga_r(state->vgabase, VGA_SEQ_I); vga_state.CrtCtrlIndex = inb_p(vga_video_port_reg); vga_state.CrtMiscIO = vga_r(state->vgabase, VGA_MIS_R); raw_spin_unlock_irq(&vga_lock); outb_p(0x00, vga_video_port_reg); /* HorizontalTotal */ vga_state.HorizontalTotal = inb_p(vga_video_port_val); outb_p(0x01, vga_video_port_reg); /* HorizDisplayEnd */ vga_state.HorizDisplayEnd = inb_p(vga_video_port_val); outb_p(0x04, vga_video_port_reg); /* StartHorizRetrace */ vga_state.StartHorizRetrace = inb_p(vga_video_port_val); outb_p(0x05, vga_video_port_reg); /* EndHorizRetrace */ vga_state.EndHorizRetrace = inb_p(vga_video_port_val); outb_p(0x07, vga_video_port_reg); /* Overflow */ vga_state.Overflow = inb_p(vga_video_port_val); outb_p(0x10, vga_video_port_reg); /* StartVertRetrace */ vga_state.StartVertRetrace = inb_p(vga_video_port_val); outb_p(0x11, vga_video_port_reg); /* EndVertRetrace */ vga_state.EndVertRetrace = inb_p(vga_video_port_val); outb_p(0x17, vga_video_port_reg); /* ModeControl */ vga_state.ModeControl = inb_p(vga_video_port_val); vga_state.ClockingMode = vga_rseq(state->vgabase, VGA_SEQ_CLOCK_MODE); } /* assure that video is enabled */ /* "0x20" is VIDEO_ENABLE_bit in register 01 of sequencer */ raw_spin_lock_irq(&vga_lock); vga_wseq(state->vgabase, VGA_SEQ_CLOCK_MODE, vga_state.ClockingMode | 0x20); /* test for vertical retrace in process.... */ if ((vga_state.CrtMiscIO & 0x80) == 0x80) vga_w(state->vgabase, VGA_MIS_W, vga_state.CrtMiscIO & 0xEF); /* * Set <End of vertical retrace> to minimum (0) and * <Start of vertical Retrace> to maximum (incl. overflow) * Result: turn off vertical sync (VSync) pulse. */ if (mode & VESA_VSYNC_SUSPEND) { outb_p(0x10, vga_video_port_reg); /* StartVertRetrace */ outb_p(0xff, vga_video_port_val); /* maximum value */ outb_p(0x11, vga_video_port_reg); /* EndVertRetrace */ outb_p(0x40, vga_video_port_val); /* minimum (bits 0..3) */ outb_p(0x07, vga_video_port_reg); /* Overflow */ outb_p(vga_state.Overflow | 0x84, vga_video_port_val); /* bits 9,10 of vert. retrace */ } if (mode & VESA_HSYNC_SUSPEND) { /* * Set <End of horizontal retrace> to minimum (0) and * <Start of horizontal Retrace> to maximum * Result: turn off horizontal sync (HSync) pulse. */ outb_p(0x04, vga_video_port_reg); /* StartHorizRetrace */ outb_p(0xff, vga_video_port_val); /* maximum */ outb_p(0x05, vga_video_port_reg); /* EndHorizRetrace */ outb_p(0x00, vga_video_port_val); /* minimum (0) */ } /* restore both index registers */ vga_w(state->vgabase, VGA_SEQ_I, vga_state.SeqCtrlIndex); outb_p(vga_state.CrtCtrlIndex, vga_video_port_reg); raw_spin_unlock_irq(&vga_lock); } static void vga_vesa_unblank(struct vgastate *state) { /* restore original values of VGA controller registers */ raw_spin_lock_irq(&vga_lock); vga_w(state->vgabase, VGA_MIS_W, vga_state.CrtMiscIO); outb_p(0x00, vga_video_port_reg); /* HorizontalTotal */ outb_p(vga_state.HorizontalTotal, vga_video_port_val); outb_p(0x01, vga_video_port_reg); /* HorizDisplayEnd */ outb_p(vga_state.HorizDisplayEnd, vga_video_port_val); outb_p(0x04, vga_video_port_reg); /* StartHorizRetrace */ outb_p(vga_state.StartHorizRetrace, vga_video_port_val); outb_p(0x05, vga_video_port_reg); /* EndHorizRetrace */ outb_p(vga_state.EndHorizRetrace, vga_video_port_val); outb_p(0x07, vga_video_port_reg); /* Overflow */ outb_p(vga_state.Overflow, vga_video_port_val); outb_p(0x10, vga_video_port_reg); /* StartVertRetrace */ outb_p(vga_state.StartVertRetrace, vga_video_port_val); outb_p(0x11, vga_video_port_reg); /* EndVertRetrace */ outb_p(vga_state.EndVertRetrace, vga_video_port_val); outb_p(0x17, vga_video_port_reg); /* ModeControl */ outb_p(vga_state.ModeControl, vga_video_port_val); /* ClockingMode */ vga_wseq(state->vgabase, VGA_SEQ_CLOCK_MODE, vga_state.ClockingMode); /* restore index/control registers */ vga_w(state->vgabase, VGA_SEQ_I, vga_state.SeqCtrlIndex); outb_p(vga_state.CrtCtrlIndex, vga_video_port_reg); raw_spin_unlock_irq(&vga_lock); } static void vga_pal_blank(struct vgastate *state) { int i; vga_w(state->vgabase, VGA_PEL_MSK, 0xff); for (i = 0; i < 16; i++) { vga_w(state->vgabase, VGA_PEL_IW, i); vga_w(state->vgabase, VGA_PEL_D, 0); vga_w(state->vgabase, VGA_PEL_D, 0); vga_w(state->vgabase, VGA_PEL_D, 0); } } static bool vgacon_blank(struct vc_data *c, enum vesa_blank_mode blank, bool mode_switch) { switch (blank) { case VESA_NO_BLANKING: /* Unblank */ if (vga_vesa_blanked) { vga_vesa_unblank(&vgastate); vga_vesa_blanked = VESA_NO_BLANKING; } if (vga_palette_blanked) { vga_set_palette(c, color_table); vga_palette_blanked = false; return 0; } vga_is_gfx = false; /* Tell console.c that it has to restore the screen itself */ return 1; case VESA_VSYNC_SUSPEND: /* Normal blanking */ if (!mode_switch && vga_video_type == VIDEO_TYPE_VGAC) { vga_pal_blank(&vgastate); vga_palette_blanked = true; return 0; } vgacon_set_origin(c); scr_memsetw((void *) vga_vram_base, BLANK, c->vc_screenbuf_size); if (mode_switch) vga_is_gfx = true; return 1; default: /* VESA blanking */ if (vga_video_type == VIDEO_TYPE_VGAC) { vga_vesa_blank(&vgastate, blank - 1); vga_vesa_blanked = blank; } return 0; } } /* * PIO_FONT support. * * The font loading code goes back to the codepage package by * Joel Hoffman (joel@wam.umd.edu). (He reports that the original * reference is: "From: p. 307 of _Programmer's Guide to PC & PS/2 * Video Systems_ by Richard Wilton. 1987. Microsoft Press".) * * Change for certain monochrome monitors by Yury Shevchuck * (sizif@botik.yaroslavl.su). */ #define colourmap 0xa0000 /* Pauline Middelink <middelin@polyware.iaf.nl> reports that we should use 0xA0000 for the bwmap as well.. */ #define blackwmap 0xa0000 #define cmapsz 8192 static int vgacon_do_font_op(struct vgastate *state, char *arg, int set, bool ch512) { unsigned short video_port_status = vga_video_port_reg + 6; int font_select = 0x00, beg, i; char *charmap; bool clear_attribs = false; if (vga_video_type != VIDEO_TYPE_EGAM) { charmap = (char *) VGA_MAP_MEM(colourmap, 0); beg = 0x0e; } else { charmap = (char *) VGA_MAP_MEM(blackwmap, 0); beg = 0x0a; } /* * All fonts are loaded in slot 0 (0:1 for 512 ch) */ if (!arg) return -EINVAL; /* Return to default font not supported */ font_select = ch512 ? 0x04 : 0x00; raw_spin_lock_irq(&vga_lock); /* First, the Sequencer */ vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x1); /* CPU writes only to map 2 */ vga_wseq(state->vgabase, VGA_SEQ_PLANE_WRITE, 0x04); /* Sequential addressing */ vga_wseq(state->vgabase, VGA_SEQ_MEMORY_MODE, 0x07); /* Clear synchronous reset */ vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x03); /* Now, the graphics controller, select map 2 */ vga_wgfx(state->vgabase, VGA_GFX_PLANE_READ, 0x02); /* disable odd-even addressing */ vga_wgfx(state->vgabase, VGA_GFX_MODE, 0x00); /* map start at A000:0000 */ vga_wgfx(state->vgabase, VGA_GFX_MISC, 0x00); raw_spin_unlock_irq(&vga_lock); if (arg) { if (set) for (i = 0; i < cmapsz; i++) { vga_writeb(arg[i], charmap + i); cond_resched(); } else for (i = 0; i < cmapsz; i++) { arg[i] = vga_readb(charmap + i); cond_resched(); } /* * In 512-character mode, the character map is not contiguous if * we want to remain EGA compatible -- which we do */ if (ch512) { charmap += 2 * cmapsz; arg += cmapsz; if (set) for (i = 0; i < cmapsz; i++) { vga_writeb(arg[i], charmap + i); cond_resched(); } else for (i = 0; i < cmapsz; i++) { arg[i] = vga_readb(charmap + i); cond_resched(); } } } raw_spin_lock_irq(&vga_lock); /* First, the sequencer, Synchronous reset */ vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x01); /* CPU writes to maps 0 and 1 */ vga_wseq(state->vgabase, VGA_SEQ_PLANE_WRITE, 0x03); /* odd-even addressing */ vga_wseq(state->vgabase, VGA_SEQ_MEMORY_MODE, 0x03); /* Character Map Select */ if (set) vga_wseq(state->vgabase, VGA_SEQ_CHARACTER_MAP, font_select); /* clear synchronous reset */ vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x03); /* Now, the graphics controller, select map 0 for CPU */ vga_wgfx(state->vgabase, VGA_GFX_PLANE_READ, 0x00); /* enable even-odd addressing */ vga_wgfx(state->vgabase, VGA_GFX_MODE, 0x10); /* map starts at b800:0 or b000:0 */ vga_wgfx(state->vgabase, VGA_GFX_MISC, beg); /* if 512 char mode is already enabled don't re-enable it. */ if ((set) && (ch512 != vga_512_chars)) { vga_512_chars = ch512; /* 256-char: enable intensity bit 512-char: disable intensity bit */ inb_p(video_port_status); /* clear address flip-flop */ /* color plane enable register */ vga_wattr(state->vgabase, VGA_ATC_PLANE_ENABLE, ch512 ? 0x07 : 0x0f); /* Wilton (1987) mentions the following; I don't know what it means, but it works, and it appears necessary */ inb_p(video_port_status); vga_wattr(state->vgabase, VGA_AR_ENABLE_DISPLAY, 0); clear_attribs = true; } raw_spin_unlock_irq(&vga_lock); if (clear_attribs) { for (i = 0; i < MAX_NR_CONSOLES; i++) { struct vc_data *c = vc_cons[i].d; if (c && c->vc_sw == &vga_con) { /* force hi font mask to 0, so we always clear the bit on either transition */ c->vc_hi_font_mask = 0x00; clear_buffer_attributes(c); c->vc_hi_font_mask = ch512 ? 0x0800 : 0; } } } return 0; } /* * Adjust the screen to fit a font of a certain height */ static int vgacon_adjust_height(struct vc_data *vc, unsigned fontheight) { unsigned char ovr, vde, fsr; int rows, maxscan, i; rows = vc->vc_scan_lines / fontheight; /* Number of video rows we end up with */ maxscan = rows * fontheight - 1; /* Scan lines to actually display-1 */ /* Reprogram the CRTC for the new font size Note: the attempt to read the overflow register will fail on an EGA, but using 0xff for the previous value appears to be OK for EGA text modes in the range 257-512 scan lines, so I guess we don't need to worry about it. The same applies for the spill bits in the font size and cursor registers; they are write-only on EGA, but it appears that they are all don't care bits on EGA, so I guess it doesn't matter. */ raw_spin_lock_irq(&vga_lock); outb_p(0x07, vga_video_port_reg); /* CRTC overflow register */ ovr = inb_p(vga_video_port_val); outb_p(0x09, vga_video_port_reg); /* Font size register */ fsr = inb_p(vga_video_port_val); raw_spin_unlock_irq(&vga_lock); vde = maxscan & 0xff; /* Vertical display end reg */ ovr = (ovr & 0xbd) + /* Overflow register */ ((maxscan & 0x100) >> 7) + ((maxscan & 0x200) >> 3); fsr = (fsr & 0xe0) + (fontheight - 1); /* Font size register */ raw_spin_lock_irq(&vga_lock); outb_p(0x07, vga_video_port_reg); /* CRTC overflow register */ outb_p(ovr, vga_video_port_val); outb_p(0x09, vga_video_port_reg); /* Font size */ outb_p(fsr, vga_video_port_val); outb_p(0x12, vga_video_port_reg); /* Vertical display limit */ outb_p(vde, vga_video_port_val); raw_spin_unlock_irq(&vga_lock); vga_video_font_height = fontheight; for (i = 0; i < MAX_NR_CONSOLES; i++) { struct vc_data *c = vc_cons[i].d; if (c && c->vc_sw == &vga_con) { if (con_is_visible(c)) { /* void size to cause regs to be rewritten */ cursor_size_lastfrom = 0; cursor_size_lastto = 0; c->vc_sw->con_cursor(c, true); } c->vc_font.height = c->vc_cell_height = fontheight; vc_resize(c, 0, rows); /* Adjust console size */ } } return 0; } static int vgacon_font_set(struct vc_data *c, const struct console_font *font, unsigned int vpitch, unsigned int flags) { unsigned charcount = font->charcount; int rc; if (vga_video_type < VIDEO_TYPE_EGAM) return -EINVAL; if (font->width != VGA_FONTWIDTH || font->height > 32 || vpitch != 32 || (charcount != 256 && charcount != 512)) return -EINVAL; rc = vgacon_do_font_op(&vgastate, font->data, 1, charcount == 512); if (rc) return rc; if (!(flags & KD_FONT_FLAG_DONT_RECALC)) rc = vgacon_adjust_height(c, font->height); return rc; } static int vgacon_font_get(struct vc_data *c, struct console_font *font, unsigned int vpitch) { if (vga_video_type < VIDEO_TYPE_EGAM || vpitch != 32) return -EINVAL; font->width = VGA_FONTWIDTH; font->height = c->vc_font.height; font->charcount = vga_512_chars ? 512 : 256; if (!font->data) return 0; return vgacon_do_font_op(&vgastate, font->data, 0, vga_512_chars); } static int vgacon_resize(struct vc_data *c, unsigned int width, unsigned int height, bool from_user) { if ((width << 1) * height > vga_vram_size) return -EINVAL; if (from_user) { /* * Ho ho! Someone (svgatextmode, eh?) may have reprogrammed * the video mode! Set the new defaults then and go away. */ vga_si->orig_video_cols = width; vga_si->orig_video_lines = height; vga_default_font_height = c->vc_cell_height; return 0; } if (width % 2 || width > vga_si->orig_video_cols || height > (vga_si->orig_video_lines * vga_default_font_height)/ c->vc_cell_height) return -EINVAL; if (con_is_visible(c) && !vga_is_gfx) /* who knows */ vgacon_doresize(c, width, height); return 0; } static bool vgacon_set_origin(struct vc_data *c) { if (vga_is_gfx || /* We don't play origin tricks in graphic modes */ (console_blanked && !vga_palette_blanked)) /* Nor we write to blanked screens */ return false; c->vc_origin = c->vc_visible_origin = vga_vram_base; vga_set_mem_top(c); vga_rolled_over = 0; return true; } static void vgacon_save_screen(struct vc_data *c) { static int vga_bootup_console = 0; if (!vga_bootup_console) { /* This is a gross hack, but here is the only place we can * set bootup console parameters without messing up generic * console initialization routines. */ vga_bootup_console = 1; c->state.x = vga_si->orig_x; c->state.y = vga_si->orig_y; } /* We can't copy in more than the size of the video buffer, * or we'll be copying in VGA BIOS */ if (!vga_is_gfx) scr_memcpyw((u16 *) c->vc_screenbuf, (u16 *) c->vc_origin, c->vc_screenbuf_size > vga_vram_size ? vga_vram_size : c->vc_screenbuf_size); } static bool vgacon_scroll(struct vc_data *c, unsigned int t, unsigned int b, enum con_scroll dir, unsigned int lines) { unsigned long oldo; unsigned int delta; if (t || b != c->vc_rows || vga_is_gfx || c->vc_mode != KD_TEXT) return false; if (!vga_hardscroll_enabled || lines >= c->vc_rows / 2) return false; vgacon_restore_screen(c); oldo = c->vc_origin; delta = lines * c->vc_size_row; if (dir == SM_UP) { if (c->vc_scr_end + delta >= vga_vram_end) { scr_memcpyw((u16 *) vga_vram_base, (u16 *) (oldo + delta), c->vc_screenbuf_size - delta); c->vc_origin = vga_vram_base; vga_rolled_over = oldo - vga_vram_base; } else c->vc_origin += delta; scr_memsetw((u16 *) (c->vc_origin + c->vc_screenbuf_size - delta), c->vc_video_erase_char, delta); } else { if (oldo - delta < vga_vram_base) { scr_memmovew((u16 *) (vga_vram_end - c->vc_screenbuf_size + delta), (u16 *) oldo, c->vc_screenbuf_size - delta); c->vc_origin = vga_vram_end - c->vc_screenbuf_size; vga_rolled_over = 0; } else c->vc_origin -= delta; c->vc_scr_end = c->vc_origin + c->vc_screenbuf_size; scr_memsetw((u16 *) (c->vc_origin), c->vc_video_erase_char, delta); } c->vc_scr_end = c->vc_origin + c->vc_screenbuf_size; c->vc_visible_origin = c->vc_origin; vga_set_mem_top(c); c->vc_pos = (c->vc_pos - oldo) + c->vc_origin; return true; } /* * The console `switch' structure for the VGA based console */ static void vgacon_clear(struct vc_data *vc, unsigned int sy, unsigned int sx, unsigned int width) { } static void vgacon_putcs(struct vc_data *vc, const u16 *s, unsigned int count, unsigned int ypos, unsigned int xpos) { } const struct consw vga_con = { .owner = THIS_MODULE, .con_startup = vgacon_startup, .con_init = vgacon_init, .con_deinit = vgacon_deinit, .con_clear = vgacon_clear, .con_putcs = vgacon_putcs, .con_cursor = vgacon_cursor, .con_scroll = vgacon_scroll, .con_switch = vgacon_switch, .con_blank = vgacon_blank, .con_font_set = vgacon_font_set, .con_font_get = vgacon_font_get, .con_resize = vgacon_resize, .con_set_palette = vgacon_set_palette, .con_scrolldelta = vgacon_scrolldelta, .con_set_origin = vgacon_set_origin, .con_save_screen = vgacon_save_screen, .con_build_attr = vgacon_build_attr, .con_invert_region = vgacon_invert_region, }; EXPORT_SYMBOL(vga_con); void vgacon_register_screen(struct screen_info *si) { if (!si || vga_si) return; conswitchp = &vga_con; vga_si = si; } MODULE_DESCRIPTION("VGA based console driver"); MODULE_LICENSE("GPL");
8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2014 Nicira, Inc. */ #include <linux/etherdevice.h> #include <linux/if.h> #include <linux/if_vlan.h> #include <linux/jhash.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/rtnetlink.h> #include <linux/compat.h> #include <net/net_namespace.h> #include <linux/module.h> #include "datapath.h" #include "vport.h" #include "vport-internal_dev.h" static LIST_HEAD(vport_ops_list); /* Protected by RCU read lock for reading, ovs_mutex for writing. */ static struct hlist_head *dev_table; #define VPORT_HASH_BUCKETS 1024 /** * ovs_vport_init - initialize vport subsystem * * Called at module load time to initialize the vport subsystem. */ int ovs_vport_init(void) { dev_table = kcalloc(VPORT_HASH_BUCKETS, sizeof(struct hlist_head), GFP_KERNEL); if (!dev_table) return -ENOMEM; return 0; } /** * ovs_vport_exit - shutdown vport subsystem * * Called at module exit time to shutdown the vport subsystem. */ void ovs_vport_exit(void) { kfree(dev_table); } static struct hlist_head *hash_bucket(const struct net *net, const char *name) { unsigned int hash = jhash(name, strlen(name), (unsigned long) net); return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; } int __ovs_vport_ops_register(struct vport_ops *ops) { int err = -EEXIST; struct vport_ops *o; ovs_lock(); list_for_each_entry(o, &vport_ops_list, list) if (ops->type == o->type) goto errout; list_add_tail(&ops->list, &vport_ops_list); err = 0; errout: ovs_unlock(); return err; } EXPORT_SYMBOL_GPL(__ovs_vport_ops_register); void ovs_vport_ops_unregister(struct vport_ops *ops) { ovs_lock(); list_del(&ops->list); ovs_unlock(); } EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister); /** * ovs_vport_locate - find a port that has already been created * * @net: network namespace * @name: name of port to find * * Must be called with ovs or RCU read lock. */ struct vport *ovs_vport_locate(const struct net *net, const char *name) { struct hlist_head *bucket = hash_bucket(net, name); struct vport *vport; hlist_for_each_entry_rcu(vport, bucket, hash_node, lockdep_ovsl_is_held()) if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; return NULL; } /** * ovs_vport_alloc - allocate and initialize new vport * * @priv_size: Size of private data area to allocate. * @ops: vport device ops * @parms: information about new vport. * * Allocate and initialize a new vport defined by @ops. The vport will contain * a private data area of size @priv_size that can be accessed using * vport_priv(). Some parameters of the vport will be initialized from @parms. * @vports that are no longer needed should be released with * vport_free(). */ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, const struct vport_parms *parms) { struct vport *vport; size_t alloc_size; int err; alloc_size = sizeof(struct vport); if (priv_size) { alloc_size = ALIGN(alloc_size, VPORT_ALIGN); alloc_size += priv_size; } vport = kzalloc(alloc_size, GFP_KERNEL); if (!vport) return ERR_PTR(-ENOMEM); vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); if (!vport->upcall_stats) { err = -ENOMEM; goto err_kfree_vport; } vport->dp = parms->dp; vport->port_no = parms->port_no; vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) { err = -EINVAL; goto err_free_percpu; } return vport; err_free_percpu: free_percpu(vport->upcall_stats); err_kfree_vport: kfree(vport); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(ovs_vport_alloc); /** * ovs_vport_free - uninitialize and free vport * * @vport: vport to free * * Frees a vport allocated with vport_alloc() when it is no longer needed. * * The caller must ensure that an RCU grace period has passed since the last * time @vport was in a datapath. */ void ovs_vport_free(struct vport *vport) { /* vport is freed from RCU callback or error path, Therefore * it is safe to use raw dereference. */ kfree(rcu_dereference_raw(vport->upcall_portids)); free_percpu(vport->upcall_stats); kfree(vport); } EXPORT_SYMBOL_GPL(ovs_vport_free); static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms) { struct vport_ops *ops; list_for_each_entry(ops, &vport_ops_list, list) if (ops->type == parms->type) return ops; return NULL; } /** * ovs_vport_add - add vport device (for kernel callers) * * @parms: Information about new vport. * * Creates a new vport with the specified configuration (which is dependent on * device type). ovs_mutex must be held. */ struct vport *ovs_vport_add(const struct vport_parms *parms) { struct vport_ops *ops; struct vport *vport; ops = ovs_vport_lookup(parms); if (ops) { struct hlist_head *bucket; if (!try_module_get(ops->owner)) return ERR_PTR(-EAFNOSUPPORT); vport = ops->create(parms); if (IS_ERR(vport)) { module_put(ops->owner); return vport; } bucket = hash_bucket(ovs_dp_get_net(vport->dp), ovs_vport_name(vport)); hlist_add_head_rcu(&vport->hash_node, bucket); return vport; } /* Unlock to attempt module load and return -EAGAIN if load * was successful as we need to restart the port addition * workflow. */ ovs_unlock(); request_module("vport-type-%d", parms->type); ovs_lock(); if (!ovs_vport_lookup(parms)) return ERR_PTR(-EAFNOSUPPORT); else return ERR_PTR(-EAGAIN); } /** * ovs_vport_set_options - modify existing vport device (for kernel callers) * * @vport: vport to modify. * @options: New configuration. * * Modifies an existing device with the specified configuration (which is * dependent on device type). ovs_mutex must be held. */ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) { if (!vport->ops->set_options) return -EOPNOTSUPP; return vport->ops->set_options(vport, options); } /** * ovs_vport_del - delete existing vport device * * @vport: vport to delete. * * Detaches @vport from its datapath and destroys it. ovs_mutex must * be held. */ void ovs_vport_del(struct vport *vport) { hlist_del_rcu(&vport->hash_node); module_put(vport->ops->owner); vport->ops->destroy(vport); } /** * ovs_vport_get_stats - retrieve device stats * * @vport: vport from which to retrieve the stats * @stats: location to store stats * * Retrieves transmit, receive, and error stats for the given device. * * Must be called with ovs_mutex or rcu_read_lock. */ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) { const struct rtnl_link_stats64 *dev_stats; struct rtnl_link_stats64 temp; dev_stats = dev_get_stats(vport->dev, &temp); stats->rx_errors = dev_stats->rx_errors; stats->tx_errors = dev_stats->tx_errors; stats->tx_dropped = dev_stats->tx_dropped; stats->rx_dropped = dev_stats->rx_dropped; stats->rx_bytes = dev_stats->rx_bytes; stats->rx_packets = dev_stats->rx_packets; stats->tx_bytes = dev_stats->tx_bytes; stats->tx_packets = dev_stats->tx_packets; } /** * ovs_vport_get_upcall_stats - retrieve upcall stats * * @vport: vport from which to retrieve the stats. * @skb: sk_buff where upcall stats should be appended. * * Retrieves upcall stats for the given device. * * Must be called with ovs_mutex or rcu_read_lock. */ int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb) { struct nlattr *nla; int i; __u64 tx_success = 0; __u64 tx_fail = 0; for_each_possible_cpu(i) { const struct vport_upcall_stats_percpu *stats; unsigned int start; stats = per_cpu_ptr(vport->upcall_stats, i); do { start = u64_stats_fetch_begin(&stats->syncp); tx_success += u64_stats_read(&stats->n_success); tx_fail += u64_stats_read(&stats->n_fail); } while (u64_stats_fetch_retry(&stats->syncp, start)); } nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_UPCALL_STATS); if (!nla) return -EMSGSIZE; if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_SUCCESS, tx_success, OVS_VPORT_ATTR_PAD)) { nla_nest_cancel(skb, nla); return -EMSGSIZE; } if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_FAIL, tx_fail, OVS_VPORT_ATTR_PAD)) { nla_nest_cancel(skb, nla); return -EMSGSIZE; } nla_nest_end(skb, nla); return 0; } /** * ovs_vport_get_options - retrieve device options * * @vport: vport from which to retrieve the options. * @skb: sk_buff where options should be appended. * * Retrieves the configuration of the given device, appending an * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested * vport-specific attributes to @skb. * * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another * negative error code if a real error occurred. If an error occurs, @skb is * left unmodified. * * Must be called with ovs_mutex or rcu_read_lock. */ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) { struct nlattr *nla; int err; if (!vport->ops->get_options) return 0; nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_OPTIONS); if (!nla) return -EMSGSIZE; err = vport->ops->get_options(vport, skb); if (err) { nla_nest_cancel(skb, nla); return err; } nla_nest_end(skb, nla); return 0; } /** * ovs_vport_set_upcall_portids - set upcall portids of @vport. * * @vport: vport to modify. * @ids: new configuration, an array of port ids. * * Sets the vport's upcall_portids to @ids. * * Returns 0 if successful, -EINVAL if @ids is zero length or cannot be parsed * as an array of U32. * * Must be called with ovs_mutex. */ int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids) { struct vport_portids *old, *vport_portids; if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) return -EINVAL; old = ovsl_dereference(vport->upcall_portids); vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), GFP_KERNEL); if (!vport_portids) return -ENOMEM; vport_portids->n_ids = nla_len(ids) / sizeof(u32); vport_portids->rn_ids = reciprocal_value(vport_portids->n_ids); nla_memcpy(vport_portids->ids, ids, nla_len(ids)); rcu_assign_pointer(vport->upcall_portids, vport_portids); if (old) kfree_rcu(old, rcu); return 0; } /** * ovs_vport_get_upcall_portids - get the upcall_portids of @vport. * * @vport: vport from which to retrieve the portids. * @skb: sk_buff where portids should be appended. * * Retrieves the configuration of the given vport, appending the * %OVS_VPORT_ATTR_UPCALL_PID attribute which is the array of upcall * portids to @skb. * * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room. * If an error occurs, @skb is left unmodified. Must be called with * ovs_mutex or rcu_read_lock. */ int ovs_vport_get_upcall_portids(const struct vport *vport, struct sk_buff *skb) { struct vport_portids *ids; ids = rcu_dereference_ovsl(vport->upcall_portids); if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS) return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->n_ids * sizeof(u32), (void *)ids->ids); else return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]); } /** * ovs_vport_find_upcall_portid - find the upcall portid to send upcall. * * @vport: vport from which the missed packet is received. * @skb: skb that the missed packet was received. * * Uses the skb_get_hash() to select the upcall portid to send the * upcall. * * Returns the portid of the target socket. Must be called with rcu_read_lock. */ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) { struct vport_portids *ids; u32 ids_index; u32 hash; ids = rcu_dereference(vport->upcall_portids); /* If there is only one portid, select it in the fast-path. */ if (ids->n_ids == 1) return ids->ids[0]; hash = skb_get_hash(skb); ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); return ids->ids[ids_index]; } /** * ovs_vport_receive - pass up received packet to the datapath for processing * * @vport: vport that received the packet * @skb: skb that was received * @tun_info: tunnel (if any) that carried packet * * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. */ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, const struct ip_tunnel_info *tun_info) { struct sw_flow_key key; int error; OVS_CB(skb)->input_vport = vport; OVS_CB(skb)->mru = 0; OVS_CB(skb)->cutlen = 0; OVS_CB(skb)->probability = 0; OVS_CB(skb)->upcall_pid = 0; if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { u32 mark; mark = skb->mark; skb_scrub_packet(skb, true); skb->mark = mark; tun_info = NULL; } /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); return error; } ovs_dp_process_packet(skb, &key); return 0; } static int packet_length(const struct sk_buff *skb, struct net_device *dev) { int length = skb->len - dev->hard_header_len; if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol)) length -= VLAN_HLEN; /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none * account for 802.1ad. e.g. is_skb_forwardable(). */ return length > 0 ? length : 0; } void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) { int mtu = vport->dev->mtu; switch (vport->dev->type) { case ARPHRD_NONE: if (mac_proto == MAC_PROTO_ETHERNET) { skb_reset_network_header(skb); skb_reset_mac_len(skb); skb->protocol = htons(ETH_P_TEB); } else if (mac_proto != MAC_PROTO_NONE) { WARN_ON_ONCE(1); goto drop; } break; case ARPHRD_ETHER: if (mac_proto != MAC_PROTO_ETHERNET) goto drop; break; default: goto drop; } if (unlikely(packet_length(skb, vport->dev) > mtu && !skb_is_gso(skb))) { vport->dev->stats.tx_errors++; if (vport->dev->flags & IFF_UP) net_warn_ratelimited("%s: dropped over-mtu packet: " "%d > %d\n", vport->dev->name, packet_length(skb, vport->dev), mtu); goto drop; } skb->dev = vport->dev; skb_clear_tstamp(skb); vport->ops->send(skb); return; drop: kfree_skb(skb); }
68 68 3 3 3 3 3 3 2 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 4 3 1 9 7 2 1 2 2 2 2 5 3 2 1 2 1 1 1 1 4 3 1 1 1 2 15 11 5 1 1 1 1 17 10 6 3 2 1 2 1 2 1 1 2 2 2 2 2 2 4 2 5 2 2 2 1 85 153 6 2 10 85 144 1 18 8 8 7 41 3 1 1 1 36 35 1 1 35 2 1 1 1 2 1 1 12 6 3 1 2 9 8 1 1 13 7 2 2 1 1 1 3 2 1 5 1 1 2 1 12 1 3 1 5 2 11 3 2 4 2 6 5 1 8 6 2 5 4 1 7 7 57 57 68 61 61 57 57 7 7 68 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 /* * Copyright (c) 2017 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <linux/module.h> #include <linux/pid.h> #include <linux/pid_namespace.h> #include <linux/mutex.h> #include <net/netlink.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" #include "cma_priv.h" #include "restrack.h" #include "uverbs.h" /* * This determines whether a non-privileged user is allowed to specify a * controlled QKEY or not, when true non-privileged user is allowed to specify * a controlled QKEY. */ static bool privileged_qkey; typedef int (*res_fill_func_t)(struct sk_buff*, bool, struct rdma_restrack_entry*, uint32_t); /* * Sort array elements by the netlink attribute name */ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE }, [RDMA_NLDEV_ATTR_DEV_DIM] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, .len = IB_DEVICE_NAME_MAX }, [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ }, [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ }, [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CQ] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CTX] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CTX_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_PD] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_RAW] = { .type = NLA_BINARY }, [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = { .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_SUBTYPE] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_SRQ] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_SRQN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_SRQ_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_MIN_RANGE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_MAX_RANGE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_COUNTER] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_STAT_COUNTER_ID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTERS] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 }, [RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 }, [RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DRIVER_DETAILS] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING }, [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, enum rdma_nldev_print_type print_type) { if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, name)) return -EMSGSIZE; if (print_type != RDMA_NLDEV_PRINT_TYPE_UNSPEC && nla_put_u8(msg, RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE, print_type)) return -EMSGSIZE; return 0; } static int _rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, enum rdma_nldev_print_type print_type, u32 value) { if (put_driver_name_print_type(msg, name, print_type)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DRIVER_U32, value)) return -EMSGSIZE; return 0; } static int _rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, enum rdma_nldev_print_type print_type, u64 value) { if (put_driver_name_print_type(msg, name, print_type)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_DRIVER_U64, value, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; return 0; } int rdma_nl_put_driver_string(struct sk_buff *msg, const char *name, const char *str) { if (put_driver_name_print_type(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, str)) return -EMSGSIZE; return 0; } EXPORT_SYMBOL(rdma_nl_put_driver_string); int rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, u32 value) { return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC, value); } EXPORT_SYMBOL(rdma_nl_put_driver_u32); int rdma_nl_put_driver_u32_hex(struct sk_buff *msg, const char *name, u32 value) { return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX, value); } EXPORT_SYMBOL(rdma_nl_put_driver_u32_hex); int rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, u64 value) { return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC, value); } EXPORT_SYMBOL(rdma_nl_put_driver_u64); int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name, u64 value) { return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX, value); } EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex); bool rdma_nl_get_privileged_qkey(void) { return privileged_qkey; } EXPORT_SYMBOL(rdma_nl_get_privileged_qkey); static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, dev_name(&device->dev))) return -EMSGSIZE; return 0; } static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) { char fw[IB_FW_VERSION_NAME_MAX]; int ret = 0; u32 port; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) return -EMSGSIZE; BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64)); if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, device->attrs.device_cap_flags, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; ib_get_device_fw_str(device, fw); /* Device without FW has strlen(fw) = 0 */ if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID, be64_to_cpu(device->node_guid), RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, be64_to_cpu(device->attrs.sys_image_guid), RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim)) return -EMSGSIZE; if (device->type && nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_TYPE, device->type)) return -EMSGSIZE; if (device->parent && nla_put_string(msg, RDMA_NLDEV_ATTR_PARENT_NAME, dev_name(&device->parent->dev))) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE, device->name_assign_type)) return -EMSGSIZE; /* * Link type is determined on first port and mlx4 device * which can potentially have two different link type for the same * IB device is considered as better to be avoided in the future, */ port = rdma_start_port(device); if (rdma_cap_opa_mad(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "opa"); else if (rdma_protocol_ib(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "ib"); else if (rdma_protocol_iwarp(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "iw"); else if (rdma_protocol_roce(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "roce"); else if (rdma_protocol_usnic(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "usnic"); return ret; } static int fill_port_info(struct sk_buff *msg, struct ib_device *device, u32 port, const struct net *net) { struct net_device *netdev = NULL; struct ib_port_attr attr; int ret; u64 cap_flags = 0; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) return -EMSGSIZE; ret = ib_query_port(device, port, &attr); if (ret) return ret; if (rdma_protocol_ib(device, port)) { BUILD_BUG_ON((sizeof(attr.port_cap_flags) + sizeof(attr.port_cap_flags2)) > sizeof(u64)); cap_flags = attr.port_cap_flags | ((u64)attr.port_cap_flags2 << 32); if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, cap_flags, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, attr.subnet_prefix, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc)) return -EMSGSIZE; } if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) return -EMSGSIZE; netdev = ib_device_get_netdev(device, port); if (netdev && net_eq(dev_net(netdev), net)) { ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); if (ret) goto out; ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); } out: dev_put(netdev); return ret; } static int fill_res_info_entry(struct sk_buff *msg, const char *name, u64 curr) { struct nlattr *entry_attr; entry_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); if (!entry_attr) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, name)) goto err; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr, RDMA_NLDEV_ATTR_PAD)) goto err; nla_nest_end(msg, entry_attr); return 0; err: nla_nest_cancel(msg, entry_attr); return -EMSGSIZE; } static int fill_res_info(struct sk_buff *msg, struct ib_device *device, bool show_details) { static const char * const names[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_PD] = "pd", [RDMA_RESTRACK_CQ] = "cq", [RDMA_RESTRACK_QP] = "qp", [RDMA_RESTRACK_CM_ID] = "cm_id", [RDMA_RESTRACK_MR] = "mr", [RDMA_RESTRACK_CTX] = "ctx", [RDMA_RESTRACK_SRQ] = "srq", }; struct nlattr *table_attr; int ret, i, curr; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); if (!table_attr) return -EMSGSIZE; for (i = 0; i < RDMA_RESTRACK_MAX; i++) { if (!names[i]) continue; curr = rdma_restrack_count(device, i, show_details); ret = fill_res_info_entry(msg, names[i], curr); if (ret) goto err; } nla_nest_end(msg, table_attr); return 0; err: nla_nest_cancel(msg, table_attr); return ret; } static int fill_res_name_pid(struct sk_buff *msg, struct rdma_restrack_entry *res) { int err = 0; /* * For user resources, user is should read /proc/PID/comm to get the * name of the task file. */ if (rdma_is_kernel_res(res)) { err = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, res->kern_name); } else { pid_t pid; pid = task_pid_vnr(res->task); /* * Task is dead and in zombie state. * There is no need to print PID anymore. */ if (pid) /* * This part is racy, task can be killed and PID will * be zero right here but it is ok, next query won't * return PID. We don't promise real-time reflection * of SW objects. */ err = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, pid); } return err ? -EMSGSIZE : 0; } static int fill_res_qp_entry_query(struct sk_buff *msg, struct rdma_restrack_entry *res, struct ib_device *dev, struct ib_qp *qp) { struct ib_qp_init_attr qp_init_attr; struct ib_qp_attr qp_attr; int ret; ret = ib_query_qp(qp, &qp_attr, 0, &qp_init_attr); if (ret) return ret; if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN, qp_attr.dest_qp_num)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQ_PSN, qp_attr.rq_psn)) goto err; } if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SQ_PSN, qp_attr.sq_psn)) goto err; if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC || qp->qp_type == IB_QPT_XRC_INI || qp->qp_type == IB_QPT_XRC_TGT) { if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE, qp_attr.path_mig_state)) goto err; } if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, qp->qp_type)) goto err; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) goto err; if (dev->ops.fill_res_qp_entry) return dev->ops.fill_res_qp_entry(msg, qp); return 0; err: return -EMSGSIZE; } static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_qp *qp = container_of(res, struct ib_qp, res); struct ib_device *dev = qp->device; int ret; if (port && port != qp->port) return -EAGAIN; /* In create_qp() port is not set yet */ if (qp->port && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp->port)) return -EMSGSIZE; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num); if (ret) return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) return -EMSGSIZE; ret = fill_res_name_pid(msg, res); if (ret) return -EMSGSIZE; return fill_res_qp_entry_query(msg, res, dev, qp); } static int fill_res_qp_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_qp *qp = container_of(res, struct ib_qp, res); struct ib_device *dev = qp->device; if (port && port != qp->port) return -EAGAIN; if (!dev->ops.fill_res_qp_entry_raw) return -EINVAL; return dev->ops.fill_res_qp_entry_raw(msg, qp); } static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); struct ib_device *dev = id_priv->id.device; struct rdma_cm_id *cm_id = &id_priv->id; if (port && port != cm_id->port_num) return -EAGAIN; if (cm_id->port_num && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) goto err; if (id_priv->qp_num) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, id_priv->qp_num)) goto err; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, cm_id->qp_type)) goto err; } if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PS, cm_id->ps)) goto err; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, id_priv->state)) goto err; if (cm_id->route.addr.src_addr.ss_family && nla_put(msg, RDMA_NLDEV_ATTR_RES_SRC_ADDR, sizeof(cm_id->route.addr.src_addr), &cm_id->route.addr.src_addr)) goto err; if (cm_id->route.addr.dst_addr.ss_family && nla_put(msg, RDMA_NLDEV_ATTR_RES_DST_ADDR, sizeof(cm_id->route.addr.dst_addr), &cm_id->route.addr.dst_addr)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id)) goto err; if (fill_res_name_pid(msg, res)) goto err; if (dev->ops.fill_res_cm_id_entry) return dev->ops.fill_res_cm_id_entry(msg, cm_id); return 0; err: return -EMSGSIZE; } static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_cq *cq = container_of(res, struct ib_cq, res); struct ib_device *dev = cq->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, atomic_read(&cq->usecnt), RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; /* Poll context is only valid for kernel CQs */ if (rdma_is_kernel_res(res) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL))) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, cq->uobject->uevent.uobject.context->res.id)) return -EMSGSIZE; if (fill_res_name_pid(msg, res)) return -EMSGSIZE; return (dev->ops.fill_res_cq_entry) ? dev->ops.fill_res_cq_entry(msg, cq) : 0; } static int fill_res_cq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_cq *cq = container_of(res, struct ib_cq, res); struct ib_device *dev = cq->device; if (!dev->ops.fill_res_cq_entry_raw) return -EINVAL; return dev->ops.fill_res_cq_entry_raw(msg, cq); } static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); struct ib_device *dev = mr->pd->device; if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) return -EMSGSIZE; } if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id)) return -EMSGSIZE; if (fill_res_name_pid(msg, res)) return -EMSGSIZE; return (dev->ops.fill_res_mr_entry) ? dev->ops.fill_res_mr_entry(msg, mr) : 0; } static int fill_res_mr_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); struct ib_device *dev = mr->pd->device; if (!dev->ops.fill_res_mr_entry_raw) return -EINVAL; return dev->ops.fill_res_mr_entry_raw(msg, mr); } static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_pd *pd = container_of(res, struct ib_pd, res); if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, pd->local_dma_lkey)) goto err; if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, pd->unsafe_global_rkey)) goto err; } if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id)) goto err; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, pd->uobject->context->res.id)) goto err; return fill_res_name_pid(msg, res); err: return -EMSGSIZE; } static int fill_res_ctx_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_ucontext *ctx = container_of(res, struct ib_ucontext, res); if (rdma_is_kernel_res(res)) return 0; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, ctx->res.id)) return -EMSGSIZE; return fill_res_name_pid(msg, res); } static int fill_res_range_qp_entry(struct sk_buff *msg, uint32_t min_range, uint32_t max_range) { struct nlattr *entry_attr; if (!min_range) return 0; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); if (!entry_attr) return -EMSGSIZE; if (min_range == max_range) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, min_range)) goto err; } else { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MIN_RANGE, min_range)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MAX_RANGE, max_range)) goto err; } nla_nest_end(msg, entry_attr); return 0; err: nla_nest_cancel(msg, entry_attr); return -EMSGSIZE; } static int fill_res_srq_qps(struct sk_buff *msg, struct ib_srq *srq) { uint32_t min_range = 0, prev = 0; struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; struct nlattr *table_attr; struct ib_qp *qp = NULL; unsigned long id = 0; table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); if (!table_attr) return -EMSGSIZE; rt = &srq->device->res[RDMA_RESTRACK_QP]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { if (!rdma_restrack_get(res)) continue; qp = container_of(res, struct ib_qp, res); if (!qp->srq || (qp->srq->res.id != srq->res.id)) { rdma_restrack_put(res); continue; } if (qp->qp_num < prev) /* qp_num should be ascending */ goto err_loop; if (min_range == 0) { min_range = qp->qp_num; } else if (qp->qp_num > (prev + 1)) { if (fill_res_range_qp_entry(msg, min_range, prev)) goto err_loop; min_range = qp->qp_num; } prev = qp->qp_num; rdma_restrack_put(res); } xa_unlock(&rt->xa); if (fill_res_range_qp_entry(msg, min_range, prev)) goto err; nla_nest_end(msg, table_attr); return 0; err_loop: rdma_restrack_put(res); xa_unlock(&rt->xa); err: nla_nest_cancel(msg, table_attr); return -EMSGSIZE; } static int fill_res_srq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_srq *srq = container_of(res, struct ib_srq, res); struct ib_device *dev = srq->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SRQN, srq->res.id)) goto err; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, srq->srq_type)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, srq->pd->res.id)) goto err; if (ib_srq_has_cq(srq->srq_type)) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, srq->ext.cq->res.id)) goto err; } if (fill_res_srq_qps(msg, srq)) goto err; if (fill_res_name_pid(msg, res)) goto err; if (dev->ops.fill_res_srq_entry) return dev->ops.fill_res_srq_entry(msg, srq); return 0; err: return -EMSGSIZE; } static int fill_res_srq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_srq *srq = container_of(res, struct ib_srq, res); struct ib_device *dev = srq->device; if (!dev->ops.fill_res_srq_entry_raw) return -EINVAL; return dev->ops.fill_res_srq_entry_raw(msg, srq); } static int fill_stat_counter_mode(struct sk_buff *msg, struct rdma_counter *counter) { struct rdma_counter_mode *m = &counter->mode; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode)) return -EMSGSIZE; if (m->mode == RDMA_COUNTER_MODE_AUTO) { if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type)) return -EMSGSIZE; if ((m->mask & RDMA_COUNTER_MASK_PID) && fill_res_name_pid(msg, &counter->res)) return -EMSGSIZE; } return 0; } static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn) { struct nlattr *entry_attr; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); if (!entry_attr) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) goto err; nla_nest_end(msg, entry_attr); return 0; err: nla_nest_cancel(msg, entry_attr); return -EMSGSIZE; } static int fill_stat_counter_qps(struct sk_buff *msg, struct rdma_counter *counter) { struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; struct nlattr *table_attr; struct ib_qp *qp = NULL; unsigned long id = 0; int ret = 0; table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); if (!table_attr) return -EMSGSIZE; rt = &counter->device->res[RDMA_RESTRACK_QP]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { qp = container_of(res, struct ib_qp, res); if (!qp->counter || (qp->counter->id != counter->id)) continue; ret = fill_stat_counter_qp_entry(msg, qp->qp_num); if (ret) goto err; } xa_unlock(&rt->xa); nla_nest_end(msg, table_attr); return 0; err: xa_unlock(&rt->xa); nla_nest_cancel(msg, table_attr); return ret; } int rdma_nl_stat_hwcounter_entry(struct sk_buff *msg, const char *name, u64 value) { struct nlattr *entry_attr; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); if (!entry_attr) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, name)) goto err; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, value, RDMA_NLDEV_ATTR_PAD)) goto err; nla_nest_end(msg, entry_attr); return 0; err: nla_nest_cancel(msg, entry_attr); return -EMSGSIZE; } EXPORT_SYMBOL(rdma_nl_stat_hwcounter_entry); static int fill_stat_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); struct ib_device *dev = mr->pd->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) goto err; if (dev->ops.fill_stat_mr_entry) return dev->ops.fill_stat_mr_entry(msg, mr); return 0; err: return -EMSGSIZE; } static int fill_stat_counter_hwcounters(struct sk_buff *msg, struct rdma_counter *counter) { struct rdma_hw_stats *st = counter->stats; struct nlattr *table_attr; int i; table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); if (!table_attr) return -EMSGSIZE; mutex_lock(&st->lock); for (i = 0; i < st->num_counters; i++) { if (test_bit(i, st->is_disabled)) continue; if (rdma_nl_stat_hwcounter_entry(msg, st->descs[i].name, st->value[i])) goto err; } mutex_unlock(&st->lock); nla_nest_end(msg, table_attr); return 0; err: mutex_unlock(&st->lock); nla_nest_cancel(msg, table_attr); return -EMSGSIZE; } static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct rdma_counter *counter = container_of(res, struct rdma_counter, res); if (port && port != counter->port) return -EAGAIN; /* Dump it even query failed */ rdma_counter_query_stats(counter); if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) || fill_stat_counter_mode(msg, counter) || fill_stat_counter_qps(msg, counter) || fill_stat_counter_hwcounters(msg, counter)) return -EMSGSIZE; return 0; } static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; u32 index; int err; err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); if (!nlh) { err = -EMSGSIZE; goto err_free; } err = fill_dev_info(msg, device); if (err) goto err_free; nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); err: ib_device_put(device); return err; } static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; u32 index; int err; err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { char name[IB_DEVICE_NAME_MAX] = {}; nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], IB_DEVICE_NAME_MAX); if (strlen(name) == 0) { err = -EINVAL; goto done; } err = ib_device_rename(device, name); goto done; } if (tb[RDMA_NLDEV_NET_NS_FD]) { u32 ns_fd; ns_fd = nla_get_u32(tb[RDMA_NLDEV_NET_NS_FD]); err = ib_device_set_netns_put(skb, device, ns_fd); goto put_done; } if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) { u8 use_dim; use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]); err = ib_device_set_dim(device, use_dim); goto done; } done: ib_device_put(device); put_done: return err; } static int _nldev_get_dumpit(struct ib_device *device, struct sk_buff *skb, struct netlink_callback *cb, unsigned int idx) { int start = cb->args[0]; struct nlmsghdr *nlh; if (idx < start) return 0; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, NLM_F_MULTI); if (!nlh || fill_dev_info(skb, device)) { nlmsg_cancel(skb, nlh); goto out; } nlmsg_end(skb, nlh); idx++; out: cb->args[0] = idx; return skb->len; } static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { /* * There is no need to take lock, because * we are relying on ib_core's locking. */ return ib_enum_all_devs(_nldev_get_dumpit, skb, cb); } static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; u32 index; u32 port; int err; err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { err = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); if (!nlh) { err = -EMSGSIZE; goto err_free; } err = fill_port_info(msg, device, port, sock_net(skb->sk)); if (err) goto err_free; nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); err: ib_device_put(device); return err; } static int nldev_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; int start = cb->args[0]; struct nlmsghdr *nlh; u32 idx = 0; u32 ifindex; int err; unsigned int p; err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), ifindex); if (!device) return -EINVAL; rdma_for_each_port (device, p) { /* * The dumpit function returns all information from specific * index. This specific index is taken from the netlink * messages request sent by user and it is available * in cb->args[0]. * * Usually, the user doesn't fill this field and it causes * to return everything. * */ if (idx < start) { idx++; continue; } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET), 0, NLM_F_MULTI); if (!nlh || fill_port_info(skb, device, p, sock_net(skb->sk))) { nlmsg_cancel(skb, nlh); goto out; } idx++; nlmsg_end(skb, nlh); } out: ib_device_put(device); cb->args[0] = idx; return skb->len; } static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; bool show_details = false; struct ib_device *device; struct sk_buff *msg; u32 index; int ret; ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, 0); if (!nlh) { ret = -EMSGSIZE; goto err_free; } ret = fill_res_info(msg, device, show_details); if (ret) goto err_free; nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); err: ib_device_put(device); return ret; } static int _nldev_res_get_dumpit(struct ib_device *device, struct sk_buff *skb, struct netlink_callback *cb, unsigned int idx) { int start = cb->args[0]; struct nlmsghdr *nlh; if (idx < start) return 0; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, NLM_F_MULTI); if (!nlh || fill_res_info(skb, device, false)) { nlmsg_cancel(skb, nlh); goto out; } nlmsg_end(skb, nlh); idx++; out: cb->args[0] = idx; return skb->len; } static int nldev_res_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb); } struct nldev_fill_res_entry { enum rdma_nldev_attr nldev_attr; u8 flags; u32 entry; u32 id; }; enum nldev_res_flags { NLDEV_PER_DEV = 1 << 0, }; static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY, .id = RDMA_NLDEV_ATTR_RES_LQPN, }, [RDMA_RESTRACK_CM_ID] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CM_IDN, }, [RDMA_RESTRACK_CQ] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CQN, }, [RDMA_RESTRACK_MR] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY, .id = RDMA_NLDEV_ATTR_RES_MRN, }, [RDMA_RESTRACK_PD] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, .id = RDMA_NLDEV_ATTR_RES_PDN, }, [RDMA_RESTRACK_COUNTER] = { .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER, .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID, }, [RDMA_RESTRACK_CTX] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_CTX, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_CTX_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CTXN, }, [RDMA_RESTRACK_SRQ] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_SRQ, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_SRQ_ENTRY, .id = RDMA_NLDEV_ATTR_RES_SRQN, }, }; static noinline_for_stack int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, enum rdma_restrack_type res_type, res_fill_func_t fill_func) { const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct rdma_restrack_entry *res; struct ib_device *device; u32 index, id, port = 0; bool has_cap_net_admin; struct sk_buff *msg; int ret; ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err; } } if ((port && fe->flags & NLDEV_PER_DEV) || (!port && ~fe->flags & NLDEV_PER_DEV)) { ret = -EINVAL; goto err; } id = nla_get_u32(tb[fe->id]); res = rdma_restrack_get_byid(device, res_type, id); if (IS_ERR(res)) { ret = PTR_ERR(res); goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err_get; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NL_GET_OP(nlh->nlmsg_type)), 0, 0); if (!nlh || fill_nldev_handle(msg, device)) { ret = -EMSGSIZE; goto err_free; } has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN); ret = fill_func(msg, has_cap_net_admin, res, port); if (ret) goto err_free; rdma_restrack_put(res); nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); err_get: rdma_restrack_put(res); err: ib_device_put(device); return ret; } static int res_get_common_dumpit(struct sk_buff *skb, struct netlink_callback *cb, enum rdma_restrack_type res_type, res_fill_func_t fill_func) { const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; int err, ret = 0, idx = 0; bool show_details = false; struct nlattr *table_attr; struct nlattr *entry_attr; struct ib_device *device; int start = cb->args[0]; bool has_cap_net_admin; struct nlmsghdr *nlh; unsigned long id; u32 index, port = 0; bool filled = false; err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, NULL); /* * Right now, we are expecting the device index to get res information, * but it is possible to extend this code to return all devices in * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX. * if it doesn't exist, we will iterate over all devices. * * But it is not needed for now. */ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); /* * If no PORT_INDEX is supplied, we will return all QPs from that device */ if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err_index; } } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NL_GET_OP(cb->nlh->nlmsg_type)), 0, NLM_F_MULTI); if (!nlh || fill_nldev_handle(skb, device)) { ret = -EMSGSIZE; goto err; } table_attr = nla_nest_start_noflag(skb, fe->nldev_attr); if (!table_attr) { ret = -EMSGSIZE; goto err; } has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN); rt = &device->res[res_type]; xa_lock(&rt->xa); /* * FIXME: if the skip ahead is something common this loop should * use xas_for_each & xas_pause to optimize, we can have a lot of * objects. */ xa_for_each(&rt->xa, id, res) { if (xa_get_mark(&rt->xa, res->id, RESTRACK_DD) && !show_details) goto next; if (idx < start || !rdma_restrack_get(res)) goto next; xa_unlock(&rt->xa); filled = true; entry_attr = nla_nest_start_noflag(skb, fe->entry); if (!entry_attr) { ret = -EMSGSIZE; rdma_restrack_put(res); goto msg_full; } ret = fill_func(skb, has_cap_net_admin, res, port); rdma_restrack_put(res); if (ret) { nla_nest_cancel(skb, entry_attr); if (ret == -EMSGSIZE) goto msg_full; if (ret == -EAGAIN) goto again; goto res_err; } nla_nest_end(skb, entry_attr); again: xa_lock(&rt->xa); next: idx++; } xa_unlock(&rt->xa); msg_full: nla_nest_end(skb, table_attr); nlmsg_end(skb, nlh); cb->args[0] = idx; /* * No more entries to fill, cancel the message and * return 0 to mark end of dumpit. */ if (!filled) goto err; ib_device_put(device); return skb->len; res_err: nla_nest_cancel(skb, table_attr); err: nlmsg_cancel(skb, nlh); err_index: ib_device_put(device); return ret; } #define RES_GET_FUNCS(name, type) \ static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ struct netlink_callback *cb) \ { \ return res_get_common_dumpit(skb, cb, type, \ fill_res_##name##_entry); \ } \ static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ struct nlmsghdr *nlh, \ struct netlink_ext_ack *extack) \ { \ return res_get_common_doit(skb, nlh, extack, type, \ fill_res_##name##_entry); \ } RES_GET_FUNCS(qp, RDMA_RESTRACK_QP); RES_GET_FUNCS(qp_raw, RDMA_RESTRACK_QP); RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); RES_GET_FUNCS(cq_raw, RDMA_RESTRACK_CQ); RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); RES_GET_FUNCS(mr_raw, RDMA_RESTRACK_MR); RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER); RES_GET_FUNCS(ctx, RDMA_RESTRACK_CTX); RES_GET_FUNCS(srq, RDMA_RESTRACK_SRQ); RES_GET_FUNCS(srq_raw, RDMA_RESTRACK_SRQ); static LIST_HEAD(link_ops); static DECLARE_RWSEM(link_ops_rwsem); static const struct rdma_link_ops *link_ops_get(const char *type) { const struct rdma_link_ops *ops; list_for_each_entry(ops, &link_ops, list) { if (!strcmp(ops->type, type)) goto out; } ops = NULL; out: return ops; } void rdma_link_register(struct rdma_link_ops *ops) { down_write(&link_ops_rwsem); if (WARN_ON_ONCE(link_ops_get(ops->type))) goto out; list_add(&ops->list, &link_ops); out: up_write(&link_ops_rwsem); } EXPORT_SYMBOL(rdma_link_register); void rdma_link_unregister(struct rdma_link_ops *ops) { down_write(&link_ops_rwsem); list_del(&ops->list); up_write(&link_ops_rwsem); } EXPORT_SYMBOL(rdma_link_unregister); static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; char ibdev_name[IB_DEVICE_NAME_MAX]; const struct rdma_link_ops *ops; char ndev_name[IFNAMSIZ]; struct net_device *ndev; char type[IFNAMSIZ]; int err; err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) return -EINVAL; nla_strscpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(ibdev_name)); if (strchr(ibdev_name, '%') || strlen(ibdev_name) == 0) return -EINVAL; nla_strscpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); nla_strscpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], sizeof(ndev_name)); ndev = dev_get_by_name(sock_net(skb->sk), ndev_name); if (!ndev) return -ENODEV; down_read(&link_ops_rwsem); ops = link_ops_get(type); #ifdef CONFIG_MODULES if (!ops) { up_read(&link_ops_rwsem); request_module("rdma-link-%s", type); down_read(&link_ops_rwsem); ops = link_ops_get(type); } #endif err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL; up_read(&link_ops_rwsem); dev_put(ndev); return err; } static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; u32 index; int err; err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (!(device->attrs.kernel_cap_flags & IBK_ALLOW_USER_UNREG)) { ib_device_put(device); return -EINVAL; } ib_unregister_device_and_put(device); return 0; } static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE]; struct ib_client_nl_info data = {}; struct ib_device *ibdev = NULL; struct sk_buff *msg; u32 index; int err; err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) return -EINVAL; nla_strscpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], sizeof(client_name)); if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); ibdev = ib_device_get_by_index(sock_net(skb->sk), index); if (!ibdev) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(ibdev, data.port)) { err = -EINVAL; goto out_put; } } else { data.port = -1; } } else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { return -EINVAL; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out_put; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET_CHARDEV), 0, 0); if (!nlh) { err = -EMSGSIZE; goto out_nlmsg; } data.nl_msg = msg; err = ib_get_client_nl_info(ibdev, client_name, &data); if (err) goto out_nlmsg; err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV, huge_encode_dev(data.cdev->devt), RDMA_NLDEV_ATTR_PAD); if (err) goto out_data; err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi, RDMA_NLDEV_ATTR_PAD); if (err) goto out_data; if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME, dev_name(data.cdev))) { err = -EMSGSIZE; goto out_data; } nlmsg_end(msg, nlh); put_device(data.cdev); if (ibdev) ib_device_put(ibdev); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); out_data: put_device(data.cdev); out_nlmsg: nlmsg_free(msg); out_put: if (ibdev) ib_device_put(ibdev); return err; } static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct sk_buff *msg; int err; err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err) return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SYS_GET), 0, 0); if (!nlh) { nlmsg_free(msg); return -EMSGSIZE; } err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_NETNS_MODE, (u8)ib_devices_shared_netns); if (err) { nlmsg_free(msg); return err; } err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE, (u8)privileged_qkey); if (err) { nlmsg_free(msg); return err; } err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, 1); if (err) { nlmsg_free(msg); return err; } /* * Copy-on-fork is supported. * See commits: * 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") * 4eae4efa2c29 ("hugetlb: do early cow when page pinned on src mm") * for more details. Don't backport this without them. * * Return value ignored on purpose, assume copy-on-fork is not * supported in case of failure. */ nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK, 1); nlmsg_end(msg, nlh); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); } static int nldev_set_sys_set_netns_doit(struct nlattr *tb[]) { u8 enable; int err; enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]); /* Only 0 and 1 are supported */ if (enable > 1) return -EINVAL; err = rdma_compatdev_set(enable); return err; } static int nldev_set_sys_set_pqkey_doit(struct nlattr *tb[]) { u8 enable; enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE]); /* Only 0 and 1 are supported */ if (enable > 1) return -EINVAL; privileged_qkey = enable; return 0; } static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int err; err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err) return -EINVAL; if (tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]) return nldev_set_sys_set_netns_doit(tb); if (tb[RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE]) return nldev_set_sys_set_pqkey_doit(tb); return -EINVAL; } static int nldev_stat_set_mode_doit(struct sk_buff *msg, struct netlink_ext_ack *extack, struct nlattr *tb[], struct ib_device *device, u32 port) { u32 mode, mask = 0, qpn, cntn = 0; bool opcnt = false; int ret; /* Currently only counter for QP is supported */ if (!tb[RDMA_NLDEV_ATTR_STAT_RES] || nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]) opcnt = !!nla_get_u8( tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]); mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); if (mode == RDMA_COUNTER_MODE_AUTO) { if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) mask = nla_get_u32( tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); return rdma_counter_set_auto_mode(device, port, mask, opcnt, extack); } if (!tb[RDMA_NLDEV_ATTR_RES_LQPN]) return -EINVAL; qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) { cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); ret = rdma_counter_bind_qpn(device, port, qpn, cntn); if (ret) return ret; } else { ret = rdma_counter_bind_qpn_alloc(device, port, qpn, &cntn); if (ret) return ret; } if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { ret = -EMSGSIZE; goto err_fill; } return 0; err_fill: rdma_counter_unbind_qpn(device, port, qpn, cntn); return ret; } static int nldev_stat_set_counter_dynamic_doit(struct nlattr *tb[], struct ib_device *device, u32 port) { struct rdma_hw_stats *stats; struct nlattr *entry_attr; unsigned long *target; int rem, i, ret = 0; u32 index; stats = ib_get_hw_stats_port(device, port); if (!stats) return -EINVAL; target = kcalloc(BITS_TO_LONGS(stats->num_counters), sizeof(*stats->is_disabled), GFP_KERNEL); if (!target) return -ENOMEM; nla_for_each_nested(entry_attr, tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS], rem) { index = nla_get_u32(entry_attr); if ((index >= stats->num_counters) || !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) { ret = -EINVAL; goto out; } set_bit(index, target); } for (i = 0; i < stats->num_counters; i++) { if (!(stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL)) continue; ret = rdma_counter_modify(device, port, i, test_bit(i, target)); if (ret) goto out; } out: kfree(target); return ret; } static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; u32 index, port; int ret; ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err_put_device; } if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] && !tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) { ret = -EINVAL; goto err_put_device; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err_put_device; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_SET), 0, 0); if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { ret = -EMSGSIZE; goto err_free_msg; } if (tb[RDMA_NLDEV_ATTR_STAT_MODE]) { ret = nldev_stat_set_mode_doit(msg, extack, tb, device, port); if (ret) goto err_free_msg; } if (tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) { ret = nldev_stat_set_counter_dynamic_doit(tb, device, port); if (ret) goto err_free_msg; } nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free_msg: nlmsg_free(msg); err_put_device: ib_device_put(device); return ret; } static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; u32 index, port, qpn, cntn; int ret; ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] || !tb[RDMA_NLDEV_ATTR_RES_LQPN]) return -EINVAL; if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_SET), 0, 0); if (!nlh) { ret = -EMSGSIZE; goto err_fill; } cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); if (fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { ret = -EMSGSIZE; goto err_fill; } ret = rdma_counter_unbind_qpn(device, port, qpn, cntn); if (ret) goto err_fill; nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_fill: nlmsg_free(msg); err: ib_device_put(device); return ret; } static noinline_for_stack int stat_get_doit_default_counter(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct nlattr *tb[]) { struct rdma_hw_stats *stats; struct nlattr *table_attr; struct ib_device *device; int ret, num_cnts, i; struct sk_buff *msg; u32 index, port; u64 v; if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (!device->ops.alloc_hw_port_stats || !device->ops.get_hw_stats) { ret = -EINVAL; goto err; } port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); stats = ib_get_hw_stats_port(device, port); if (!stats) { ret = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET), 0, 0); if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { ret = -EMSGSIZE; goto err_msg; } mutex_lock(&stats->lock); num_cnts = device->ops.get_hw_stats(device, stats, port, 0); if (num_cnts < 0) { ret = -EINVAL; goto err_stats; } table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); if (!table_attr) { ret = -EMSGSIZE; goto err_stats; } for (i = 0; i < num_cnts; i++) { if (test_bit(i, stats->is_disabled)) continue; v = stats->value[i] + rdma_counter_get_hwstat_value(device, port, i); if (rdma_nl_stat_hwcounter_entry(msg, stats->descs[i].name, v)) { ret = -EMSGSIZE; goto err_table; } } nla_nest_end(msg, table_attr); mutex_unlock(&stats->lock); nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_table: nla_nest_cancel(msg, table_attr); err_stats: mutex_unlock(&stats->lock); err_msg: nlmsg_free(msg); err: ib_device_put(device); return ret; } static noinline_for_stack int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct nlattr *tb[]) { static enum rdma_nl_counter_mode mode; static enum rdma_nl_counter_mask mask; struct ib_device *device; struct sk_buff *msg; u32 index, port; bool opcnt; int ret; if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) return nldev_res_get_counter_doit(skb, nlh, extack); if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET), 0, 0); if (!nlh) { ret = -EMSGSIZE; goto err_msg; } ret = rdma_counter_get_mode(device, port, &mode, &mask, &opcnt); if (ret) goto err_msg; if (fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) { ret = -EMSGSIZE; goto err_msg; } if ((mode == RDMA_COUNTER_MODE_AUTO) && nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) { ret = -EMSGSIZE; goto err_msg; } if ((mode == RDMA_COUNTER_MODE_AUTO) && nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED, opcnt)) { ret = -EMSGSIZE; goto err_msg; } nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_msg: nlmsg_free(msg); err: ib_device_put(device); return ret; } static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int ret; ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret) return -EINVAL; if (!tb[RDMA_NLDEV_ATTR_STAT_RES]) return stat_get_doit_default_counter(skb, nlh, extack, tb); switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { case RDMA_NLDEV_ATTR_RES_QP: ret = stat_get_doit_qp(skb, nlh, extack, tb); break; case RDMA_NLDEV_ATTR_RES_MR: ret = res_get_common_doit(skb, nlh, extack, RDMA_RESTRACK_MR, fill_stat_mr_entry); break; default: ret = -EINVAL; break; } return ret; } static int nldev_stat_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int ret; ret = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) return -EINVAL; switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { case RDMA_NLDEV_ATTR_RES_QP: ret = nldev_res_get_counter_dumpit(skb, cb); break; case RDMA_NLDEV_ATTR_RES_MR: ret = res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR, fill_stat_mr_entry); break; default: ret = -EINVAL; break; } return ret; } static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX], *table, *entry; struct rdma_hw_stats *stats; struct ib_device *device; struct sk_buff *msg; u32 devid, port; int ret, i; ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), devid); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err; } stats = ib_get_hw_stats_port(device, port); if (!stats) { ret = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put( msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET_STATUS), 0, 0); ret = -EMSGSIZE; if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) goto err_msg; table = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); if (!table) goto err_msg; mutex_lock(&stats->lock); for (i = 0; i < stats->num_counters; i++) { entry = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); if (!entry) goto err_msg_table; if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, stats->descs[i].name) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX, i)) goto err_msg_entry; if ((stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) && (nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC, !test_bit(i, stats->is_disabled)))) goto err_msg_entry; nla_nest_end(msg, entry); } mutex_unlock(&stats->lock); nla_nest_end(msg, table); nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_msg_entry: nla_nest_cancel(msg, entry); err_msg_table: mutex_unlock(&stats->lock); nla_nest_cancel(msg, table); err_msg: nlmsg_free(msg); err: ib_device_put(device); return ret; } static int nldev_newdev(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; enum rdma_nl_dev_type type; struct ib_device *parent; char name[IFNAMSIZ] = {}; u32 parentid; int ret; ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_DEV_TYPE]) return -EINVAL; nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(name)); type = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_TYPE]); parentid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); parent = ib_device_get_by_index(sock_net(skb->sk), parentid); if (!parent) return -EINVAL; ret = ib_add_sub_device(parent, type, name); ib_device_put(parent); return ret; } static int nldev_deldev(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; u32 devid; int ret; ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), devid); if (!device) return -EINVAL; return ib_del_sub_device_and_put(device); } static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, .dump = nldev_get_dumpit, }, [RDMA_NLDEV_CMD_GET_CHARDEV] = { .doit = nldev_get_chardev, }, [RDMA_NLDEV_CMD_SET] = { .doit = nldev_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_NEWLINK] = { .doit = nldev_newlink, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_DELLINK] = { .doit = nldev_dellink, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_PORT_GET] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, }, [RDMA_NLDEV_CMD_RES_GET] = { .doit = nldev_res_get_doit, .dump = nldev_res_get_dumpit, }, [RDMA_NLDEV_CMD_RES_QP_GET] = { .doit = nldev_res_get_qp_doit, .dump = nldev_res_get_qp_dumpit, }, [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { .doit = nldev_res_get_cm_id_doit, .dump = nldev_res_get_cm_id_dumpit, }, [RDMA_NLDEV_CMD_RES_CQ_GET] = { .doit = nldev_res_get_cq_doit, .dump = nldev_res_get_cq_dumpit, }, [RDMA_NLDEV_CMD_RES_MR_GET] = { .doit = nldev_res_get_mr_doit, .dump = nldev_res_get_mr_dumpit, }, [RDMA_NLDEV_CMD_RES_PD_GET] = { .doit = nldev_res_get_pd_doit, .dump = nldev_res_get_pd_dumpit, }, [RDMA_NLDEV_CMD_RES_CTX_GET] = { .doit = nldev_res_get_ctx_doit, .dump = nldev_res_get_ctx_dumpit, }, [RDMA_NLDEV_CMD_RES_SRQ_GET] = { .doit = nldev_res_get_srq_doit, .dump = nldev_res_get_srq_dumpit, }, [RDMA_NLDEV_CMD_SYS_GET] = { .doit = nldev_sys_get_doit, }, [RDMA_NLDEV_CMD_SYS_SET] = { .doit = nldev_set_sys_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_STAT_SET] = { .doit = nldev_stat_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_STAT_GET] = { .doit = nldev_stat_get_doit, .dump = nldev_stat_get_dumpit, }, [RDMA_NLDEV_CMD_STAT_DEL] = { .doit = nldev_stat_del_doit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_RES_QP_GET_RAW] = { .doit = nldev_res_get_qp_raw_doit, .dump = nldev_res_get_qp_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_RES_CQ_GET_RAW] = { .doit = nldev_res_get_cq_raw_doit, .dump = nldev_res_get_cq_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_RES_MR_GET_RAW] = { .doit = nldev_res_get_mr_raw_doit, .dump = nldev_res_get_mr_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_RES_SRQ_GET_RAW] = { .doit = nldev_res_get_srq_raw_doit, .dump = nldev_res_get_srq_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_STAT_GET_STATUS] = { .doit = nldev_stat_get_counter_status_doit, }, [RDMA_NLDEV_CMD_NEWDEV] = { .doit = nldev_newdev, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_DELDEV] = { .doit = nldev_deldev, .flags = RDMA_NL_ADMIN_PERM, }, }; static int fill_mon_netdev_rename(struct sk_buff *msg, struct ib_device *device, u32 port, const struct net *net) { struct net_device *netdev = ib_device_get_netdev(device, port); int ret = 0; if (!netdev || !net_eq(dev_net(netdev), net)) goto out; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); if (ret) goto out; ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); out: dev_put(netdev); return ret; } static int fill_mon_netdev_association(struct sk_buff *msg, struct ib_device *device, u32 port, const struct net *net) { struct net_device *netdev = ib_device_get_netdev(device, port); int ret = 0; if (netdev && !net_eq(dev_net(netdev), net)) goto out; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index); if (ret) goto out; ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, dev_name(&device->dev)); if (ret) goto out; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port); if (ret) goto out; if (netdev) { ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); if (ret) goto out; ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); } out: dev_put(netdev); return ret; } static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num, enum rdma_nl_notify_event_type type) { struct net_device *netdev; switch (type) { case RDMA_REGISTER_EVENT: dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor register device event\n"); break; case RDMA_UNREGISTER_EVENT: dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor unregister device event\n"); break; case RDMA_NETDEV_ATTACH_EVENT: netdev = ib_device_get_netdev(device, port_num); dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n", port_num, netdev->ifindex); dev_put(netdev); break; case RDMA_NETDEV_DETACH_EVENT: dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor netdev detach event: port %d\n", port_num); break; case RDMA_RENAME_EVENT: dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor rename device event\n"); break; case RDMA_NETDEV_RENAME_EVENT: netdev = ib_device_get_netdev(device, port_num); dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor netdev rename event: port %d netdev %d\n", port_num, netdev->ifindex); dev_put(netdev); break; default: break; } } int rdma_nl_notify_event(struct ib_device *device, u32 port_num, enum rdma_nl_notify_event_type type) { struct sk_buff *skb; int ret = -EMSGSIZE; struct net *net; void *nlh; net = read_pnet(&device->coredev.rdma_net); if (!net) return -EINVAL; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, 0, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR), 0, 0); if (!nlh) goto err_free; switch (type) { case RDMA_REGISTER_EVENT: case RDMA_UNREGISTER_EVENT: case RDMA_RENAME_EVENT: ret = fill_nldev_handle(skb, device); if (ret) goto err_free; break; case RDMA_NETDEV_ATTACH_EVENT: case RDMA_NETDEV_DETACH_EVENT: ret = fill_mon_netdev_association(skb, device, port_num, net); if (ret) goto err_free; break; case RDMA_NETDEV_RENAME_EVENT: ret = fill_mon_netdev_rename(skb, device, port_num, net); if (ret) goto err_free; break; default: break; } ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type); if (ret) goto err_free; nlmsg_end(skb, nlh); ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL); if (ret && ret != -ESRCH) { skb = NULL; /* skb is freed in the netlink send-op handling */ goto err_free; } return 0; err_free: rdma_nl_notify_err_msg(device, port_num, type); nlmsg_free(skb); return ret; } void __init nldev_init(void) { rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); } void nldev_exit(void) { rdma_nl_unregister(RDMA_NL_NLDEV); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5);
59 58 59 59 59 59 59 59 59 59 59 59 59 59 59 59 58 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 /* * Non-physical true random number generator based on timing jitter -- * Jitter RNG standalone code. * * Copyright Stephan Mueller <smueller@chronox.de>, 2015 - 2023 * * Design * ====== * * See https://www.chronox.de/jent.html * * License * ======= * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL2 are * required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * This Jitterentropy RNG is based on the jitterentropy library * version 3.4.0 provided at https://www.chronox.de/jent.html */ #ifdef __OPTIMIZE__ #error "The CPU Jitter random number generator must not be compiled with optimizations. See documentation. Use the compiler switch -O0 for compiling jitterentropy.c." #endif typedef unsigned long long __u64; typedef long long __s64; typedef unsigned int __u32; typedef unsigned char u8; #define NULL ((void *) 0) /* The entropy pool */ struct rand_data { /* SHA3-256 is used as conditioner */ #define DATA_SIZE_BITS 256 /* all data values that are vital to maintain the security * of the RNG are marked as SENSITIVE. A user must not * access that information while the RNG executes its loops to * calculate the next random value. */ void *hash_state; /* SENSITIVE hash state entropy pool */ __u64 prev_time; /* SENSITIVE Previous time stamp */ __u64 last_delta; /* SENSITIVE stuck test */ __s64 last_delta2; /* SENSITIVE stuck test */ unsigned int flags; /* Flags used to initialize */ unsigned int osr; /* Oversample rate */ #define JENT_MEMORY_ACCESSLOOPS 128 #define JENT_MEMORY_SIZE \ (CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS * \ CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE) unsigned char *mem; /* Memory access location with size of * memblocks * memblocksize */ unsigned int memlocation; /* Pointer to byte in *mem */ unsigned int memblocks; /* Number of memory blocks in *mem */ unsigned int memblocksize; /* Size of one memory block in bytes */ unsigned int memaccessloops; /* Number of memory accesses per random * bit generation */ /* Repetition Count Test */ unsigned int rct_count; /* Number of stuck values */ /* Adaptive Proportion Test cutoff values */ unsigned int apt_cutoff; /* Intermittent health test failure */ unsigned int apt_cutoff_permanent; /* Permanent health test failure */ #define JENT_APT_WINDOW_SIZE 512 /* Data window size */ /* LSB of time stamp to process */ #define JENT_APT_LSB 16 #define JENT_APT_WORD_MASK (JENT_APT_LSB - 1) unsigned int apt_observations; /* Number of collected observations */ unsigned int apt_count; /* APT counter */ unsigned int apt_base; /* APT base reference */ unsigned int health_failure; /* Record health failure */ unsigned int apt_base_set:1; /* APT base reference set? */ }; /* Flags that can be used to initialize the RNG */ #define JENT_DISABLE_MEMORY_ACCESS (1<<2) /* Disable memory access for more * entropy, saves MEMORY_SIZE RAM for * entropy collector */ /* -- error codes for init function -- */ #define JENT_ENOTIME 1 /* Timer service not available */ #define JENT_ECOARSETIME 2 /* Timer too coarse for RNG */ #define JENT_ENOMONOTONIC 3 /* Timer is not monotonic increasing */ #define JENT_EVARVAR 5 /* Timer does not produce variations of * variations (2nd derivation of time is * zero). */ #define JENT_ESTUCK 8 /* Too many stuck results during init. */ #define JENT_EHEALTH 9 /* Health test failed during initialization */ #define JENT_ERCT 10 /* RCT failed during initialization */ #define JENT_EHASH 11 /* Hash self test failed */ #define JENT_EMEM 12 /* Can't allocate memory for initialization */ #define JENT_RCT_FAILURE 1 /* Failure in RCT health test. */ #define JENT_APT_FAILURE 2 /* Failure in APT health test. */ #define JENT_PERMANENT_FAILURE_SHIFT 16 #define JENT_PERMANENT_FAILURE(x) (x << JENT_PERMANENT_FAILURE_SHIFT) #define JENT_RCT_FAILURE_PERMANENT JENT_PERMANENT_FAILURE(JENT_RCT_FAILURE) #define JENT_APT_FAILURE_PERMANENT JENT_PERMANENT_FAILURE(JENT_APT_FAILURE) /* * The output n bits can receive more than n bits of min entropy, of course, * but the fixed output of the conditioning function can only asymptotically * approach the output size bits of min entropy, not attain that bound. Random * maps will tend to have output collisions, which reduces the creditable * output entropy (that is what SP 800-90B Section 3.1.5.1.2 attempts to bound). * * The value "64" is justified in Appendix A.4 of the current 90C draft, * and aligns with NIST's in "epsilon" definition in this document, which is * that a string can be considered "full entropy" if you can bound the min * entropy in each bit of output to at least 1-epsilon, where epsilon is * required to be <= 2^(-32). */ #define JENT_ENTROPY_SAFETY_FACTOR 64 #include <linux/array_size.h> #include <linux/fips.h> #include <linux/minmax.h> #include "jitterentropy.h" /*************************************************************************** * Adaptive Proportion Test * * This test complies with SP800-90B section 4.4.2. ***************************************************************************/ /* * See the SP 800-90B comment #10b for the corrected cutoff for the SP 800-90B * APT. * https://www.untruth.org/~josh/sp80090b/UL%20SP800-90B-final%20comments%20v1.9%2020191212.pdf * In the syntax of R, this is C = 2 + qbinom(1 − 2^(−30), 511, 2^(-1/osr)). * (The original formula wasn't correct because the first symbol must * necessarily have been observed, so there is no chance of observing 0 of these * symbols.) * * For the alpha < 2^-53, R cannot be used as it uses a float data type without * arbitrary precision. A SageMath script is used to calculate those cutoff * values. * * For any value above 14, this yields the maximal allowable value of 512 * (by FIPS 140-2 IG 7.19 Resolution # 16, we cannot choose a cutoff value that * renders the test unable to fail). */ static const unsigned int jent_apt_cutoff_lookup[15] = { 325, 422, 459, 477, 488, 494, 499, 502, 505, 507, 508, 509, 510, 511, 512 }; static const unsigned int jent_apt_cutoff_permanent_lookup[15] = { 355, 447, 479, 494, 502, 507, 510, 512, 512, 512, 512, 512, 512, 512, 512 }; static void jent_apt_init(struct rand_data *ec, unsigned int osr) { /* * Establish the apt_cutoff based on the presumed entropy rate of * 1/osr. */ if (osr >= ARRAY_SIZE(jent_apt_cutoff_lookup)) { ec->apt_cutoff = jent_apt_cutoff_lookup[ ARRAY_SIZE(jent_apt_cutoff_lookup) - 1]; ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[ ARRAY_SIZE(jent_apt_cutoff_permanent_lookup) - 1]; } else { ec->apt_cutoff = jent_apt_cutoff_lookup[osr - 1]; ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[osr - 1]; } } /* * Reset the APT counter * * @ec [in] Reference to entropy collector */ static void jent_apt_reset(struct rand_data *ec, unsigned int delta_masked) { /* Reset APT counter */ ec->apt_count = 0; ec->apt_base = delta_masked; ec->apt_observations = 0; } /* * Insert a new entropy event into APT * * @ec [in] Reference to entropy collector * @delta_masked [in] Masked time delta to process */ static void jent_apt_insert(struct rand_data *ec, unsigned int delta_masked) { /* Initialize the base reference */ if (!ec->apt_base_set) { ec->apt_base = delta_masked; ec->apt_base_set = 1; return; } if (delta_masked == ec->apt_base) { ec->apt_count++; /* Note, ec->apt_count starts with one. */ if (ec->apt_count >= ec->apt_cutoff_permanent) ec->health_failure |= JENT_APT_FAILURE_PERMANENT; else if (ec->apt_count >= ec->apt_cutoff) ec->health_failure |= JENT_APT_FAILURE; } ec->apt_observations++; if (ec->apt_observations >= JENT_APT_WINDOW_SIZE) jent_apt_reset(ec, delta_masked); } /*************************************************************************** * Stuck Test and its use as Repetition Count Test * * The Jitter RNG uses an enhanced version of the Repetition Count Test * (RCT) specified in SP800-90B section 4.4.1. Instead of counting identical * back-to-back values, the input to the RCT is the counting of the stuck * values during the generation of one Jitter RNG output block. * * The RCT is applied with an alpha of 2^{-30} compliant to FIPS 140-2 IG 9.8. * * During the counting operation, the Jitter RNG always calculates the RCT * cut-off value of C. If that value exceeds the allowed cut-off value, * the Jitter RNG output block will be calculated completely but discarded at * the end. The caller of the Jitter RNG is informed with an error code. ***************************************************************************/ /* * Repetition Count Test as defined in SP800-90B section 4.4.1 * * @ec [in] Reference to entropy collector * @stuck [in] Indicator whether the value is stuck */ static void jent_rct_insert(struct rand_data *ec, int stuck) { if (stuck) { ec->rct_count++; /* * The cutoff value is based on the following consideration: * alpha = 2^-30 or 2^-60 as recommended in SP800-90B. * In addition, we require an entropy value H of 1/osr as this * is the minimum entropy required to provide full entropy. * Note, we collect (DATA_SIZE_BITS + ENTROPY_SAFETY_FACTOR)*osr * deltas for inserting them into the entropy pool which should * then have (close to) DATA_SIZE_BITS bits of entropy in the * conditioned output. * * Note, ec->rct_count (which equals to value B in the pseudo * code of SP800-90B section 4.4.1) starts with zero. Hence * we need to subtract one from the cutoff value as calculated * following SP800-90B. Thus C = ceil(-log_2(alpha)/H) = 30*osr * or 60*osr. */ if ((unsigned int)ec->rct_count >= (60 * ec->osr)) { ec->rct_count = -1; ec->health_failure |= JENT_RCT_FAILURE_PERMANENT; } else if ((unsigned int)ec->rct_count >= (30 * ec->osr)) { ec->rct_count = -1; ec->health_failure |= JENT_RCT_FAILURE; } } else { /* Reset RCT */ ec->rct_count = 0; } } static inline __u64 jent_delta(__u64 prev, __u64 next) { #define JENT_UINT64_MAX (__u64)(~((__u64) 0)) return (prev < next) ? (next - prev) : (JENT_UINT64_MAX - prev + 1 + next); } /* * Stuck test by checking the: * 1st derivative of the jitter measurement (time delta) * 2nd derivative of the jitter measurement (delta of time deltas) * 3rd derivative of the jitter measurement (delta of delta of time deltas) * * All values must always be non-zero. * * @ec [in] Reference to entropy collector * @current_delta [in] Jitter time delta * * @return * 0 jitter measurement not stuck (good bit) * 1 jitter measurement stuck (reject bit) */ static int jent_stuck(struct rand_data *ec, __u64 current_delta) { __u64 delta2 = jent_delta(ec->last_delta, current_delta); __u64 delta3 = jent_delta(ec->last_delta2, delta2); ec->last_delta = current_delta; ec->last_delta2 = delta2; /* * Insert the result of the comparison of two back-to-back time * deltas. */ jent_apt_insert(ec, current_delta); if (!current_delta || !delta2 || !delta3) { /* RCT with a stuck bit */ jent_rct_insert(ec, 1); return 1; } /* RCT with a non-stuck bit */ jent_rct_insert(ec, 0); return 0; } /* * Report any health test failures * * @ec [in] Reference to entropy collector * * @return a bitmask indicating which tests failed * 0 No health test failure * 1 RCT failure * 2 APT failure * 1<<JENT_PERMANENT_FAILURE_SHIFT RCT permanent failure * 2<<JENT_PERMANENT_FAILURE_SHIFT APT permanent failure */ static unsigned int jent_health_failure(struct rand_data *ec) { /* Test is only enabled in FIPS mode */ if (!fips_enabled) return 0; return ec->health_failure; } /*************************************************************************** * Noise sources ***************************************************************************/ /* * Update of the loop count used for the next round of * an entropy collection. * * Input: * @bits is the number of low bits of the timer to consider * @min is the number of bits we shift the timer value to the right at * the end to make sure we have a guaranteed minimum value * * @return Newly calculated loop counter */ static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min) { __u64 time = 0; __u64 shuffle = 0; unsigned int i = 0; unsigned int mask = (1<<bits) - 1; jent_get_nstime(&time); /* * We fold the time value as much as possible to ensure that as many * bits of the time stamp are included as possible. */ for (i = 0; ((DATA_SIZE_BITS + bits - 1) / bits) > i; i++) { shuffle ^= time & mask; time = time >> bits; } /* * We add a lower boundary value to ensure we have a minimum * RNG loop count. */ return (shuffle + (1<<min)); } /* * CPU Jitter noise source -- this is the noise source based on the CPU * execution time jitter * * This function injects the individual bits of the time value into the * entropy pool using a hash. * * ec [in] entropy collector * time [in] time stamp to be injected * stuck [in] Is the time stamp identified as stuck? * * Output: * updated hash context in the entropy collector or error code */ static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck) { #define SHA3_HASH_LOOP (1<<3) struct { int rct_count; unsigned int apt_observations; unsigned int apt_count; unsigned int apt_base; } addtl = { ec->rct_count, ec->apt_observations, ec->apt_count, ec->apt_base }; return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl), SHA3_HASH_LOOP, stuck); } /* * Memory Access noise source -- this is a noise source based on variations in * memory access times * * This function performs memory accesses which will add to the timing * variations due to an unknown amount of CPU wait states that need to be * added when accessing memory. The memory size should be larger than the L1 * caches as outlined in the documentation and the associated testing. * * The L1 cache has a very high bandwidth, albeit its access rate is usually * slower than accessing CPU registers. Therefore, L1 accesses only add minimal * variations as the CPU has hardly to wait. Starting with L2, significant * variations are added because L2 typically does not belong to the CPU any more * and therefore a wider range of CPU wait states is necessary for accesses. * L3 and real memory accesses have even a wider range of wait states. However, * to reliably access either L3 or memory, the ec->mem memory must be quite * large which is usually not desirable. * * @ec [in] Reference to the entropy collector with the memory access data -- if * the reference to the memory block to be accessed is NULL, this noise * source is disabled * @loop_cnt [in] if a value not equal to 0 is set, use the given value * number of loops to perform the LFSR */ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt) { unsigned int wrap = 0; __u64 i = 0; #define MAX_ACC_LOOP_BIT 7 #define MIN_ACC_LOOP_BIT 0 __u64 acc_loop_cnt = jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); if (NULL == ec || NULL == ec->mem) return; wrap = ec->memblocksize * ec->memblocks; /* * testing purposes -- allow test app to set the counter, not * needed during runtime */ if (loop_cnt) acc_loop_cnt = loop_cnt; for (i = 0; i < (ec->memaccessloops + acc_loop_cnt); i++) { unsigned char *tmpval = ec->mem + ec->memlocation; /* * memory access: just add 1 to one byte, * wrap at 255 -- memory access implies read * from and write to memory location */ *tmpval = (*tmpval + 1) & 0xff; /* * Addition of memblocksize - 1 to pointer * with wrap around logic to ensure that every * memory location is hit evenly */ ec->memlocation = ec->memlocation + ec->memblocksize - 1; ec->memlocation = ec->memlocation % wrap; } } /*************************************************************************** * Start of entropy processing logic ***************************************************************************/ /* * This is the heart of the entropy generation: calculate time deltas and * use the CPU jitter in the time deltas. The jitter is injected into the * entropy pool. * * WARNING: ensure that ->prev_time is primed before using the output * of this function! This can be done by calling this function * and not using its result. * * @ec [in] Reference to entropy collector * * @return result of stuck test */ static int jent_measure_jitter(struct rand_data *ec, __u64 *ret_current_delta) { __u64 time = 0; __u64 current_delta = 0; int stuck; /* Invoke one noise source before time measurement to add variations */ jent_memaccess(ec, 0); /* * Get time stamp and calculate time delta to previous * invocation to measure the timing variations */ jent_get_nstime(&time); current_delta = jent_delta(ec->prev_time, time); ec->prev_time = time; /* Check whether we have a stuck measurement. */ stuck = jent_stuck(ec, current_delta); /* Now call the next noise sources which also injects the data */ if (jent_condition_data(ec, current_delta, stuck)) stuck = 1; /* return the raw entropy value */ if (ret_current_delta) *ret_current_delta = current_delta; return stuck; } /* * Generator of one 64 bit random number * Function fills rand_data->hash_state * * @ec [in] Reference to entropy collector */ static void jent_gen_entropy(struct rand_data *ec) { unsigned int k = 0, safety_factor = 0; if (fips_enabled) safety_factor = JENT_ENTROPY_SAFETY_FACTOR; /* priming of the ->prev_time value */ jent_measure_jitter(ec, NULL); while (!jent_health_failure(ec)) { /* If a stuck measurement is received, repeat measurement */ if (jent_measure_jitter(ec, NULL)) continue; /* * We multiply the loop value with ->osr to obtain the * oversampling rate requested by the caller */ if (++k >= ((DATA_SIZE_BITS + safety_factor) * ec->osr)) break; } } /* * Entry function: Obtain entropy for the caller. * * This function invokes the entropy gathering logic as often to generate * as many bytes as requested by the caller. The entropy gathering logic * creates 64 bit per invocation. * * This function truncates the last 64 bit entropy value output to the exact * size specified by the caller. * * @ec [in] Reference to entropy collector * @data [in] pointer to buffer for storing random data -- buffer must already * exist * @len [in] size of the buffer, specifying also the requested number of random * in bytes * * @return 0 when request is fulfilled or an error * * The following error codes can occur: * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len) { unsigned char *p = data; if (!ec) return -1; while (len > 0) { unsigned int tocopy, health_test_result; jent_gen_entropy(ec); health_test_result = jent_health_failure(ec); if (health_test_result > JENT_PERMANENT_FAILURE_SHIFT) { /* * At this point, the Jitter RNG instance is considered * as a failed instance. There is no rerun of the * startup test any more, because the caller * is assumed to not further use this instance. */ return -3; } else if (health_test_result) { /* * Perform startup health tests and return permanent * error if it fails. */ if (jent_entropy_init(0, 0, NULL, ec)) { /* Mark the permanent error */ ec->health_failure &= JENT_RCT_FAILURE_PERMANENT | JENT_APT_FAILURE_PERMANENT; return -3; } return -2; } tocopy = min(DATA_SIZE_BITS / 8, len); if (jent_read_random_block(ec->hash_state, p, tocopy)) return -1; len -= tocopy; p += tocopy; } return 0; } /*************************************************************************** * Initialization logic ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, unsigned int flags, void *hash_state) { struct rand_data *entropy_collector; entropy_collector = jent_zalloc(sizeof(struct rand_data)); if (!entropy_collector) return NULL; if (!(flags & JENT_DISABLE_MEMORY_ACCESS)) { /* Allocate memory for adding variations based on memory * access */ entropy_collector->mem = jent_kvzalloc(JENT_MEMORY_SIZE); if (!entropy_collector->mem) { jent_zfree(entropy_collector); return NULL; } entropy_collector->memblocksize = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE; entropy_collector->memblocks = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS; entropy_collector->memaccessloops = JENT_MEMORY_ACCESSLOOPS; } /* verify and set the oversampling rate */ if (osr == 0) osr = 1; /* H_submitter = 1 / osr */ entropy_collector->osr = osr; entropy_collector->flags = flags; entropy_collector->hash_state = hash_state; /* Initialize the APT */ jent_apt_init(entropy_collector, osr); /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); return entropy_collector; } void jent_entropy_collector_free(struct rand_data *entropy_collector) { jent_kvzfree(entropy_collector->mem, JENT_MEMORY_SIZE); entropy_collector->mem = NULL; jent_zfree(entropy_collector); } int jent_entropy_init(unsigned int osr, unsigned int flags, void *hash_state, struct rand_data *p_ec) { /* * If caller provides an allocated ec, reuse it which implies that the * health test entropy data is used to further still the available * entropy pool. */ struct rand_data *ec = p_ec; int i, time_backwards = 0, ret = 0, ec_free = 0; unsigned int health_test_result; if (!ec) { ec = jent_entropy_collector_alloc(osr, flags, hash_state); if (!ec) return JENT_EMEM; ec_free = 1; } else { /* Reset the APT */ jent_apt_reset(ec, 0); /* Ensure that a new APT base is obtained */ ec->apt_base_set = 0; /* Reset the RCT */ ec->rct_count = 0; /* Reset intermittent, leave permanent health test result */ ec->health_failure &= (~JENT_RCT_FAILURE); ec->health_failure &= (~JENT_APT_FAILURE); } /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. These * loop counts may show some slight skew and we produce * false positives. * * Moreover, only old systems show potentially problematic * jitter entropy that could potentially be caught here. But * the RNG is intended for hardware that is available or widely * used, but not old systems that are long out of favor. Thus, * no statistical tests. */ /* * We could add a check for system capabilities such as clock_getres or * check for CONFIG_X86_TSC, but it does not make much sense as the * following sanity checks verify that we have a high-resolution * timer. */ /* * TESTLOOPCOUNT needs some loops to identify edge systems. 100 is * definitely too little. * * SP800-90B requires at least 1024 initial test cycles. */ #define TESTLOOPCOUNT 1024 #define CLEARCACHE 100 for (i = 0; (TESTLOOPCOUNT + CLEARCACHE) > i; i++) { __u64 start_time = 0, end_time = 0, delta = 0; /* Invoke core entropy collection logic */ jent_measure_jitter(ec, &delta); end_time = ec->prev_time; start_time = ec->prev_time - delta; /* test whether timer works */ if (!start_time || !end_time) { ret = JENT_ENOTIME; goto out; } /* * test whether timer is fine grained enough to provide * delta even when called shortly after each other -- this * implies that we also have a high resolution timer */ if (!delta || (end_time == start_time)) { ret = JENT_ECOARSETIME; goto out; } /* * up to here we did not modify any variable that will be * evaluated later, but we already performed some work. Thus we * already have had an impact on the caches, branch prediction, * etc. with the goal to clear it to get the worst case * measurements. */ if (i < CLEARCACHE) continue; /* test whether we have an increasing timer */ if (!(end_time > start_time)) time_backwards++; } /* * we allow up to three times the time running backwards. * CLOCK_REALTIME is affected by adjtime and NTP operations. Thus, * if such an operation just happens to interfere with our test, it * should not fail. The value of 3 should cover the NTP case being * performed during our test run. */ if (time_backwards > 3) { ret = JENT_ENOMONOTONIC; goto out; } /* Did we encounter a health test failure? */ health_test_result = jent_health_failure(ec); if (health_test_result) { ret = (health_test_result & JENT_RCT_FAILURE) ? JENT_ERCT : JENT_EHEALTH; goto out; } out: if (ec_free) jent_entropy_collector_free(ec); return ret; }
1 1 3 3 17 17 2 1 1 1 17 253 135 120 2 30 1 4 25 4 23 18 5 2 3 31 19 5 7 30 6 23 2 1 1 7 100 107 107 18 3 2 1 2 82 5 85 62 29 100 101 182 182 1 20 7 4 1 17 14 9 7 16 229 114 116 111 33 12 1 1 1 5 1 1 1 3 1 56 4 119 54 64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 // SPDX-License-Identifier: GPL-2.0-only /* * Changes: * Arnaldo Carvalho de Melo <acme@conectiva.com.br> 08/23/2000 * - get rid of some verify_areas and use __copy*user and __get/put_user * for the ones that remain */ #include <linux/module.h> #include <linux/blkdev.h> #include <linux/interrupt.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/cdrom.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_ioctl.h> #include <scsi/sg.h> #include <scsi/scsi_dbg.h> #include "scsi_logging.h" #define NORMAL_RETRIES 5 #define IOCTL_NORMAL_TIMEOUT (10 * HZ) #define MAX_BUF PAGE_SIZE /** * ioctl_probe -- return host identification * @host: host to identify * @buffer: userspace buffer for identification * * Return: * * if successful, %1 and an identifying string at @buffer, if @buffer * is non-NULL, filling to the length stored at * (int *) @buffer. * * <0 error code on failure. */ static int ioctl_probe(struct Scsi_Host *host, void __user *buffer) { unsigned int len, slen; const char *string; if (buffer) { if (get_user(len, (unsigned int __user *) buffer)) return -EFAULT; if (host->hostt->info) string = host->hostt->info(host); else string = host->hostt->name; if (string) { slen = strlen(string); if (len > slen) len = slen + 1; if (copy_to_user(buffer, string, len)) return -EFAULT; } } return 1; } static int ioctl_internal_command(struct scsi_device *sdev, char *cmd, int timeout, int retries) { int result; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; SCSI_LOG_IOCTL(1, sdev_printk(KERN_INFO, sdev, "Trying ioctl with scsi command %d\n", *cmd)); result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, NULL, 0, timeout, retries, &exec_args); SCSI_LOG_IOCTL(2, sdev_printk(KERN_INFO, sdev, "Ioctl returned 0x%x\n", result)); if (result < 0) goto out; if (scsi_sense_valid(&sshdr)) { switch (sshdr.sense_key) { case ILLEGAL_REQUEST: if (cmd[0] == ALLOW_MEDIUM_REMOVAL) sdev->lockable = 0; else sdev_printk(KERN_INFO, sdev, "ioctl_internal_command: " "ILLEGAL REQUEST " "asc=0x%x ascq=0x%x\n", sshdr.asc, sshdr.ascq); break; case NOT_READY: /* This happens if there is no disc in drive */ if (sdev->removable) break; fallthrough; case UNIT_ATTENTION: if (sdev->removable) { sdev->changed = 1; result = 0; /* This is no longer considered an error */ break; } fallthrough; /* for non-removable media */ default: sdev_printk(KERN_INFO, sdev, "ioctl_internal_command return code = %x\n", result); scsi_print_sense_hdr(sdev, NULL, &sshdr); break; } } out: SCSI_LOG_IOCTL(2, sdev_printk(KERN_INFO, sdev, "IOCTL Releasing command\n")); return result; } /** * scsi_set_medium_removal() - send command to allow or prevent medium removal * @sdev: target scsi device * @state: removal state to set (prevent or allow) * * Returns: * * %0 if @sdev is not removable or not lockable or successful. * * non-%0 is a SCSI result code if > 0 or kernel error code if < 0. * * Sets @sdev->locked to the new state on success. */ int scsi_set_medium_removal(struct scsi_device *sdev, char state) { char scsi_cmd[MAX_COMMAND_SIZE]; int ret; if (!sdev->removable || !sdev->lockable) return 0; scsi_cmd[0] = ALLOW_MEDIUM_REMOVAL; scsi_cmd[1] = 0; scsi_cmd[2] = 0; scsi_cmd[3] = 0; scsi_cmd[4] = state; scsi_cmd[5] = 0; ret = ioctl_internal_command(sdev, scsi_cmd, IOCTL_NORMAL_TIMEOUT, NORMAL_RETRIES); if (ret == 0) sdev->locked = (state == SCSI_REMOVAL_PREVENT); return ret; } EXPORT_SYMBOL(scsi_set_medium_removal); /* * The scsi_ioctl_get_pci() function places into arg the value * pci_dev::slot_name (8 characters) for the PCI device (if any). * Returns: 0 on success * -ENXIO if there isn't a PCI device pointer * (could be because the SCSI driver hasn't been * updated yet, or because it isn't a SCSI * device) * any copy_to_user() error on failure there */ static int scsi_ioctl_get_pci(struct scsi_device *sdev, void __user *arg) { struct device *dev = scsi_get_device(sdev->host); const char *name; if (!dev) return -ENXIO; name = dev_name(dev); /* compatibility with old ioctl which only returned * 20 characters */ return copy_to_user(arg, name, min(strlen(name), (size_t)20)) ? -EFAULT: 0; } static int sg_get_version(int __user *p) { static const int sg_version_num = 30527; return put_user(sg_version_num, p); } static int sg_set_timeout(struct scsi_device *sdev, int __user *p) { int timeout, err = get_user(timeout, p); if (!err) sdev->sg_timeout = clock_t_to_jiffies(timeout); return err; } static int sg_get_reserved_size(struct scsi_device *sdev, int __user *p) { int val = min(sdev->sg_reserved_size, queue_max_bytes(sdev->request_queue)); return put_user(val, p); } static int sg_set_reserved_size(struct scsi_device *sdev, int __user *p) { int size, err = get_user(size, p); if (err) return err; if (size < 0) return -EINVAL; sdev->sg_reserved_size = min_t(unsigned int, size, queue_max_bytes(sdev->request_queue)); return 0; } /* * will always return that we are ATAPI even for a real SCSI drive, I'm not * so sure this is worth doing anything about (why would you care??) */ static int sg_emulated_host(struct request_queue *q, int __user *p) { return put_user(1, p); } static int scsi_get_idlun(struct scsi_device *sdev, void __user *argp) { struct scsi_idlun v = { .dev_id = (sdev->id & 0xff) + ((sdev->lun & 0xff) << 8) + ((sdev->channel & 0xff) << 16) + ((sdev->host->host_no & 0xff) << 24), .host_unique_id = sdev->host->unique_id }; if (copy_to_user(argp, &v, sizeof(struct scsi_idlun))) return -EFAULT; return 0; } static int scsi_send_start_stop(struct scsi_device *sdev, int data) { u8 cdb[MAX_COMMAND_SIZE] = { }; cdb[0] = START_STOP; cdb[4] = data; return ioctl_internal_command(sdev, cdb, START_STOP_TIMEOUT, NORMAL_RETRIES); } /** * scsi_cmd_allowed() - Check if the given command is allowed. * @cmd: SCSI command to check * @open_for_write: is the file / block device opened for writing? * * Only a subset of commands are allowed for unprivileged users. Commands used * to format the media, update the firmware, etc. are not permitted. * * Return: %true if the cmd is allowed, otherwise @false. */ bool scsi_cmd_allowed(unsigned char *cmd, bool open_for_write) { /* root can do any command. */ if (capable(CAP_SYS_RAWIO)) return true; /* Anybody who can open the device can do a read-safe command */ switch (cmd[0]) { /* Basic read-only commands */ case TEST_UNIT_READY: case REQUEST_SENSE: case READ_6: case READ_10: case READ_12: case READ_16: case READ_BUFFER: case READ_DEFECT_DATA: case READ_CAPACITY: /* also GPCMD_READ_CDVD_CAPACITY */ case READ_LONG: case INQUIRY: case MODE_SENSE: case MODE_SENSE_10: case LOG_SENSE: case START_STOP: case GPCMD_VERIFY_10: case VERIFY_16: case REPORT_LUNS: case SERVICE_ACTION_IN_16: case RECEIVE_DIAGNOSTIC: case MAINTENANCE_IN: /* also GPCMD_SEND_KEY, which is a write command */ case GPCMD_READ_BUFFER_CAPACITY: /* Audio CD commands */ case GPCMD_PLAY_CD: case GPCMD_PLAY_AUDIO_10: case GPCMD_PLAY_AUDIO_MSF: case GPCMD_PLAY_AUDIO_TI: case GPCMD_PAUSE_RESUME: /* CD/DVD data reading */ case GPCMD_READ_CD: case GPCMD_READ_CD_MSF: case GPCMD_READ_DISC_INFO: case GPCMD_READ_DVD_STRUCTURE: case GPCMD_READ_HEADER: case GPCMD_READ_TRACK_RZONE_INFO: case GPCMD_READ_SUBCHANNEL: case GPCMD_READ_TOC_PMA_ATIP: case GPCMD_REPORT_KEY: case GPCMD_SCAN: case GPCMD_GET_CONFIGURATION: case GPCMD_READ_FORMAT_CAPACITIES: case GPCMD_GET_EVENT_STATUS_NOTIFICATION: case GPCMD_GET_PERFORMANCE: case GPCMD_SEEK: case GPCMD_STOP_PLAY_SCAN: /* ZBC */ case ZBC_IN: return true; /* Basic writing commands */ case WRITE_6: case WRITE_10: case WRITE_VERIFY: case WRITE_12: case WRITE_VERIFY_12: case WRITE_16: case WRITE_LONG: case WRITE_LONG_2: case WRITE_SAME: case WRITE_SAME_16: case WRITE_SAME_32: case ERASE: case GPCMD_MODE_SELECT_10: case MODE_SELECT: case LOG_SELECT: case GPCMD_BLANK: case GPCMD_CLOSE_TRACK: case GPCMD_FLUSH_CACHE: case GPCMD_FORMAT_UNIT: case GPCMD_REPAIR_RZONE_TRACK: case GPCMD_RESERVE_RZONE_TRACK: case GPCMD_SEND_DVD_STRUCTURE: case GPCMD_SEND_EVENT: case GPCMD_SEND_OPC: case GPCMD_SEND_CUE_SHEET: case GPCMD_SET_SPEED: case GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL: case GPCMD_LOAD_UNLOAD: case GPCMD_SET_STREAMING: case GPCMD_SET_READ_AHEAD: /* ZBC */ case ZBC_OUT: return open_for_write; default: return false; } } EXPORT_SYMBOL(scsi_cmd_allowed); static int scsi_fill_sghdr_rq(struct scsi_device *sdev, struct request *rq, struct sg_io_hdr *hdr, bool open_for_write) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); if (hdr->cmd_len < 6) return -EMSGSIZE; if (copy_from_user(scmd->cmnd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; if (!scsi_cmd_allowed(scmd->cmnd, open_for_write)) return -EPERM; scmd->cmd_len = hdr->cmd_len; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) rq->timeout = sdev->sg_timeout; if (!rq->timeout) rq->timeout = BLK_DEFAULT_SG_TIMEOUT; if (rq->timeout < BLK_MIN_SG_TIMEOUT) rq->timeout = BLK_MIN_SG_TIMEOUT; return 0; } static int scsi_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, struct bio *bio) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); int r, ret = 0; /* * fill in all the output members */ hdr->status = scmd->result & 0xff; hdr->masked_status = sg_status_byte(scmd->result); hdr->msg_status = COMMAND_COMPLETE; hdr->host_status = host_byte(scmd->result); hdr->driver_status = 0; if (scsi_status_is_check_condition(hdr->status)) hdr->driver_status = DRIVER_SENSE; hdr->info = 0; if (hdr->masked_status || hdr->host_status || hdr->driver_status) hdr->info |= SG_INFO_CHECK; hdr->resid = scmd->resid_len; hdr->sb_len_wr = 0; if (scmd->sense_len && hdr->sbp) { int len = min((unsigned int) hdr->mx_sb_len, scmd->sense_len); if (!copy_to_user(hdr->sbp, scmd->sense_buffer, len)) hdr->sb_len_wr = len; else ret = -EFAULT; } r = blk_rq_unmap_user(bio); if (!ret) ret = r; return ret; } static int sg_io(struct scsi_device *sdev, struct sg_io_hdr *hdr, bool open_for_write) { unsigned long start_time; ssize_t ret = 0; int writing = 0; int at_head = 0; struct request *rq; struct scsi_cmnd *scmd; struct bio *bio; if (hdr->interface_id != 'S') return -EINVAL; if (hdr->dxfer_len > (queue_max_hw_sectors(sdev->request_queue) << 9)) return -EIO; if (hdr->dxfer_len) switch (hdr->dxfer_direction) { default: return -EINVAL; case SG_DXFER_TO_DEV: writing = 1; break; case SG_DXFER_TO_FROM_DEV: case SG_DXFER_FROM_DEV: break; } if (hdr->flags & SG_FLAG_Q_AT_HEAD) at_head = 1; rq = scsi_alloc_request(sdev->request_queue, writing ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); scmd = blk_mq_rq_to_pdu(rq); if (hdr->cmd_len > sizeof(scmd->cmnd)) { ret = -EINVAL; goto out_put_request; } ret = scsi_fill_sghdr_rq(sdev, rq, hdr, open_for_write); if (ret < 0) goto out_put_request; ret = blk_rq_map_user_io(rq, NULL, hdr->dxferp, hdr->dxfer_len, GFP_KERNEL, hdr->iovec_count && hdr->dxfer_len, hdr->iovec_count, 0, rq_data_dir(rq)); if (ret) goto out_put_request; bio = rq->bio; scmd->allowed = 0; start_time = jiffies; blk_execute_rq(rq, at_head); hdr->duration = jiffies_to_msecs(jiffies - start_time); ret = scsi_complete_sghdr_rq(rq, hdr, bio); out_put_request: blk_mq_free_request(rq); return ret; } /** * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl * @q: request queue to send scsi commands down * @open_for_write: is the file / block device opened for writing? * @sic: userspace structure describing the command to perform * * Send down the scsi command described by @sic to the device below * the request queue @q. * * Notes: * - This interface is deprecated - users should use the SG_IO * interface instead, as this is a more flexible approach to * performing SCSI commands on a device. * - The SCSI command length is determined by examining the 1st byte * of the given command. There is no way to override this. * - Data transfers are limited to PAGE_SIZE * - The length (x + y) must be at least OMAX_SB_LEN bytes long to * accommodate the sense buffer when an error occurs. * The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that * old code will not be surprised. * - If a Unix error occurs (e.g. ENOMEM) then the user will receive * a negative return and the Unix error code in 'errno'. * If the SCSI command succeeds then 0 is returned. * Positive numbers returned are the compacted SCSI error codes (4 * bytes in one int) where the lowest byte is the SCSI status. */ static int sg_scsi_ioctl(struct request_queue *q, bool open_for_write, struct scsi_ioctl_command __user *sic) { struct request *rq; int err; unsigned int in_len, out_len, bytes, opcode, cmdlen; struct scsi_cmnd *scmd; char *buffer = NULL; if (!sic) return -EINVAL; /* * get in an out lengths, verify they don't exceed a page worth of data */ if (get_user(in_len, &sic->inlen)) return -EFAULT; if (get_user(out_len, &sic->outlen)) return -EFAULT; if (in_len > PAGE_SIZE || out_len > PAGE_SIZE) return -EINVAL; if (get_user(opcode, &sic->data[0])) return -EFAULT; bytes = max(in_len, out_len); if (bytes) { buffer = kzalloc(bytes, GFP_NOIO | GFP_USER | __GFP_NOWARN); if (!buffer) return -ENOMEM; } rq = scsi_alloc_request(q, in_len ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto error_free_buffer; } scmd = blk_mq_rq_to_pdu(rq); cmdlen = COMMAND_SIZE(opcode); /* * get command and data to send to device, if any */ err = -EFAULT; scmd->cmd_len = cmdlen; if (copy_from_user(scmd->cmnd, sic->data, cmdlen)) goto error; if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; err = -EPERM; if (!scsi_cmd_allowed(scmd->cmnd, open_for_write)) goto error; /* default. possible overridden later */ scmd->allowed = 5; switch (opcode) { case SEND_DIAGNOSTIC: case FORMAT_UNIT: rq->timeout = FORMAT_UNIT_TIMEOUT; scmd->allowed = 1; break; case START_STOP: rq->timeout = START_STOP_TIMEOUT; break; case MOVE_MEDIUM: rq->timeout = MOVE_MEDIUM_TIMEOUT; break; case READ_ELEMENT_STATUS: rq->timeout = READ_ELEMENT_STATUS_TIMEOUT; break; case READ_DEFECT_DATA: rq->timeout = READ_DEFECT_DATA_TIMEOUT; scmd->allowed = 1; break; default: rq->timeout = BLK_DEFAULT_SG_TIMEOUT; break; } if (bytes) { err = blk_rq_map_kern(rq, buffer, bytes, GFP_NOIO); if (err) goto error; } blk_execute_rq(rq, false); err = scmd->result & 0xff; /* only 8 bit SCSI status */ if (err) { if (scmd->sense_len && scmd->sense_buffer) { /* limit sense len for backward compatibility */ if (copy_to_user(sic->data, scmd->sense_buffer, min(scmd->sense_len, 16U))) err = -EFAULT; } } else { if (copy_to_user(sic->data, buffer, out_len)) err = -EFAULT; } error: blk_mq_free_request(rq); error_free_buffer: kfree(buffer); return err; } int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) { struct compat_sg_io_hdr hdr32 = { .interface_id = hdr->interface_id, .dxfer_direction = hdr->dxfer_direction, .cmd_len = hdr->cmd_len, .mx_sb_len = hdr->mx_sb_len, .iovec_count = hdr->iovec_count, .dxfer_len = hdr->dxfer_len, .dxferp = (uintptr_t)hdr->dxferp, .cmdp = (uintptr_t)hdr->cmdp, .sbp = (uintptr_t)hdr->sbp, .timeout = hdr->timeout, .flags = hdr->flags, .pack_id = hdr->pack_id, .usr_ptr = (uintptr_t)hdr->usr_ptr, .status = hdr->status, .masked_status = hdr->masked_status, .msg_status = hdr->msg_status, .sb_len_wr = hdr->sb_len_wr, .host_status = hdr->host_status, .driver_status = hdr->driver_status, .resid = hdr->resid, .duration = hdr->duration, .info = hdr->info, }; if (copy_to_user(argp, &hdr32, sizeof(hdr32))) return -EFAULT; return 0; } #endif if (copy_to_user(argp, hdr, sizeof(*hdr))) return -EFAULT; return 0; } EXPORT_SYMBOL(put_sg_io_hdr); int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp) { #ifdef CONFIG_COMPAT struct compat_sg_io_hdr hdr32; if (in_compat_syscall()) { if (copy_from_user(&hdr32, argp, sizeof(hdr32))) return -EFAULT; *hdr = (struct sg_io_hdr) { .interface_id = hdr32.interface_id, .dxfer_direction = hdr32.dxfer_direction, .cmd_len = hdr32.cmd_len, .mx_sb_len = hdr32.mx_sb_len, .iovec_count = hdr32.iovec_count, .dxfer_len = hdr32.dxfer_len, .dxferp = compat_ptr(hdr32.dxferp), .cmdp = compat_ptr(hdr32.cmdp), .sbp = compat_ptr(hdr32.sbp), .timeout = hdr32.timeout, .flags = hdr32.flags, .pack_id = hdr32.pack_id, .usr_ptr = compat_ptr(hdr32.usr_ptr), .status = hdr32.status, .masked_status = hdr32.masked_status, .msg_status = hdr32.msg_status, .sb_len_wr = hdr32.sb_len_wr, .host_status = hdr32.host_status, .driver_status = hdr32.driver_status, .resid = hdr32.resid, .duration = hdr32.duration, .info = hdr32.info, }; return 0; } #endif if (copy_from_user(hdr, argp, sizeof(*hdr))) return -EFAULT; return 0; } EXPORT_SYMBOL(get_sg_io_hdr); #ifdef CONFIG_COMPAT struct compat_cdrom_generic_command { unsigned char cmd[CDROM_PACKET_SIZE]; compat_caddr_t buffer; compat_uint_t buflen; compat_int_t stat; compat_caddr_t sense; unsigned char data_direction; unsigned char pad[3]; compat_int_t quiet; compat_int_t timeout; compat_caddr_t unused; }; #endif static int scsi_get_cdrom_generic_arg(struct cdrom_generic_command *cgc, const void __user *arg) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) { struct compat_cdrom_generic_command cgc32; if (copy_from_user(&cgc32, arg, sizeof(cgc32))) return -EFAULT; *cgc = (struct cdrom_generic_command) { .buffer = compat_ptr(cgc32.buffer), .buflen = cgc32.buflen, .stat = cgc32.stat, .sense = compat_ptr(cgc32.sense), .data_direction = cgc32.data_direction, .quiet = cgc32.quiet, .timeout = cgc32.timeout, .unused = compat_ptr(cgc32.unused), }; memcpy(&cgc->cmd, &cgc32.cmd, CDROM_PACKET_SIZE); return 0; } #endif if (copy_from_user(cgc, arg, sizeof(*cgc))) return -EFAULT; return 0; } static int scsi_put_cdrom_generic_arg(const struct cdrom_generic_command *cgc, void __user *arg) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) { struct compat_cdrom_generic_command cgc32 = { .buffer = (uintptr_t)(cgc->buffer), .buflen = cgc->buflen, .stat = cgc->stat, .sense = (uintptr_t)(cgc->sense), .data_direction = cgc->data_direction, .quiet = cgc->quiet, .timeout = cgc->timeout, .unused = (uintptr_t)(cgc->unused), }; memcpy(&cgc32.cmd, &cgc->cmd, CDROM_PACKET_SIZE); if (copy_to_user(arg, &cgc32, sizeof(cgc32))) return -EFAULT; return 0; } #endif if (copy_to_user(arg, cgc, sizeof(*cgc))) return -EFAULT; return 0; } static int scsi_cdrom_send_packet(struct scsi_device *sdev, bool open_for_write, void __user *arg) { struct cdrom_generic_command cgc; struct sg_io_hdr hdr; int err; err = scsi_get_cdrom_generic_arg(&cgc, arg); if (err) return err; cgc.timeout = clock_t_to_jiffies(cgc.timeout); memset(&hdr, 0, sizeof(hdr)); hdr.interface_id = 'S'; hdr.cmd_len = sizeof(cgc.cmd); hdr.dxfer_len = cgc.buflen; switch (cgc.data_direction) { case CGC_DATA_UNKNOWN: hdr.dxfer_direction = SG_DXFER_UNKNOWN; break; case CGC_DATA_WRITE: hdr.dxfer_direction = SG_DXFER_TO_DEV; break; case CGC_DATA_READ: hdr.dxfer_direction = SG_DXFER_FROM_DEV; break; case CGC_DATA_NONE: hdr.dxfer_direction = SG_DXFER_NONE; break; default: return -EINVAL; } hdr.dxferp = cgc.buffer; hdr.sbp = cgc.sense; if (hdr.sbp) hdr.mx_sb_len = sizeof(struct request_sense); hdr.timeout = jiffies_to_msecs(cgc.timeout); hdr.cmdp = ((struct cdrom_generic_command __user *) arg)->cmd; hdr.cmd_len = sizeof(cgc.cmd); err = sg_io(sdev, &hdr, open_for_write); if (err == -EFAULT) return -EFAULT; if (hdr.status) return -EIO; cgc.stat = err; cgc.buflen = hdr.resid; if (scsi_put_cdrom_generic_arg(&cgc, arg)) return -EFAULT; return err; } static int scsi_ioctl_sg_io(struct scsi_device *sdev, bool open_for_write, void __user *argp) { struct sg_io_hdr hdr; int error; error = get_sg_io_hdr(&hdr, argp); if (error) return error; error = sg_io(sdev, &hdr, open_for_write); if (error == -EFAULT) return error; if (put_sg_io_hdr(&hdr, argp)) return -EFAULT; return error; } /** * scsi_ioctl - Dispatch ioctl to scsi device * @sdev: scsi device receiving ioctl * @open_for_write: is the file / block device opened for writing? * @cmd: which ioctl is it * @arg: data associated with ioctl * * Description: The scsi_ioctl() function differs from most ioctls in that it * does not take a major/minor number as the dev field. Rather, it takes * a pointer to a &struct scsi_device. * * Return: varies depending on the @cmd */ int scsi_ioctl(struct scsi_device *sdev, bool open_for_write, int cmd, void __user *arg) { struct request_queue *q = sdev->request_queue; struct scsi_sense_hdr sense_hdr; /* Check for deprecated ioctls ... all the ioctls which don't * follow the new unique numbering scheme are deprecated */ switch (cmd) { case SCSI_IOCTL_SEND_COMMAND: case SCSI_IOCTL_TEST_UNIT_READY: case SCSI_IOCTL_BENCHMARK_COMMAND: case SCSI_IOCTL_SYNC: case SCSI_IOCTL_START_UNIT: case SCSI_IOCTL_STOP_UNIT: printk(KERN_WARNING "program %s is using a deprecated SCSI " "ioctl, please convert it to SG_IO\n", current->comm); break; default: break; } switch (cmd) { case SG_GET_VERSION_NUM: return sg_get_version(arg); case SG_SET_TIMEOUT: return sg_set_timeout(sdev, arg); case SG_GET_TIMEOUT: return jiffies_to_clock_t(sdev->sg_timeout); case SG_GET_RESERVED_SIZE: return sg_get_reserved_size(sdev, arg); case SG_SET_RESERVED_SIZE: return sg_set_reserved_size(sdev, arg); case SG_EMULATED_HOST: return sg_emulated_host(q, arg); case SG_IO: return scsi_ioctl_sg_io(sdev, open_for_write, arg); case SCSI_IOCTL_SEND_COMMAND: return sg_scsi_ioctl(q, open_for_write, arg); case CDROM_SEND_PACKET: return scsi_cdrom_send_packet(sdev, open_for_write, arg); case CDROMCLOSETRAY: return scsi_send_start_stop(sdev, 3); case CDROMEJECT: return scsi_send_start_stop(sdev, 2); case SCSI_IOCTL_GET_IDLUN: return scsi_get_idlun(sdev, arg); case SCSI_IOCTL_GET_BUS_NUMBER: return put_user(sdev->host->host_no, (int __user *)arg); case SCSI_IOCTL_PROBE_HOST: return ioctl_probe(sdev->host, arg); case SCSI_IOCTL_DOORLOCK: return scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT); case SCSI_IOCTL_DOORUNLOCK: return scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW); case SCSI_IOCTL_TEST_UNIT_READY: return scsi_test_unit_ready(sdev, IOCTL_NORMAL_TIMEOUT, NORMAL_RETRIES, &sense_hdr); case SCSI_IOCTL_START_UNIT: return scsi_send_start_stop(sdev, 1); case SCSI_IOCTL_STOP_UNIT: return scsi_send_start_stop(sdev, 0); case SCSI_IOCTL_GET_PCI: return scsi_ioctl_get_pci(sdev, arg); case SG_SCSI_RESET: return scsi_ioctl_reset(sdev, arg); } #ifdef CONFIG_COMPAT if (in_compat_syscall()) { if (!sdev->host->hostt->compat_ioctl) return -EINVAL; return sdev->host->hostt->compat_ioctl(sdev, cmd, arg); } #endif if (!sdev->host->hostt->ioctl) return -EINVAL; return sdev->host->hostt->ioctl(sdev, cmd, arg); } EXPORT_SYMBOL(scsi_ioctl); /** * scsi_ioctl_block_when_processing_errors - prevent commands from being queued * @sdev: target scsi device * @cmd: which ioctl is it * @ndelay: no delay (non-blocking) * * We can process a reset even when a device isn't fully operable. * * Return: %0 on success, <0 error code. */ int scsi_ioctl_block_when_processing_errors(struct scsi_device *sdev, int cmd, bool ndelay) { if (cmd == SG_SCSI_RESET && ndelay) { if (scsi_host_in_recovery(sdev->host)) return -EAGAIN; } else { if (!scsi_block_when_processing_errors(sdev)) return -ENODEV; } return 0; } EXPORT_SYMBOL_GPL(scsi_ioctl_block_when_processing_errors);
4 13 212 463 463 635 8 75 578 275 15 65 606 247 440 13 475 478 870 14 590 371 385 10 15 491 582 364 655 232 230 402 6 6 24 19 534 410 141 231 536 302 643 143 122 19 3 6 126 4 111 84 84 85 24 55 747 730 723 115 115 19 44 45 45 45 71 71 71 4 71 71 71 57 58 58 57 352 216 236 85 85 4 4 78 49 167 167 40 338 21 362 197 196 70 4 37 88 59 3 89 90 90 56 56 42 56 1 10 8 3 9 7 8 9 487 33 543 71 4 71 72 53 60 56 12 12 10 66 10 12 12 10 12 28 25 25 26 26 10 26 18 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/seq_file.c * * helper functions for making synthetic files from sequences of records. * initial implementation -- AV, Oct 2001. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cache.h> #include <linux/fs.h> #include <linux/export.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/mm.h> #include <linux/printk.h> #include <linux/string_helpers.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <asm/page.h> static struct kmem_cache *seq_file_cache __ro_after_init; static void seq_set_overflow(struct seq_file *m) { m->count = m->size; } static void *seq_buf_alloc(unsigned long size) { if (unlikely(size > MAX_RW_COUNT)) return NULL; return kvmalloc(size, GFP_KERNEL_ACCOUNT); } /** * seq_open - initialize sequential file * @file: file we initialize * @op: method table describing the sequence * * seq_open() sets @file, associating it with a sequence described * by @op. @op->start() sets the iterator up and returns the first * element of sequence. @op->stop() shuts it down. @op->next() * returns the next element of sequence. @op->show() prints element * into the buffer. In case of error ->start() and ->next() return * ERR_PTR(error). In the end of sequence they return %NULL. ->show() * returns 0 in case of success and negative number in case of error. * Returning SEQ_SKIP means "discard this element and move on". * Note: seq_open() will allocate a struct seq_file and store its * pointer in @file->private_data. This pointer should not be modified. */ int seq_open(struct file *file, const struct seq_operations *op) { struct seq_file *p; WARN_ON(file->private_data); p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL); if (!p) return -ENOMEM; file->private_data = p; mutex_init(&p->lock); p->op = op; // No refcounting: the lifetime of 'p' is constrained // to the lifetime of the file. p->file = file; /* * seq_files support lseek() and pread(). They do not implement * write() at all, but we clear FMODE_PWRITE here for historical * reasons. * * If a client of seq_files a) implements file.write() and b) wishes to * support pwrite() then that client will need to implement its own * file.open() which calls seq_open() and then sets FMODE_PWRITE. */ file->f_mode &= ~FMODE_PWRITE; return 0; } EXPORT_SYMBOL(seq_open); static int traverse(struct seq_file *m, loff_t offset) { loff_t pos = 0; int error = 0; void *p; m->index = 0; m->count = m->from = 0; if (!offset) return 0; if (!m->buf) { m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } p = m->op->start(m, &m->index); while (p) { error = PTR_ERR(p); if (IS_ERR(p)) break; error = m->op->show(m, p); if (error < 0) break; if (unlikely(error)) { error = 0; m->count = 0; } if (seq_has_overflowed(m)) goto Eoverflow; p = m->op->next(m, p, &m->index); if (pos + m->count > offset) { m->from = offset - pos; m->count -= m->from; break; } pos += m->count; m->count = 0; if (pos == offset) break; } m->op->stop(m, p); return error; Eoverflow: m->op->stop(m, p); kvfree(m->buf); m->count = 0; m->buf = seq_buf_alloc(m->size <<= 1); return !m->buf ? -ENOMEM : -EAGAIN; } /** * seq_read - ->read() method for sequential files. * @file: the file to read from * @buf: the buffer to read to * @size: the maximum number of bytes to read * @ppos: the current position in the file * * Ready-made ->f_op->read() */ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = size}; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, file); iov_iter_init(&iter, ITER_DEST, &iov, 1, size); kiocb.ki_pos = *ppos; ret = seq_read_iter(&kiocb, &iter); *ppos = kiocb.ki_pos; return ret; } EXPORT_SYMBOL(seq_read); /* * Ready-made ->f_op->read_iter() */ ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct seq_file *m = iocb->ki_filp->private_data; size_t copied = 0; size_t n; void *p; int err = 0; if (!iov_iter_count(iter)) return 0; mutex_lock(&m->lock); /* * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ if (iocb->ki_pos == 0) { m->index = 0; m->count = 0; } /* Don't assume ki_pos is where we left it */ if (unlikely(iocb->ki_pos != m->read_pos)) { while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN) ; if (err) { /* With prejudice... */ m->read_pos = 0; m->index = 0; m->count = 0; goto Done; } else { m->read_pos = iocb->ki_pos; } } /* grab buffer if we didn't have one */ if (!m->buf) { m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) goto Enomem; } // something left in the buffer - copy it out first if (m->count) { n = copy_to_iter(m->buf + m->from, m->count, iter); m->count -= n; m->from += n; copied += n; if (m->count) // hadn't managed to copy everything goto Done; } // get a non-empty record in the buffer m->from = 0; p = m->op->start(m, &m->index); while (1) { err = PTR_ERR(p); if (!p || IS_ERR(p)) // EOF or an error break; err = m->op->show(m, p); if (err < 0) // hard error break; if (unlikely(err)) // ->show() says "skip it" m->count = 0; if (unlikely(!m->count)) { // empty record p = m->op->next(m, p, &m->index); continue; } if (!seq_has_overflowed(m)) // got it goto Fill; // need a bigger buffer m->op->stop(m, p); kvfree(m->buf); m->count = 0; m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; p = m->op->start(m, &m->index); } // EOF or an error m->op->stop(m, p); m->count = 0; goto Done; Fill: // one non-empty record is in the buffer; if they want more, // try to fit more in, but in any case we need to advance // the iterator once for every record shown. while (1) { size_t offs = m->count; loff_t pos = m->index; p = m->op->next(m, p, &m->index); if (pos == m->index) { pr_info_ratelimited("buggy .next function %ps did not update position index\n", m->op->next); m->index++; } if (!p || IS_ERR(p)) // no next record for us break; if (m->count >= iov_iter_count(iter)) break; err = m->op->show(m, p); if (err > 0) { // ->show() says "skip it" m->count = offs; } else if (err || seq_has_overflowed(m)) { m->count = offs; break; } } m->op->stop(m, p); n = copy_to_iter(m->buf, m->count, iter); copied += n; m->count -= n; m->from = n; Done: if (unlikely(!copied)) { copied = m->count ? -EFAULT : err; } else { iocb->ki_pos += copied; m->read_pos += copied; } mutex_unlock(&m->lock); return copied; Enomem: err = -ENOMEM; goto Done; } EXPORT_SYMBOL(seq_read_iter); /** * seq_lseek - ->llseek() method for sequential files. * @file: the file in question * @offset: new position * @whence: 0 for absolute, 1 for relative position * * Ready-made ->f_op->llseek() */ loff_t seq_lseek(struct file *file, loff_t offset, int whence) { struct seq_file *m = file->private_data; loff_t retval = -EINVAL; mutex_lock(&m->lock); switch (whence) { case SEEK_CUR: offset += file->f_pos; fallthrough; case SEEK_SET: if (offset < 0) break; retval = offset; if (offset != m->read_pos) { while ((retval = traverse(m, offset)) == -EAGAIN) ; if (retval) { /* with extreme prejudice... */ file->f_pos = 0; m->read_pos = 0; m->index = 0; m->count = 0; } else { m->read_pos = offset; retval = file->f_pos = offset; } } else { file->f_pos = offset; } } mutex_unlock(&m->lock); return retval; } EXPORT_SYMBOL(seq_lseek); /** * seq_release - free the structures associated with sequential file. * @inode: its inode * @file: file in question * * Frees the structures associated with sequential file; can be used * as ->f_op->release() if you don't have private data to destroy. */ int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; kvfree(m->buf); kmem_cache_free(seq_file_cache, m); return 0; } EXPORT_SYMBOL(seq_release); /** * seq_escape_mem - print data into buffer, escaping some characters * @m: target buffer * @src: source buffer * @len: size of source buffer * @flags: flags to pass to string_escape_mem() * @esc: set of characters that need escaping * * Puts data into buffer, replacing each occurrence of character from * given class (defined by @flags and @esc) with printable escaped sequence. * * Use seq_has_overflowed() to check for errors. */ void seq_escape_mem(struct seq_file *m, const char *src, size_t len, unsigned int flags, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int ret; ret = string_escape_mem(src, len, buf, size, flags, esc); seq_commit(m, ret < size ? ret : -1); } EXPORT_SYMBOL(seq_escape_mem); void seq_vprintf(struct seq_file *m, const char *f, va_list args) { int len; if (m->count < m->size) { len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); if (m->count + len < m->size) { m->count += len; return; } } seq_set_overflow(m); } EXPORT_SYMBOL(seq_vprintf); void seq_printf(struct seq_file *m, const char *f, ...) { va_list args; va_start(args, f); seq_vprintf(m, f, args); va_end(args); } EXPORT_SYMBOL(seq_printf); #ifdef CONFIG_BINARY_PRINTF void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary) { int len; if (m->count < m->size) { len = bstr_printf(m->buf + m->count, m->size - m->count, f, binary); if (m->count + len < m->size) { m->count += len; return; } } seq_set_overflow(m); } EXPORT_SYMBOL(seq_bprintf); #endif /* CONFIG_BINARY_PRINTF */ /** * mangle_path - mangle and copy path to buffer beginning * @s: buffer start * @p: beginning of path in above buffer * @esc: set of characters that need escaping * * Copy the path from @p to @s, replacing each occurrence of character from * @esc with usual octal escape. * Returns pointer past last written character in @s, or NULL in case of * failure. */ char *mangle_path(char *s, const char *p, const char *esc) { while (s <= p) { char c = *p++; if (!c) { return s; } else if (!strchr(esc, c)) { *s++ = c; } else if (s + 4 > p) { break; } else { *s++ = '\\'; *s++ = '0' + ((c & 0300) >> 6); *s++ = '0' + ((c & 070) >> 3); *s++ = '0' + (c & 07); } } return NULL; } EXPORT_SYMBOL(mangle_path); /** * seq_path - seq_file interface to print a pathname * @m: the seq_file handle * @path: the struct path to print * @esc: set of characters to escape in the output * * return the absolute path of 'path', as represented by the * dentry / mnt pair in the path parameter. */ int seq_path(struct seq_file *m, const struct path *path, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -1; if (size) { char *p = d_path(path, buf, size); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; } } seq_commit(m, res); return res; } EXPORT_SYMBOL(seq_path); /** * seq_file_path - seq_file interface to print a pathname of a file * @m: the seq_file handle * @file: the struct file to print * @esc: set of characters to escape in the output * * return the absolute path to the file. */ int seq_file_path(struct seq_file *m, struct file *file, const char *esc) { return seq_path(m, &file->f_path, esc); } EXPORT_SYMBOL(seq_file_path); /* * Same as seq_path, but relative to supplied root. */ int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -ENAMETOOLONG; if (size) { char *p; p = __d_path(path, root, buf, size); if (!p) return SEQ_SKIP; res = PTR_ERR(p); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; else res = -ENAMETOOLONG; } } seq_commit(m, res); return res < 0 && res != -ENAMETOOLONG ? res : 0; } /* * returns the path of the 'dentry' from the root of its filesystem. */ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -1; if (size) { char *p = dentry_path(dentry, buf, size); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; } } seq_commit(m, res); return res; } EXPORT_SYMBOL(seq_dentry); void *single_start(struct seq_file *p, loff_t *pos) { return *pos ? NULL : SEQ_START_TOKEN; } static void *single_next(struct seq_file *p, void *v, loff_t *pos) { ++*pos; return NULL; } static void single_stop(struct seq_file *p, void *v) { } int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT); int res = -ENOMEM; if (op) { op->start = single_start; op->next = single_next; op->stop = single_stop; op->show = show; res = seq_open(file, op); if (!res) ((struct seq_file *)file->private_data)->private = data; else kfree(op); } return res; } EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; ((struct seq_file *)file->private_data)->size = size; return 0; } EXPORT_SYMBOL(single_open_size); int single_release(struct inode *inode, struct file *file) { const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; int res = seq_release(inode, file); kfree(op); return res; } EXPORT_SYMBOL(single_release); int seq_release_private(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; kfree(seq->private); seq->private = NULL; return seq_release(inode, file); } EXPORT_SYMBOL(seq_release_private); void *__seq_open_private(struct file *f, const struct seq_operations *ops, int psize) { int rc; void *private; struct seq_file *seq; private = kzalloc(psize, GFP_KERNEL_ACCOUNT); if (private == NULL) goto out; rc = seq_open(f, ops); if (rc < 0) goto out_free; seq = f->private_data; seq->private = private; return private; out_free: kfree(private); out: return NULL; } EXPORT_SYMBOL(__seq_open_private); int seq_open_private(struct file *filp, const struct seq_operations *ops, int psize) { return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM; } EXPORT_SYMBOL(seq_open_private); void seq_putc(struct seq_file *m, char c) { if (m->count >= m->size) return; m->buf[m->count++] = c; } EXPORT_SYMBOL(seq_putc); void __seq_puts(struct seq_file *m, const char *s) { seq_write(m, s, strlen(s)); } EXPORT_SYMBOL(__seq_puts); /** * seq_put_decimal_ull_width - A helper routine for putting decimal numbers * without rich format of printf(). * only 'unsigned long long' is supported. * @m: seq_file identifying the buffer to which data should be written * @delimiter: a string which is printed before the number * @num: the number * @width: a minimum field width * * This routine will put strlen(delimiter) + number into seq_filed. * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, unsigned long long num, unsigned int width) { int len; if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ goto overflow; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } if (!width) width = 1; if (m->count + width >= m->size) goto overflow; len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; m->count += len; return; overflow: seq_set_overflow(m); } void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num) { return seq_put_decimal_ull_width(m, delimiter, num, 0); } EXPORT_SYMBOL(seq_put_decimal_ull); /** * seq_put_hex_ll - put a number in hexadecimal notation * @m: seq_file identifying the buffer to which data should be written * @delimiter: a string which is printed before the number * @v: the number * @width: a minimum field width * * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v) * * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ void seq_put_hex_ll(struct seq_file *m, const char *delimiter, unsigned long long v, unsigned int width) { unsigned int len; int i; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } /* If x is 0, the result of __builtin_clzll is undefined */ if (v == 0) len = 1; else len = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4; if (len < width) len = width; if (m->count + len > m->size) { seq_set_overflow(m); return; } for (i = len - 1; i >= 0; i--) { m->buf[m->count + i] = hex_asc[0xf & v]; v = v >> 4; } m->count += len; } void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { int len; if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */ goto overflow; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } if (m->count + 2 >= m->size) goto overflow; if (num < 0) { m->buf[m->count++] = '-'; num = -num; } if (num < 10) { m->buf[m->count++] = num + '0'; return; } len = num_to_str(m->buf + m->count, m->size - m->count, num, 0); if (!len) goto overflow; m->count += len; return; overflow: seq_set_overflow(m); } EXPORT_SYMBOL(seq_put_decimal_ll); /** * seq_write - write arbitrary data to buffer * @seq: seq_file identifying the buffer to which data should be written * @data: data address * @len: number of bytes * * Return 0 on success, non-zero otherwise. */ int seq_write(struct seq_file *seq, const void *data, size_t len) { if (seq->count + len < seq->size) { memcpy(seq->buf + seq->count, data, len); seq->count += len; return 0; } seq_set_overflow(seq); return -1; } EXPORT_SYMBOL(seq_write); /** * seq_pad - write padding spaces to buffer * @m: seq_file identifying the buffer to which data should be written * @c: the byte to append after padding if non-zero */ void seq_pad(struct seq_file *m, char c) { int size = m->pad_until - m->count; if (size > 0) { if (size + m->count > m->size) { seq_set_overflow(m); return; } memset(m->buf + m->count, ' ', size); m->count += size; } if (c) seq_putc(m, c); } EXPORT_SYMBOL(seq_pad); /* A complete analogue of print_hex_dump() */ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; char *buffer; size_t size; int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; switch (prefix_type) { case DUMP_PREFIX_ADDRESS: seq_printf(m, "%s%p: ", prefix_str, ptr + i); break; case DUMP_PREFIX_OFFSET: seq_printf(m, "%s%.8x: ", prefix_str, i); break; default: seq_printf(m, "%s", prefix_str); break; } size = seq_get_buf(m, &buffer); ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, buffer, size, ascii); seq_commit(m, ret < size ? ret : -1); seq_putc(m, '\n'); } } EXPORT_SYMBOL(seq_hex_dump); struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; list_for_each(lh, head) if (pos-- == 0) return lh; return NULL; } EXPORT_SYMBOL(seq_list_start); struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) { if (!pos) return head; return seq_list_start(head, pos - 1); } EXPORT_SYMBOL(seq_list_start_head); struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) { struct list_head *lh; lh = ((struct list_head *)v)->next; ++*ppos; return lh == head ? NULL : lh; } EXPORT_SYMBOL(seq_list_next); struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos) { struct list_head *lh; list_for_each_rcu(lh, head) if (pos-- == 0) return lh; return NULL; } EXPORT_SYMBOL(seq_list_start_rcu); struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos) { if (!pos) return head; return seq_list_start_rcu(head, pos - 1); } EXPORT_SYMBOL(seq_list_start_head_rcu); struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos) { struct list_head *lh; lh = list_next_rcu((struct list_head *)v); ++*ppos; return lh == head ? NULL : lh; } EXPORT_SYMBOL(seq_list_next_rcu); /** * seq_hlist_start - start an iteration of a hlist * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). */ struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos) { struct hlist_node *node; hlist_for_each(node, head) if (pos-- == 0) return node; return NULL; } EXPORT_SYMBOL(seq_hlist_start); /** * seq_hlist_start_head - start an iteration of a hlist * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). Call this function if you want to * print a header at the top of the output. */ struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos) { if (!pos) return SEQ_START_TOKEN; return seq_hlist_start(head, pos - 1); } EXPORT_SYMBOL(seq_hlist_start_head); /** * seq_hlist_next - move to the next position of the hlist * @v: the current iterator * @head: the head of the hlist * @ppos: the current position * * Called at seq_file->op->next(). */ struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, loff_t *ppos) { struct hlist_node *node = v; ++*ppos; if (v == SEQ_START_TOKEN) return head->first; else return node->next; } EXPORT_SYMBOL(seq_hlist_next); /** * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, loff_t pos) { struct hlist_node *node; __hlist_for_each_rcu(node, head) if (pos-- == 0) return node; return NULL; } EXPORT_SYMBOL(seq_hlist_start_rcu); /** * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). Call this function if you want to * print a header at the top of the output. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, loff_t pos) { if (!pos) return SEQ_START_TOKEN; return seq_hlist_start_rcu(head, pos - 1); } EXPORT_SYMBOL(seq_hlist_start_head_rcu); /** * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU * @v: the current iterator * @head: the head of the hlist * @ppos: the current position * * Called at seq_file->op->next(). * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_next_rcu(void *v, struct hlist_head *head, loff_t *ppos) { struct hlist_node *node = v; ++*ppos; if (v == SEQ_START_TOKEN) return rcu_dereference(head->first); else return rcu_dereference(node->next); } EXPORT_SYMBOL(seq_hlist_next_rcu); /** * seq_hlist_start_percpu - start an iteration of a percpu hlist array * @head: pointer to percpu array of struct hlist_heads * @cpu: pointer to cpu "cursor" * @pos: start position of sequence * * Called at seq_file->op->start(). */ struct hlist_node * seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos) { struct hlist_node *node; for_each_possible_cpu(*cpu) { hlist_for_each(node, per_cpu_ptr(head, *cpu)) { if (pos-- == 0) return node; } } return NULL; } EXPORT_SYMBOL(seq_hlist_start_percpu); /** * seq_hlist_next_percpu - move to the next position of the percpu hlist array * @v: pointer to current hlist_node * @head: pointer to percpu array of struct hlist_heads * @cpu: pointer to cpu "cursor" * @pos: start position of sequence * * Called at seq_file->op->next(). */ struct hlist_node * seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos) { struct hlist_node *node = v; ++*pos; if (node->next) return node->next; for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids; *cpu = cpumask_next(*cpu, cpu_possible_mask)) { struct hlist_head *bucket = per_cpu_ptr(head, *cpu); if (!hlist_empty(bucket)) return bucket->first; } return NULL; } EXPORT_SYMBOL(seq_hlist_next_percpu); void __init seq_file_init(void) { seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC); }
18 22 22 10 18 18 15 15 15 15 15 18 18 18 18 15 14 2 15 15 15 14 15 15 15 15 15 15 15 15 18 15 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 // SPDX-License-Identifier: GPL-2.0-or-later /* * RTC class driver for "CMOS RTC": PCs, ACPI, etc * * Copyright (C) 1996 Paul Gortmaker (drivers/char/rtc.c) * Copyright (C) 2006 David Brownell (convert to new framework) */ /* * The original "cmos clock" chip was an MC146818 chip, now obsolete. * That defined the register interface now provided by all PCs, some * non-PC systems, and incorporated into ACPI. Modern PC chipsets * integrate an MC146818 clone in their southbridge, and boards use * that instead of discrete clones like the DS12887 or M48T86. There * are also clones that connect using the LPC bus. * * That register API is also used directly by various other drivers * (notably for integrated NVRAM), infrastructure (x86 has code to * bypass the RTC framework, directly reading the RTC during boot * and updating minutes/seconds for systems using NTP synch) and * utilities (like userspace 'hwclock', if no /dev node exists). * * So **ALL** calls to CMOS_READ and CMOS_WRITE must be done with * interrupts disabled, holding the global rtc_lock, to exclude those * other drivers and utilities on correctly configured systems. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/platform_device.h> #include <linux/log2.h> #include <linux/pm.h> #include <linux/of.h> #include <linux/of_platform.h> #ifdef CONFIG_X86 #include <asm/i8259.h> #include <asm/processor.h> #include <linux/dmi.h> #endif /* this is for "generic access to PC-style RTC" using CMOS_READ/CMOS_WRITE */ #include <linux/mc146818rtc.h> #ifdef CONFIG_ACPI /* * Use ACPI SCI to replace HPET interrupt for RTC Alarm event * * If cleared, ACPI SCI is only used to wake up the system from suspend * * If set, ACPI SCI is used to handle UIE/AIE and system wakeup */ static bool use_acpi_alarm; module_param(use_acpi_alarm, bool, 0444); static inline int cmos_use_acpi_alarm(void) { return use_acpi_alarm; } #else /* !CONFIG_ACPI */ static inline int cmos_use_acpi_alarm(void) { return 0; } #endif struct cmos_rtc { struct rtc_device *rtc; struct device *dev; int irq; struct resource *iomem; time64_t alarm_expires; void (*wake_on)(struct device *); void (*wake_off)(struct device *); u8 enabled_wake; u8 suspend_ctrl; /* newer hardware extends the original register set */ u8 day_alrm; u8 mon_alrm; u8 century; struct rtc_wkalrm saved_wkalrm; }; /* both platform and pnp busses use negative numbers for invalid irqs */ #define is_valid_irq(n) ((n) > 0) static const char driver_name[] = "rtc_cmos"; /* The RTC_INTR register may have e.g. RTC_PF set even if RTC_PIE is clear; * always mask it against the irq enable bits in RTC_CONTROL. Bit values * are the same: PF==PIE, AF=AIE, UF=UIE; so RTC_IRQMASK works with both. */ #define RTC_IRQMASK (RTC_PF | RTC_AF | RTC_UF) static inline int is_intr(u8 rtc_intr) { if (!(rtc_intr & RTC_IRQF)) return 0; return rtc_intr & RTC_IRQMASK; } /*----------------------------------------------------------------*/ /* Much modern x86 hardware has HPETs (10+ MHz timers) which, because * many BIOS programmers don't set up "sane mode" IRQ routing, are mostly * used in a broken "legacy replacement" mode. The breakage includes * HPET #1 hijacking the IRQ for this RTC, and being unavailable for * other (better) use. * * When that broken mode is in use, platform glue provides a partial * emulation of hardware RTC IRQ facilities using HPET #1. We don't * want to use HPET for anything except those IRQs though... */ #ifdef CONFIG_HPET_EMULATE_RTC #include <asm/hpet.h> #else static inline int is_hpet_enabled(void) { return 0; } static inline int hpet_mask_rtc_irq_bit(unsigned long mask) { return 0; } static inline int hpet_set_rtc_irq_bit(unsigned long mask) { return 0; } static inline int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) { return 0; } static inline int hpet_set_periodic_freq(unsigned long freq) { return 0; } static inline int hpet_rtc_timer_init(void) { return 0; } extern irq_handler_t hpet_rtc_interrupt; static inline int hpet_register_irq_handler(irq_handler_t handler) { return 0; } static inline int hpet_unregister_irq_handler(irq_handler_t handler) { return 0; } #endif /* Don't use HPET for RTC Alarm event if ACPI Fixed event is used */ static inline int use_hpet_alarm(void) { return is_hpet_enabled() && !cmos_use_acpi_alarm(); } /*----------------------------------------------------------------*/ #ifdef RTC_PORT /* Most newer x86 systems have two register banks, the first used * for RTC and NVRAM and the second only for NVRAM. Caller must * own rtc_lock ... and we won't worry about access during NMI. */ #define can_bank2 true static inline unsigned char cmos_read_bank2(unsigned char addr) { outb(addr, RTC_PORT(2)); return inb(RTC_PORT(3)); } static inline void cmos_write_bank2(unsigned char val, unsigned char addr) { outb(addr, RTC_PORT(2)); outb(val, RTC_PORT(3)); } #else #define can_bank2 false static inline unsigned char cmos_read_bank2(unsigned char addr) { return 0; } static inline void cmos_write_bank2(unsigned char val, unsigned char addr) { } #endif /*----------------------------------------------------------------*/ static int cmos_read_time(struct device *dev, struct rtc_time *t) { int ret; /* * If pm_trace abused the RTC for storage, set the timespec to 0, * which tells the caller that this RTC value is unusable. */ if (!pm_trace_rtc_valid()) return -EIO; ret = mc146818_get_time(t, 1000); if (ret < 0) { dev_err_ratelimited(dev, "unable to read current time\n"); return ret; } return 0; } static int cmos_set_time(struct device *dev, struct rtc_time *t) { /* NOTE: this ignores the issue whereby updating the seconds * takes effect exactly 500ms after we write the register. * (Also queueing and other delays before we get this far.) */ return mc146818_set_time(t); } struct cmos_read_alarm_callback_param { struct cmos_rtc *cmos; struct rtc_time *time; unsigned char rtc_control; }; static void cmos_read_alarm_callback(unsigned char __always_unused seconds, void *param_in) { struct cmos_read_alarm_callback_param *p = (struct cmos_read_alarm_callback_param *)param_in; struct rtc_time *time = p->time; time->tm_sec = CMOS_READ(RTC_SECONDS_ALARM); time->tm_min = CMOS_READ(RTC_MINUTES_ALARM); time->tm_hour = CMOS_READ(RTC_HOURS_ALARM); if (p->cmos->day_alrm) { /* ignore upper bits on readback per ACPI spec */ time->tm_mday = CMOS_READ(p->cmos->day_alrm) & 0x3f; if (!time->tm_mday) time->tm_mday = -1; if (p->cmos->mon_alrm) { time->tm_mon = CMOS_READ(p->cmos->mon_alrm); if (!time->tm_mon) time->tm_mon = -1; } } p->rtc_control = CMOS_READ(RTC_CONTROL); } static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t) { struct cmos_rtc *cmos = dev_get_drvdata(dev); struct cmos_read_alarm_callback_param p = { .cmos = cmos, .time = &t->time, }; /* This not only a rtc_op, but also called directly */ if (!is_valid_irq(cmos->irq)) return -ETIMEDOUT; /* Basic alarms only support hour, minute, and seconds fields. * Some also support day and month, for alarms up to a year in * the future. */ /* Some Intel chipsets disconnect the alarm registers when the clock * update is in progress - during this time reads return bogus values * and writes may fail silently. See for example "7th Generation Intel® * Processor Family I/O for U/Y Platforms [...] Datasheet", section * 27.7.1 * * Use the mc146818_avoid_UIP() function to avoid this. */ if (!mc146818_avoid_UIP(cmos_read_alarm_callback, 10, &p)) return -EIO; if (!(p.rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { if (((unsigned)t->time.tm_sec) < 0x60) t->time.tm_sec = bcd2bin(t->time.tm_sec); else t->time.tm_sec = -1; if (((unsigned)t->time.tm_min) < 0x60) t->time.tm_min = bcd2bin(t->time.tm_min); else t->time.tm_min = -1; if (((unsigned)t->time.tm_hour) < 0x24) t->time.tm_hour = bcd2bin(t->time.tm_hour); else t->time.tm_hour = -1; if (cmos->day_alrm) { if (((unsigned)t->time.tm_mday) <= 0x31) t->time.tm_mday = bcd2bin(t->time.tm_mday); else t->time.tm_mday = -1; if (cmos->mon_alrm) { if (((unsigned)t->time.tm_mon) <= 0x12) t->time.tm_mon = bcd2bin(t->time.tm_mon)-1; else t->time.tm_mon = -1; } } } t->enabled = !!(p.rtc_control & RTC_AIE); t->pending = 0; return 0; } static void cmos_checkintr(struct cmos_rtc *cmos, unsigned char rtc_control) { unsigned char rtc_intr; /* NOTE after changing RTC_xIE bits we always read INTR_FLAGS; * allegedly some older rtcs need that to handle irqs properly */ rtc_intr = CMOS_READ(RTC_INTR_FLAGS); if (use_hpet_alarm()) return; rtc_intr &= (rtc_control & RTC_IRQMASK) | RTC_IRQF; if (is_intr(rtc_intr)) rtc_update_irq(cmos->rtc, 1, rtc_intr); } static void cmos_irq_enable(struct cmos_rtc *cmos, unsigned char mask) { unsigned char rtc_control; /* flush any pending IRQ status, notably for update irqs, * before we enable new IRQs */ rtc_control = CMOS_READ(RTC_CONTROL); cmos_checkintr(cmos, rtc_control); rtc_control |= mask; CMOS_WRITE(rtc_control, RTC_CONTROL); if (use_hpet_alarm()) hpet_set_rtc_irq_bit(mask); if ((mask & RTC_AIE) && cmos_use_acpi_alarm()) { if (cmos->wake_on) cmos->wake_on(cmos->dev); } cmos_checkintr(cmos, rtc_control); } static void cmos_irq_disable(struct cmos_rtc *cmos, unsigned char mask) { unsigned char rtc_control; rtc_control = CMOS_READ(RTC_CONTROL); rtc_control &= ~mask; CMOS_WRITE(rtc_control, RTC_CONTROL); if (use_hpet_alarm()) hpet_mask_rtc_irq_bit(mask); if ((mask & RTC_AIE) && cmos_use_acpi_alarm()) { if (cmos->wake_off) cmos->wake_off(cmos->dev); } cmos_checkintr(cmos, rtc_control); } static int cmos_validate_alarm(struct device *dev, struct rtc_wkalrm *t) { struct cmos_rtc *cmos = dev_get_drvdata(dev); struct rtc_time now; cmos_read_time(dev, &now); if (!cmos->day_alrm) { time64_t t_max_date; time64_t t_alrm; t_max_date = rtc_tm_to_time64(&now); t_max_date += 24 * 60 * 60 - 1; t_alrm = rtc_tm_to_time64(&t->time); if (t_alrm > t_max_date) { dev_err(dev, "Alarms can be up to one day in the future\n"); return -EINVAL; } } else if (!cmos->mon_alrm) { struct rtc_time max_date = now; time64_t t_max_date; time64_t t_alrm; int max_mday; if (max_date.tm_mon == 11) { max_date.tm_mon = 0; max_date.tm_year += 1; } else { max_date.tm_mon += 1; } max_mday = rtc_month_days(max_date.tm_mon, max_date.tm_year); if (max_date.tm_mday > max_mday) max_date.tm_mday = max_mday; t_max_date = rtc_tm_to_time64(&max_date); t_max_date -= 1; t_alrm = rtc_tm_to_time64(&t->time); if (t_alrm > t_max_date) { dev_err(dev, "Alarms can be up to one month in the future\n"); return -EINVAL; } } else { struct rtc_time max_date = now; time64_t t_max_date; time64_t t_alrm; int max_mday; max_date.tm_year += 1; max_mday = rtc_month_days(max_date.tm_mon, max_date.tm_year); if (max_date.tm_mday > max_mday) max_date.tm_mday = max_mday; t_max_date = rtc_tm_to_time64(&max_date); t_max_date -= 1; t_alrm = rtc_tm_to_time64(&t->time); if (t_alrm > t_max_date) { dev_err(dev, "Alarms can be up to one year in the future\n"); return -EINVAL; } } return 0; } struct cmos_set_alarm_callback_param { struct cmos_rtc *cmos; unsigned char mon, mday, hrs, min, sec; struct rtc_wkalrm *t; }; /* Note: this function may be executed by mc146818_avoid_UIP() more then * once */ static void cmos_set_alarm_callback(unsigned char __always_unused seconds, void *param_in) { struct cmos_set_alarm_callback_param *p = (struct cmos_set_alarm_callback_param *)param_in; /* next rtc irq must not be from previous alarm setting */ cmos_irq_disable(p->cmos, RTC_AIE); /* update alarm */ CMOS_WRITE(p->hrs, RTC_HOURS_ALARM); CMOS_WRITE(p->min, RTC_MINUTES_ALARM); CMOS_WRITE(p->sec, RTC_SECONDS_ALARM); /* the system may support an "enhanced" alarm */ if (p->cmos->day_alrm) { CMOS_WRITE(p->mday, p->cmos->day_alrm); if (p->cmos->mon_alrm) CMOS_WRITE(p->mon, p->cmos->mon_alrm); } if (use_hpet_alarm()) { /* * FIXME the HPET alarm glue currently ignores day_alrm * and mon_alrm ... */ hpet_set_alarm_time(p->t->time.tm_hour, p->t->time.tm_min, p->t->time.tm_sec); } if (p->t->enabled) cmos_irq_enable(p->cmos, RTC_AIE); } static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t) { struct cmos_rtc *cmos = dev_get_drvdata(dev); struct cmos_set_alarm_callback_param p = { .cmos = cmos, .t = t }; unsigned char rtc_control; int ret; /* This not only a rtc_op, but also called directly */ if (!is_valid_irq(cmos->irq)) return -EIO; ret = cmos_validate_alarm(dev, t); if (ret < 0) return ret; p.mon = t->time.tm_mon + 1; p.mday = t->time.tm_mday; p.hrs = t->time.tm_hour; p.min = t->time.tm_min; p.sec = t->time.tm_sec; spin_lock_irq(&rtc_lock); rtc_control = CMOS_READ(RTC_CONTROL); spin_unlock_irq(&rtc_lock); if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { /* Writing 0xff means "don't care" or "match all". */ p.mon = (p.mon <= 12) ? bin2bcd(p.mon) : 0xff; p.mday = (p.mday >= 1 && p.mday <= 31) ? bin2bcd(p.mday) : 0xff; p.hrs = (p.hrs < 24) ? bin2bcd(p.hrs) : 0xff; p.min = (p.min < 60) ? bin2bcd(p.min) : 0xff; p.sec = (p.sec < 60) ? bin2bcd(p.sec) : 0xff; } /* * Some Intel chipsets disconnect the alarm registers when the clock * update is in progress - during this time writes fail silently. * * Use mc146818_avoid_UIP() to avoid this. */ if (!mc146818_avoid_UIP(cmos_set_alarm_callback, 10, &p)) return -ETIMEDOUT; cmos->alarm_expires = rtc_tm_to_time64(&t->time); return 0; } static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct cmos_rtc *cmos = dev_get_drvdata(dev); unsigned long flags; spin_lock_irqsave(&rtc_lock, flags); if (enabled) cmos_irq_enable(cmos, RTC_AIE); else cmos_irq_disable(cmos, RTC_AIE); spin_unlock_irqrestore(&rtc_lock, flags); return 0; } #if IS_ENABLED(CONFIG_RTC_INTF_PROC) static int cmos_procfs(struct device *dev, struct seq_file *seq) { struct cmos_rtc *cmos = dev_get_drvdata(dev); unsigned char rtc_control, valid; spin_lock_irq(&rtc_lock); rtc_control = CMOS_READ(RTC_CONTROL); valid = CMOS_READ(RTC_VALID); spin_unlock_irq(&rtc_lock); /* NOTE: at least ICH6 reports battery status using a different * (non-RTC) bit; and SQWE is ignored on many current systems. */ seq_printf(seq, "periodic_IRQ\t: %s\n" "update_IRQ\t: %s\n" "HPET_emulated\t: %s\n" // "square_wave\t: %s\n" "BCD\t\t: %s\n" "DST_enable\t: %s\n" "periodic_freq\t: %d\n" "batt_status\t: %s\n", (rtc_control & RTC_PIE) ? "yes" : "no", (rtc_control & RTC_UIE) ? "yes" : "no", use_hpet_alarm() ? "yes" : "no", // (rtc_control & RTC_SQWE) ? "yes" : "no", (rtc_control & RTC_DM_BINARY) ? "no" : "yes", (rtc_control & RTC_DST_EN) ? "yes" : "no", cmos->rtc->irq_freq, (valid & RTC_VRT) ? "okay" : "dead"); return 0; } #else #define cmos_procfs NULL #endif static const struct rtc_class_ops cmos_rtc_ops = { .read_time = cmos_read_time, .set_time = cmos_set_time, .read_alarm = cmos_read_alarm, .set_alarm = cmos_set_alarm, .proc = cmos_procfs, .alarm_irq_enable = cmos_alarm_irq_enable, }; /*----------------------------------------------------------------*/ /* * All these chips have at least 64 bytes of address space, shared by * RTC registers and NVRAM. Most of those bytes of NVRAM are used * by boot firmware. Modern chips have 128 or 256 bytes. */ #define NVRAM_OFFSET (RTC_REG_D + 1) static int cmos_nvram_read(void *priv, unsigned int off, void *val, size_t count) { unsigned char *buf = val; off += NVRAM_OFFSET; for (; count; count--, off++, buf++) { guard(spinlock_irq)(&rtc_lock); if (off < 128) *buf = CMOS_READ(off); else if (can_bank2) *buf = cmos_read_bank2(off); else return -EIO; } return 0; } static int cmos_nvram_write(void *priv, unsigned int off, void *val, size_t count) { struct cmos_rtc *cmos = priv; unsigned char *buf = val; /* NOTE: on at least PCs and Ataris, the boot firmware uses a * checksum on part of the NVRAM data. That's currently ignored * here. If userspace is smart enough to know what fields of * NVRAM to update, updating checksums is also part of its job. */ off += NVRAM_OFFSET; for (; count; count--, off++, buf++) { /* don't trash RTC registers */ if (off == cmos->day_alrm || off == cmos->mon_alrm || off == cmos->century) continue; guard(spinlock_irq)(&rtc_lock); if (off < 128) CMOS_WRITE(*buf, off); else if (can_bank2) cmos_write_bank2(*buf, off); else return -EIO; } return 0; } /*----------------------------------------------------------------*/ static struct cmos_rtc cmos_rtc; static irqreturn_t cmos_interrupt(int irq, void *p) { u8 irqstat; u8 rtc_control; unsigned long flags; /* We cannot use spin_lock() here, as cmos_interrupt() is also called * in a non-irq context. */ spin_lock_irqsave(&rtc_lock, flags); /* When the HPET interrupt handler calls us, the interrupt * status is passed as arg1 instead of the irq number. But * always clear irq status, even when HPET is in the way. * * Note that HPET and RTC are almost certainly out of phase, * giving different IRQ status ... */ irqstat = CMOS_READ(RTC_INTR_FLAGS); rtc_control = CMOS_READ(RTC_CONTROL); if (use_hpet_alarm()) irqstat = (unsigned long)irq & 0xF0; /* If we were suspended, RTC_CONTROL may not be accurate since the * bios may have cleared it. */ if (!cmos_rtc.suspend_ctrl) irqstat &= (rtc_control & RTC_IRQMASK) | RTC_IRQF; else irqstat &= (cmos_rtc.suspend_ctrl & RTC_IRQMASK) | RTC_IRQF; /* All Linux RTC alarms should be treated as if they were oneshot. * Similar code may be needed in system wakeup paths, in case the * alarm woke the system. */ if (irqstat & RTC_AIE) { cmos_rtc.suspend_ctrl &= ~RTC_AIE; rtc_control &= ~RTC_AIE; CMOS_WRITE(rtc_control, RTC_CONTROL); if (use_hpet_alarm()) hpet_mask_rtc_irq_bit(RTC_AIE); CMOS_READ(RTC_INTR_FLAGS); } spin_unlock_irqrestore(&rtc_lock, flags); if (is_intr(irqstat)) { rtc_update_irq(p, 1, irqstat); return IRQ_HANDLED; } else return IRQ_NONE; } #ifdef CONFIG_ACPI #include <linux/acpi.h> static u32 rtc_handler(void *context) { struct device *dev = context; struct cmos_rtc *cmos = dev_get_drvdata(dev); unsigned char rtc_control = 0; unsigned char rtc_intr; unsigned long flags; /* * Always update rtc irq when ACPI is used as RTC Alarm. * Or else, ACPI SCI is enabled during suspend/resume only, * update rtc irq in that case. */ if (cmos_use_acpi_alarm()) cmos_interrupt(0, (void *)cmos->rtc); else { /* Fix me: can we use cmos_interrupt() here as well? */ spin_lock_irqsave(&rtc_lock, flags); if (cmos_rtc.suspend_ctrl) rtc_control = CMOS_READ(RTC_CONTROL); if (rtc_control & RTC_AIE) { cmos_rtc.suspend_ctrl &= ~RTC_AIE; CMOS_WRITE(rtc_control, RTC_CONTROL); rtc_intr = CMOS_READ(RTC_INTR_FLAGS); rtc_update_irq(cmos->rtc, 1, rtc_intr); } spin_unlock_irqrestore(&rtc_lock, flags); } pm_wakeup_hard_event(dev); acpi_clear_event(ACPI_EVENT_RTC); acpi_disable_event(ACPI_EVENT_RTC, 0); return ACPI_INTERRUPT_HANDLED; } static void acpi_rtc_event_setup(struct device *dev) { if (acpi_disabled) return; acpi_install_fixed_event_handler(ACPI_EVENT_RTC, rtc_handler, dev); /* * After the RTC handler is installed, the Fixed_RTC event should * be disabled. Only when the RTC alarm is set will it be enabled. */ acpi_clear_event(ACPI_EVENT_RTC); acpi_disable_event(ACPI_EVENT_RTC, 0); } static void acpi_rtc_event_cleanup(void) { if (acpi_disabled) return; acpi_remove_fixed_event_handler(ACPI_EVENT_RTC, rtc_handler); } static void rtc_wake_on(struct device *dev) { acpi_clear_event(ACPI_EVENT_RTC); acpi_enable_event(ACPI_EVENT_RTC, 0); } static void rtc_wake_off(struct device *dev) { acpi_disable_event(ACPI_EVENT_RTC, 0); } #ifdef CONFIG_X86 static void use_acpi_alarm_quirks(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: if (dmi_get_bios_year() < 2015) return; break; case X86_VENDOR_AMD: case X86_VENDOR_HYGON: if (dmi_get_bios_year() < 2021) return; break; default: return; } if (!is_hpet_enabled()) return; use_acpi_alarm = true; } #else static inline void use_acpi_alarm_quirks(void) { } #endif static void acpi_cmos_wake_setup(struct device *dev) { if (acpi_disabled) return; use_acpi_alarm_quirks(); cmos_rtc.wake_on = rtc_wake_on; cmos_rtc.wake_off = rtc_wake_off; /* ACPI tables bug workaround. */ if (acpi_gbl_FADT.month_alarm && !acpi_gbl_FADT.day_alarm) { dev_dbg(dev, "bogus FADT month_alarm (%d)\n", acpi_gbl_FADT.month_alarm); acpi_gbl_FADT.month_alarm = 0; } cmos_rtc.day_alrm = acpi_gbl_FADT.day_alarm; cmos_rtc.mon_alrm = acpi_gbl_FADT.month_alarm; cmos_rtc.century = acpi_gbl_FADT.century; if (acpi_gbl_FADT.flags & ACPI_FADT_S4_RTC_WAKE) dev_info(dev, "RTC can wake from S4\n"); /* RTC always wakes from S1/S2/S3, and often S4/STD */ device_init_wakeup(dev, true); } static void cmos_check_acpi_rtc_status(struct device *dev, unsigned char *rtc_control) { struct cmos_rtc *cmos = dev_get_drvdata(dev); acpi_event_status rtc_status; acpi_status status; if (acpi_gbl_FADT.flags & ACPI_FADT_FIXED_RTC) return; status = acpi_get_event_status(ACPI_EVENT_RTC, &rtc_status); if (ACPI_FAILURE(status)) { dev_err(dev, "Could not get RTC status\n"); } else if (rtc_status & ACPI_EVENT_FLAG_SET) { unsigned char mask; *rtc_control &= ~RTC_AIE; CMOS_WRITE(*rtc_control, RTC_CONTROL); mask = CMOS_READ(RTC_INTR_FLAGS); rtc_update_irq(cmos->rtc, 1, mask); } } #else /* !CONFIG_ACPI */ static inline void acpi_rtc_event_setup(struct device *dev) { } static inline void acpi_rtc_event_cleanup(void) { } static inline void acpi_cmos_wake_setup(struct device *dev) { } static inline void cmos_check_acpi_rtc_status(struct device *dev, unsigned char *rtc_control) { } #endif /* CONFIG_ACPI */ #ifdef CONFIG_PNP #define INITSECTION #else #define INITSECTION __init #endif #define SECS_PER_DAY (24 * 60 * 60) #define SECS_PER_MONTH (28 * SECS_PER_DAY) #define SECS_PER_YEAR (365 * SECS_PER_DAY) static int INITSECTION cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) { struct cmos_rtc_board_info *info = dev_get_platdata(dev); int retval = 0; unsigned char rtc_control; unsigned address_space; u32 flags = 0; struct nvmem_config nvmem_cfg = { .name = "cmos_nvram", .word_size = 1, .stride = 1, .reg_read = cmos_nvram_read, .reg_write = cmos_nvram_write, .priv = &cmos_rtc, }; /* there can be only one ... */ if (cmos_rtc.dev) return -EBUSY; if (!ports) return -ENODEV; /* Claim I/O ports ASAP, minimizing conflict with legacy driver. * * REVISIT non-x86 systems may instead use memory space resources * (needing ioremap etc), not i/o space resources like this ... */ if (RTC_IOMAPPED) ports = request_region(ports->start, resource_size(ports), driver_name); else ports = request_mem_region(ports->start, resource_size(ports), driver_name); if (!ports) { dev_dbg(dev, "i/o registers already in use\n"); return -EBUSY; } cmos_rtc.irq = rtc_irq; cmos_rtc.iomem = ports; /* Heuristic to deduce NVRAM size ... do what the legacy NVRAM * driver did, but don't reject unknown configs. Old hardware * won't address 128 bytes. Newer chips have multiple banks, * though they may not be listed in one I/O resource. */ #if defined(CONFIG_ATARI) address_space = 64; #elif defined(__i386__) || defined(__x86_64__) || defined(__arm__) \ || defined(__sparc__) || defined(__mips__) \ || defined(__powerpc__) address_space = 128; #else #warning Assuming 128 bytes of RTC+NVRAM address space, not 64 bytes. address_space = 128; #endif if (can_bank2 && ports->end > (ports->start + 1)) address_space = 256; /* For ACPI systems extension info comes from the FADT. On others, * board specific setup provides it as appropriate. Systems where * the alarm IRQ isn't automatically a wakeup IRQ (like ACPI, and * some almost-clones) can provide hooks to make that behave. * * Note that ACPI doesn't preclude putting these registers into * "extended" areas of the chip, including some that we won't yet * expect CMOS_READ and friends to handle. */ if (info) { if (info->flags) flags = info->flags; if (info->address_space) address_space = info->address_space; cmos_rtc.day_alrm = info->rtc_day_alarm; cmos_rtc.mon_alrm = info->rtc_mon_alarm; cmos_rtc.century = info->rtc_century; if (info->wake_on && info->wake_off) { cmos_rtc.wake_on = info->wake_on; cmos_rtc.wake_off = info->wake_off; } } else { acpi_cmos_wake_setup(dev); } if (cmos_rtc.day_alrm >= 128) cmos_rtc.day_alrm = 0; if (cmos_rtc.mon_alrm >= 128) cmos_rtc.mon_alrm = 0; if (cmos_rtc.century >= 128) cmos_rtc.century = 0; cmos_rtc.dev = dev; dev_set_drvdata(dev, &cmos_rtc); cmos_rtc.rtc = devm_rtc_allocate_device(dev); if (IS_ERR(cmos_rtc.rtc)) { retval = PTR_ERR(cmos_rtc.rtc); goto cleanup0; } if (cmos_rtc.mon_alrm) cmos_rtc.rtc->alarm_offset_max = SECS_PER_YEAR - 1; else if (cmos_rtc.day_alrm) cmos_rtc.rtc->alarm_offset_max = SECS_PER_MONTH - 1; else cmos_rtc.rtc->alarm_offset_max = SECS_PER_DAY - 1; rename_region(ports, dev_name(&cmos_rtc.rtc->dev)); if (!mc146818_does_rtc_work()) { dev_warn(dev, "broken or not accessible\n"); retval = -ENXIO; goto cleanup1; } spin_lock_irq(&rtc_lock); if (!(flags & CMOS_RTC_FLAGS_NOFREQ)) { /* force periodic irq to CMOS reset default of 1024Hz; * * REVISIT it's been reported that at least one x86_64 ALI * mobo doesn't use 32KHz here ... for portability we might * need to do something about other clock frequencies. */ cmos_rtc.rtc->irq_freq = 1024; if (use_hpet_alarm()) hpet_set_periodic_freq(cmos_rtc.rtc->irq_freq); CMOS_WRITE(RTC_REF_CLCK_32KHZ | 0x06, RTC_FREQ_SELECT); } /* disable irqs */ if (is_valid_irq(rtc_irq)) cmos_irq_disable(&cmos_rtc, RTC_PIE | RTC_AIE | RTC_UIE); rtc_control = CMOS_READ(RTC_CONTROL); spin_unlock_irq(&rtc_lock); if (is_valid_irq(rtc_irq) && !(rtc_control & RTC_24H)) { dev_warn(dev, "only 24-hr supported\n"); retval = -ENXIO; goto cleanup1; } if (use_hpet_alarm()) hpet_rtc_timer_init(); if (is_valid_irq(rtc_irq)) { irq_handler_t rtc_cmos_int_handler; if (use_hpet_alarm()) { rtc_cmos_int_handler = hpet_rtc_interrupt; retval = hpet_register_irq_handler(cmos_interrupt); if (retval) { hpet_mask_rtc_irq_bit(RTC_IRQMASK); dev_warn(dev, "hpet_register_irq_handler " " failed in rtc_init()."); goto cleanup1; } } else rtc_cmos_int_handler = cmos_interrupt; retval = request_irq(rtc_irq, rtc_cmos_int_handler, 0, dev_name(&cmos_rtc.rtc->dev), cmos_rtc.rtc); if (retval < 0) { dev_dbg(dev, "IRQ %d is already in use\n", rtc_irq); goto cleanup1; } } else { clear_bit(RTC_FEATURE_ALARM, cmos_rtc.rtc->features); } cmos_rtc.rtc->ops = &cmos_rtc_ops; retval = devm_rtc_register_device(cmos_rtc.rtc); if (retval) goto cleanup2; /* Set the sync offset for the periodic 11min update correct */ cmos_rtc.rtc->set_offset_nsec = NSEC_PER_SEC / 2; /* export at least the first block of NVRAM */ nvmem_cfg.size = address_space - NVRAM_OFFSET; devm_rtc_nvmem_register(cmos_rtc.rtc, &nvmem_cfg); /* * Everything has gone well so far, so by default register a handler for * the ACPI RTC fixed event. */ if (!info) acpi_rtc_event_setup(dev); dev_info(dev, "%s%s, %d bytes nvram%s\n", !is_valid_irq(rtc_irq) ? "no alarms" : cmos_rtc.mon_alrm ? "alarms up to one year" : cmos_rtc.day_alrm ? "alarms up to one month" : "alarms up to one day", cmos_rtc.century ? ", y3k" : "", nvmem_cfg.size, use_hpet_alarm() ? ", hpet irqs" : ""); return 0; cleanup2: if (is_valid_irq(rtc_irq)) free_irq(rtc_irq, cmos_rtc.rtc); cleanup1: cmos_rtc.dev = NULL; cleanup0: if (RTC_IOMAPPED) release_region(ports->start, resource_size(ports)); else release_mem_region(ports->start, resource_size(ports)); return retval; } static void cmos_do_shutdown(int rtc_irq) { spin_lock_irq(&rtc_lock); if (is_valid_irq(rtc_irq)) cmos_irq_disable(&cmos_rtc, RTC_IRQMASK); spin_unlock_irq(&rtc_lock); } static void cmos_do_remove(struct device *dev) { struct cmos_rtc *cmos = dev_get_drvdata(dev); struct resource *ports; cmos_do_shutdown(cmos->irq); if (is_valid_irq(cmos->irq)) { free_irq(cmos->irq, cmos->rtc); if (use_hpet_alarm()) hpet_unregister_irq_handler(cmos_interrupt); } if (!dev_get_platdata(dev)) acpi_rtc_event_cleanup(); cmos->rtc = NULL; ports = cmos->iomem; if (RTC_IOMAPPED) release_region(ports->start, resource_size(ports)); else release_mem_region(ports->start, resource_size(ports)); cmos->iomem = NULL; cmos->dev = NULL; } static int cmos_aie_poweroff(struct device *dev) { struct cmos_rtc *cmos = dev_get_drvdata(dev); struct rtc_time now; time64_t t_now; int retval = 0; unsigned char rtc_control; if (!cmos->alarm_expires) return -EINVAL; spin_lock_irq(&rtc_lock); rtc_control = CMOS_READ(RTC_CONTROL); spin_unlock_irq(&rtc_lock); /* We only care about the situation where AIE is disabled. */ if (rtc_control & RTC_AIE) return -EBUSY; cmos_read_time(dev, &now); t_now = rtc_tm_to_time64(&now); /* * When enabling "RTC wake-up" in BIOS setup, the machine reboots * automatically right after shutdown on some buggy boxes. * This automatic rebooting issue won't happen when the alarm * time is larger than now+1 seconds. * * If the alarm time is equal to now+1 seconds, the issue can be * prevented by cancelling the alarm. */ if (cmos->alarm_expires == t_now + 1) { struct rtc_wkalrm alarm; /* Cancel the AIE timer by configuring the past time. */ rtc_time64_to_tm(t_now - 1, &alarm.time); alarm.enabled = 0; retval = cmos_set_alarm(dev, &alarm); } else if (cmos->alarm_expires > t_now + 1) { retval = -EBUSY; } return retval; } static int cmos_suspend(struct device *dev) { struct cmos_rtc *cmos = dev_get_drvdata(dev); unsigned char tmp; /* only the alarm might be a wakeup event source */ spin_lock_irq(&rtc_lock); cmos->suspend_ctrl = tmp = CMOS_READ(RTC_CONTROL); if (tmp & (RTC_PIE|RTC_AIE|RTC_UIE)) { unsigned char mask; if (device_may_wakeup(dev)) mask = RTC_IRQMASK & ~RTC_AIE; else mask = RTC_IRQMASK; tmp &= ~mask; CMOS_WRITE(tmp, RTC_CONTROL); if (use_hpet_alarm()) hpet_mask_rtc_irq_bit(mask); cmos_checkintr(cmos, tmp); } spin_unlock_irq(&rtc_lock); if ((tmp & RTC_AIE) && !cmos_use_acpi_alarm()) { cmos->enabled_wake = 1; if (cmos->wake_on) cmos->wake_on(dev); else enable_irq_wake(cmos->irq); } memset(&cmos->saved_wkalrm, 0, sizeof(struct rtc_wkalrm)); cmos_read_alarm(dev, &cmos->saved_wkalrm); dev_dbg(dev, "suspend%s, ctrl %02x\n", (tmp & RTC_AIE) ? ", alarm may wake" : "", tmp); return 0; } /* We want RTC alarms to wake us from e.g. ACPI G2/S5 "soft off", even * after a detour through G3 "mechanical off", although the ACPI spec * says wakeup should only work from G1/S4 "hibernate". To most users, * distinctions between S4 and S5 are pointless. So when the hardware * allows, don't draw that distinction. */ static inline int cmos_poweroff(struct device *dev) { if (!IS_ENABLED(CONFIG_PM)) return -ENOSYS; return cmos_suspend(dev); } static void cmos_check_wkalrm(struct device *dev) { struct cmos_rtc *cmos = dev_get_drvdata(dev); struct rtc_wkalrm current_alarm; time64_t t_now; time64_t t_current_expires; time64_t t_saved_expires; struct rtc_time now; /* Check if we have RTC Alarm armed */ if (!(cmos->suspend_ctrl & RTC_AIE)) return; cmos_read_time(dev, &now); t_now = rtc_tm_to_time64(&now); /* * ACPI RTC wake event is cleared after resume from STR, * ACK the rtc irq here */ if (t_now >= cmos->alarm_expires && cmos_use_acpi_alarm()) { cmos_interrupt(0, (void *)cmos->rtc); return; } memset(&current_alarm, 0, sizeof(struct rtc_wkalrm)); cmos_read_alarm(dev, &current_alarm); t_current_expires = rtc_tm_to_time64(&current_alarm.time); t_saved_expires = rtc_tm_to_time64(&cmos->saved_wkalrm.time); if (t_current_expires != t_saved_expires || cmos->saved_wkalrm.enabled != current_alarm.enabled) { cmos_set_alarm(dev, &cmos->saved_wkalrm); } } static int __maybe_unused cmos_resume(struct device *dev) { struct cmos_rtc *cmos = dev_get_drvdata(dev); unsigned char tmp; if (cmos->enabled_wake && !cmos_use_acpi_alarm()) { if (cmos->wake_off) cmos->wake_off(dev); else disable_irq_wake(cmos->irq); cmos->enabled_wake = 0; } /* The BIOS might have changed the alarm, restore it */ cmos_check_wkalrm(dev); spin_lock_irq(&rtc_lock); tmp = cmos->suspend_ctrl; cmos->suspend_ctrl = 0; /* re-enable any irqs previously active */ if (tmp & RTC_IRQMASK) { unsigned char mask; if (device_may_wakeup(dev) && use_hpet_alarm()) hpet_rtc_timer_init(); do { CMOS_WRITE(tmp, RTC_CONTROL); if (use_hpet_alarm()) hpet_set_rtc_irq_bit(tmp & RTC_IRQMASK); mask = CMOS_READ(RTC_INTR_FLAGS); mask &= (tmp & RTC_IRQMASK) | RTC_IRQF; if (!use_hpet_alarm() || !is_intr(mask)) break; /* force one-shot behavior if HPET blocked * the wake alarm's irq */ rtc_update_irq(cmos->rtc, 1, mask); tmp &= ~RTC_AIE; hpet_mask_rtc_irq_bit(RTC_AIE); } while (mask & RTC_AIE); if (tmp & RTC_AIE) cmos_check_acpi_rtc_status(dev, &tmp); } spin_unlock_irq(&rtc_lock); dev_dbg(dev, "resume, ctrl %02x\n", tmp); return 0; } static SIMPLE_DEV_PM_OPS(cmos_pm_ops, cmos_suspend, cmos_resume); /*----------------------------------------------------------------*/ /* On non-x86 systems, a "CMOS" RTC lives most naturally on platform_bus. * ACPI systems always list these as PNPACPI devices, and pre-ACPI PCs * probably list them in similar PNPBIOS tables; so PNP is more common. * * We don't use legacy "poke at the hardware" probing. Ancient PCs that * predate even PNPBIOS should set up platform_bus devices. */ #ifdef CONFIG_PNP #include <linux/pnp.h> static int cmos_pnp_probe(struct pnp_dev *pnp, const struct pnp_device_id *id) { int irq; if (pnp_port_start(pnp, 0) == 0x70 && !pnp_irq_valid(pnp, 0)) { irq = 0; #ifdef CONFIG_X86 /* Some machines contain a PNP entry for the RTC, but * don't define the IRQ. It should always be safe to * hardcode it on systems with a legacy PIC. */ if (nr_legacy_irqs()) irq = RTC_IRQ; #endif } else { irq = pnp_irq(pnp, 0); } return cmos_do_probe(&pnp->dev, pnp_get_resource(pnp, IORESOURCE_IO, 0), irq); } static void cmos_pnp_remove(struct pnp_dev *pnp) { cmos_do_remove(&pnp->dev); } static void cmos_pnp_shutdown(struct pnp_dev *pnp) { struct device *dev = &pnp->dev; struct cmos_rtc *cmos = dev_get_drvdata(dev); if (system_state == SYSTEM_POWER_OFF) { int retval = cmos_poweroff(dev); if (cmos_aie_poweroff(dev) < 0 && !retval) return; } cmos_do_shutdown(cmos->irq); } static const struct pnp_device_id rtc_ids[] = { { .id = "PNP0b00", }, { .id = "PNP0b01", }, { .id = "PNP0b02", }, { }, }; MODULE_DEVICE_TABLE(pnp, rtc_ids); static struct pnp_driver cmos_pnp_driver = { .name = driver_name, .id_table = rtc_ids, .probe = cmos_pnp_probe, .remove = cmos_pnp_remove, .shutdown = cmos_pnp_shutdown, /* flag ensures resume() gets called, and stops syslog spam */ .flags = PNP_DRIVER_RES_DO_NOT_CHANGE, .driver = { .pm = &cmos_pm_ops, }, }; #endif /* CONFIG_PNP */ #ifdef CONFIG_OF static const struct of_device_id of_cmos_match[] = { { .compatible = "motorola,mc146818", }, { }, }; MODULE_DEVICE_TABLE(of, of_cmos_match); static __init void cmos_of_init(struct platform_device *pdev) { struct device_node *node = pdev->dev.of_node; const __be32 *val; if (!node) return; val = of_get_property(node, "ctrl-reg", NULL); if (val) CMOS_WRITE(be32_to_cpup(val), RTC_CONTROL); val = of_get_property(node, "freq-reg", NULL); if (val) CMOS_WRITE(be32_to_cpup(val), RTC_FREQ_SELECT); } #else static inline void cmos_of_init(struct platform_device *pdev) {} #endif /*----------------------------------------------------------------*/ /* Platform setup should have set up an RTC device, when PNP is * unavailable ... this could happen even on (older) PCs. */ static int __init cmos_platform_probe(struct platform_device *pdev) { struct resource *resource; int irq; cmos_of_init(pdev); if (RTC_IOMAPPED) resource = platform_get_resource(pdev, IORESOURCE_IO, 0); else resource = platform_get_resource(pdev, IORESOURCE_MEM, 0); irq = platform_get_irq(pdev, 0); if (irq < 0) irq = -1; return cmos_do_probe(&pdev->dev, resource, irq); } static void cmos_platform_remove(struct platform_device *pdev) { cmos_do_remove(&pdev->dev); } static void cmos_platform_shutdown(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct cmos_rtc *cmos = dev_get_drvdata(dev); if (system_state == SYSTEM_POWER_OFF) { int retval = cmos_poweroff(dev); if (cmos_aie_poweroff(dev) < 0 && !retval) return; } cmos_do_shutdown(cmos->irq); } /* work with hotplug and coldplug */ MODULE_ALIAS("platform:rtc_cmos"); static struct platform_driver cmos_platform_driver = { .remove = cmos_platform_remove, .shutdown = cmos_platform_shutdown, .driver = { .name = driver_name, .pm = &cmos_pm_ops, .of_match_table = of_match_ptr(of_cmos_match), } }; #ifdef CONFIG_PNP static bool pnp_driver_registered; #endif static bool platform_driver_registered; static int __init cmos_init(void) { int retval = 0; #ifdef CONFIG_PNP retval = pnp_register_driver(&cmos_pnp_driver); if (retval == 0) pnp_driver_registered = true; #endif if (!cmos_rtc.dev) { retval = platform_driver_probe(&cmos_platform_driver, cmos_platform_probe); if (retval == 0) platform_driver_registered = true; } if (retval == 0) return 0; #ifdef CONFIG_PNP if (pnp_driver_registered) pnp_unregister_driver(&cmos_pnp_driver); #endif return retval; } module_init(cmos_init); static void __exit cmos_exit(void) { #ifdef CONFIG_PNP if (pnp_driver_registered) pnp_unregister_driver(&cmos_pnp_driver); #endif if (platform_driver_registered) platform_driver_unregister(&cmos_platform_driver); } module_exit(cmos_exit); MODULE_AUTHOR("David Brownell"); MODULE_DESCRIPTION("Driver for PC-style 'CMOS' RTCs"); MODULE_LICENSE("GPL");
2 2 2 2 1 3 3 2 1 1 3 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 /* HIDP implementation for Linux Bluetooth stack (BlueZ). Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org> Copyright (C) 2013 David Herrmann <dh.herrmann@gmail.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #include <linux/kref.h> #include <linux/module.h> #include <linux/file.h> #include <linux/kthread.h> #include <linux/hidraw.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include "hidp.h" #define VERSION "1.2" static DECLARE_RWSEM(hidp_session_sem); static DECLARE_WAIT_QUEUE_HEAD(hidp_session_wq); static LIST_HEAD(hidp_session_list); static unsigned char hidp_keycode[256] = { 0, 0, 0, 0, 30, 48, 46, 32, 18, 33, 34, 35, 23, 36, 37, 38, 50, 49, 24, 25, 16, 19, 31, 20, 22, 47, 17, 45, 21, 44, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 1, 14, 15, 57, 12, 13, 26, 27, 43, 43, 39, 40, 41, 51, 52, 53, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 87, 88, 99, 70, 119, 110, 102, 104, 111, 107, 109, 106, 105, 108, 103, 69, 98, 55, 74, 78, 96, 79, 80, 81, 75, 76, 77, 71, 72, 73, 82, 83, 86, 127, 116, 117, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 134, 138, 130, 132, 128, 129, 131, 137, 133, 135, 136, 113, 115, 114, 0, 0, 0, 121, 0, 89, 93, 124, 92, 94, 95, 0, 0, 0, 122, 123, 90, 91, 85, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 42, 56, 125, 97, 54, 100, 126, 164, 166, 165, 163, 161, 115, 114, 113, 150, 158, 159, 128, 136, 177, 178, 176, 142, 152, 173, 140 }; static unsigned char hidp_mkeyspat[] = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }; static int hidp_session_probe(struct l2cap_conn *conn, struct l2cap_user *user); static void hidp_session_remove(struct l2cap_conn *conn, struct l2cap_user *user); static int hidp_session_thread(void *arg); static void hidp_session_terminate(struct hidp_session *s); static void hidp_copy_session(struct hidp_session *session, struct hidp_conninfo *ci) { u32 valid_flags = 0; memset(ci, 0, sizeof(*ci)); bacpy(&ci->bdaddr, &session->bdaddr); ci->flags = session->flags & valid_flags; ci->state = BT_CONNECTED; if (session->input) { ci->vendor = session->input->id.vendor; ci->product = session->input->id.product; ci->version = session->input->id.version; if (session->input->name) strscpy(ci->name, session->input->name, 128); else strscpy(ci->name, "HID Boot Device", 128); } else if (session->hid) { ci->vendor = session->hid->vendor; ci->product = session->hid->product; ci->version = session->hid->version; strscpy(ci->name, session->hid->name, 128); } } /* assemble skb, queue message on @transmit and wake up the session thread */ static int hidp_send_message(struct hidp_session *session, struct socket *sock, struct sk_buff_head *transmit, unsigned char hdr, const unsigned char *data, int size) { struct sk_buff *skb; struct sock *sk = sock->sk; int ret; BT_DBG("session %p data %p size %d", session, data, size); if (atomic_read(&session->terminate)) return -EIO; skb = alloc_skb(size + 1, GFP_ATOMIC); if (!skb) { BT_ERR("Can't allocate memory for new frame"); return -ENOMEM; } skb_put_u8(skb, hdr); if (data && size > 0) { skb_put_data(skb, data, size); ret = size; } else { ret = 0; } skb_queue_tail(transmit, skb); wake_up_interruptible(sk_sleep(sk)); return ret; } static int hidp_send_ctrl_message(struct hidp_session *session, unsigned char hdr, const unsigned char *data, int size) { return hidp_send_message(session, session->ctrl_sock, &session->ctrl_transmit, hdr, data, size); } static int hidp_send_intr_message(struct hidp_session *session, unsigned char hdr, const unsigned char *data, int size) { return hidp_send_message(session, session->intr_sock, &session->intr_transmit, hdr, data, size); } static int hidp_input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { struct hidp_session *session = input_get_drvdata(dev); unsigned char newleds; unsigned char hdr, data[2]; BT_DBG("session %p type %d code %d value %d", session, type, code, value); if (type != EV_LED) return -1; newleds = (!!test_bit(LED_KANA, dev->led) << 3) | (!!test_bit(LED_COMPOSE, dev->led) << 3) | (!!test_bit(LED_SCROLLL, dev->led) << 2) | (!!test_bit(LED_CAPSL, dev->led) << 1) | (!!test_bit(LED_NUML, dev->led) << 0); if (session->leds == newleds) return 0; session->leds = newleds; hdr = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT; data[0] = 0x01; data[1] = newleds; return hidp_send_intr_message(session, hdr, data, 2); } static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb) { struct input_dev *dev = session->input; unsigned char *keys = session->keys; unsigned char *udata = skb->data + 1; signed char *sdata = skb->data + 1; int i, size = skb->len - 1; switch (skb->data[0]) { case 0x01: /* Keyboard report */ for (i = 0; i < 8; i++) input_report_key(dev, hidp_keycode[i + 224], (udata[0] >> i) & 1); /* If all the key codes have been set to 0x01, it means * too many keys were pressed at the same time. */ if (!memcmp(udata + 2, hidp_mkeyspat, 6)) break; for (i = 2; i < 8; i++) { if (keys[i] > 3 && memscan(udata + 2, keys[i], 6) == udata + 8) { if (hidp_keycode[keys[i]]) input_report_key(dev, hidp_keycode[keys[i]], 0); else BT_ERR("Unknown key (scancode %#x) released.", keys[i]); } if (udata[i] > 3 && memscan(keys + 2, udata[i], 6) == keys + 8) { if (hidp_keycode[udata[i]]) input_report_key(dev, hidp_keycode[udata[i]], 1); else BT_ERR("Unknown key (scancode %#x) pressed.", udata[i]); } } memcpy(keys, udata, 8); break; case 0x02: /* Mouse report */ input_report_key(dev, BTN_LEFT, sdata[0] & 0x01); input_report_key(dev, BTN_RIGHT, sdata[0] & 0x02); input_report_key(dev, BTN_MIDDLE, sdata[0] & 0x04); input_report_key(dev, BTN_SIDE, sdata[0] & 0x08); input_report_key(dev, BTN_EXTRA, sdata[0] & 0x10); input_report_rel(dev, REL_X, sdata[1]); input_report_rel(dev, REL_Y, sdata[2]); if (size > 3) input_report_rel(dev, REL_WHEEL, sdata[3]); break; } input_sync(dev); } static int hidp_get_raw_report(struct hid_device *hid, unsigned char report_number, unsigned char *data, size_t count, unsigned char report_type) { struct hidp_session *session = hid->driver_data; struct sk_buff *skb; size_t len; int numbered_reports = hid->report_enum[report_type].numbered; int ret; if (atomic_read(&session->terminate)) return -EIO; switch (report_type) { case HID_FEATURE_REPORT: report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_FEATURE; break; case HID_INPUT_REPORT: report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_INPUT; break; case HID_OUTPUT_REPORT: report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_OUPUT; break; default: return -EINVAL; } if (mutex_lock_interruptible(&session->report_mutex)) return -ERESTARTSYS; /* Set up our wait, and send the report request to the device. */ session->waiting_report_type = report_type & HIDP_DATA_RTYPE_MASK; session->waiting_report_number = numbered_reports ? report_number : -1; set_bit(HIDP_WAITING_FOR_RETURN, &session->flags); data[0] = report_number; ret = hidp_send_ctrl_message(session, report_type, data, 1); if (ret < 0) goto err; /* Wait for the return of the report. The returned report gets put in session->report_return. */ while (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags) && !atomic_read(&session->terminate)) { int res; res = wait_event_interruptible_timeout(session->report_queue, !test_bit(HIDP_WAITING_FOR_RETURN, &session->flags) || atomic_read(&session->terminate), 5*HZ); if (res == 0) { /* timeout */ ret = -EIO; goto err; } if (res < 0) { /* signal */ ret = -ERESTARTSYS; goto err; } } skb = session->report_return; if (skb) { len = skb->len < count ? skb->len : count; memcpy(data, skb->data, len); kfree_skb(skb); session->report_return = NULL; } else { /* Device returned a HANDSHAKE, indicating protocol error. */ len = -EIO; } clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags); mutex_unlock(&session->report_mutex); return len; err: clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags); mutex_unlock(&session->report_mutex); return ret; } static int hidp_set_raw_report(struct hid_device *hid, unsigned char reportnum, unsigned char *data, size_t count, unsigned char report_type) { struct hidp_session *session = hid->driver_data; int ret; switch (report_type) { case HID_FEATURE_REPORT: report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE; break; case HID_INPUT_REPORT: report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_INPUT; break; case HID_OUTPUT_REPORT: report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_OUPUT; break; default: return -EINVAL; } if (mutex_lock_interruptible(&session->report_mutex)) return -ERESTARTSYS; /* Set up our wait, and send the report request to the device. */ data[0] = reportnum; set_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags); ret = hidp_send_ctrl_message(session, report_type, data, count); if (ret < 0) goto err; /* Wait for the ACK from the device. */ while (test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags) && !atomic_read(&session->terminate)) { int res; res = wait_event_interruptible_timeout(session->report_queue, !test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags) || atomic_read(&session->terminate), 10*HZ); if (res == 0) { /* timeout */ ret = -EIO; goto err; } if (res < 0) { /* signal */ ret = -ERESTARTSYS; goto err; } } if (!session->output_report_success) { ret = -EIO; goto err; } ret = count; err: clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags); mutex_unlock(&session->report_mutex); return ret; } static int hidp_output_report(struct hid_device *hid, __u8 *data, size_t count) { struct hidp_session *session = hid->driver_data; return hidp_send_intr_message(session, HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT, data, count); } static int hidp_raw_request(struct hid_device *hid, unsigned char reportnum, __u8 *buf, size_t len, unsigned char rtype, int reqtype) { switch (reqtype) { case HID_REQ_GET_REPORT: return hidp_get_raw_report(hid, reportnum, buf, len, rtype); case HID_REQ_SET_REPORT: return hidp_set_raw_report(hid, reportnum, buf, len, rtype); default: return -EIO; } } static void hidp_idle_timeout(struct timer_list *t) { struct hidp_session *session = timer_container_of(session, t, timer); /* The HIDP user-space API only contains calls to add and remove * devices. There is no way to forward events of any kind. Therefore, * we have to forcefully disconnect a device on idle-timeouts. This is * unfortunate and weird API design, but it is spec-compliant and * required for backwards-compatibility. Hence, on idle-timeout, we * signal driver-detach events, so poll() will be woken up with an * error-condition on both sockets. */ session->intr_sock->sk->sk_err = EUNATCH; session->ctrl_sock->sk->sk_err = EUNATCH; wake_up_interruptible(sk_sleep(session->intr_sock->sk)); wake_up_interruptible(sk_sleep(session->ctrl_sock->sk)); hidp_session_terminate(session); } static void hidp_set_timer(struct hidp_session *session) { if (session->idle_to > 0) mod_timer(&session->timer, jiffies + HZ * session->idle_to); } static void hidp_del_timer(struct hidp_session *session) { if (session->idle_to > 0) timer_delete_sync(&session->timer); } static void hidp_process_report(struct hidp_session *session, int type, const u8 *data, unsigned int len, int intr) { if (len > HID_MAX_BUFFER_SIZE) len = HID_MAX_BUFFER_SIZE; memcpy(session->input_buf, data, len); hid_input_report(session->hid, type, session->input_buf, len, intr); } static void hidp_process_handshake(struct hidp_session *session, unsigned char param) { BT_DBG("session %p param 0x%02x", session, param); session->output_report_success = 0; /* default condition */ switch (param) { case HIDP_HSHK_SUCCESSFUL: /* FIXME: Call into SET_ GET_ handlers here */ session->output_report_success = 1; break; case HIDP_HSHK_NOT_READY: case HIDP_HSHK_ERR_INVALID_REPORT_ID: case HIDP_HSHK_ERR_UNSUPPORTED_REQUEST: case HIDP_HSHK_ERR_INVALID_PARAMETER: if (test_and_clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags)) wake_up_interruptible(&session->report_queue); /* FIXME: Call into SET_ GET_ handlers here */ break; case HIDP_HSHK_ERR_UNKNOWN: break; case HIDP_HSHK_ERR_FATAL: /* Device requests a reboot, as this is the only way this error * can be recovered. */ hidp_send_ctrl_message(session, HIDP_TRANS_HID_CONTROL | HIDP_CTRL_SOFT_RESET, NULL, 0); break; default: hidp_send_ctrl_message(session, HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0); break; } /* Wake up the waiting thread. */ if (test_and_clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags)) wake_up_interruptible(&session->report_queue); } static void hidp_process_hid_control(struct hidp_session *session, unsigned char param) { BT_DBG("session %p param 0x%02x", session, param); if (param == HIDP_CTRL_VIRTUAL_CABLE_UNPLUG) { /* Flush the transmit queues */ skb_queue_purge(&session->ctrl_transmit); skb_queue_purge(&session->intr_transmit); hidp_session_terminate(session); } } /* Returns true if the passed-in skb should be freed by the caller. */ static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb, unsigned char param) { int done_with_skb = 1; BT_DBG("session %p skb %p len %u param 0x%02x", session, skb, skb->len, param); switch (param) { case HIDP_DATA_RTYPE_INPUT: hidp_set_timer(session); if (session->input) hidp_input_report(session, skb); if (session->hid) hidp_process_report(session, HID_INPUT_REPORT, skb->data, skb->len, 0); break; case HIDP_DATA_RTYPE_OTHER: case HIDP_DATA_RTYPE_OUPUT: case HIDP_DATA_RTYPE_FEATURE: break; default: hidp_send_ctrl_message(session, HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0); } if (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags) && param == session->waiting_report_type) { if (session->waiting_report_number < 0 || session->waiting_report_number == skb->data[0]) { /* hidp_get_raw_report() is waiting on this report. */ session->report_return = skb; done_with_skb = 0; clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags); wake_up_interruptible(&session->report_queue); } } return done_with_skb; } static void hidp_recv_ctrl_frame(struct hidp_session *session, struct sk_buff *skb) { unsigned char hdr, type, param; int free_skb = 1; BT_DBG("session %p skb %p len %u", session, skb, skb->len); hdr = skb->data[0]; skb_pull(skb, 1); type = hdr & HIDP_HEADER_TRANS_MASK; param = hdr & HIDP_HEADER_PARAM_MASK; switch (type) { case HIDP_TRANS_HANDSHAKE: hidp_process_handshake(session, param); break; case HIDP_TRANS_HID_CONTROL: hidp_process_hid_control(session, param); break; case HIDP_TRANS_DATA: free_skb = hidp_process_data(session, skb, param); break; default: hidp_send_ctrl_message(session, HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_UNSUPPORTED_REQUEST, NULL, 0); break; } if (free_skb) kfree_skb(skb); } static void hidp_recv_intr_frame(struct hidp_session *session, struct sk_buff *skb) { unsigned char hdr; BT_DBG("session %p skb %p len %u", session, skb, skb->len); hdr = skb->data[0]; skb_pull(skb, 1); if (hdr == (HIDP_TRANS_DATA | HIDP_DATA_RTYPE_INPUT)) { hidp_set_timer(session); if (session->input) hidp_input_report(session, skb); if (session->hid) { hidp_process_report(session, HID_INPUT_REPORT, skb->data, skb->len, 1); BT_DBG("report len %d", skb->len); } } else { BT_DBG("Unsupported protocol header 0x%02x", hdr); } kfree_skb(skb); } static int hidp_send_frame(struct socket *sock, unsigned char *data, int len) { struct kvec iv = { data, len }; struct msghdr msg; BT_DBG("sock %p data %p len %d", sock, data, len); if (!len) return 0; memset(&msg, 0, sizeof(msg)); return kernel_sendmsg(sock, &msg, &iv, 1, len); } /* dequeue message from @transmit and send via @sock */ static void hidp_process_transmit(struct hidp_session *session, struct sk_buff_head *transmit, struct socket *sock) { struct sk_buff *skb; int ret; BT_DBG("session %p", session); while ((skb = skb_dequeue(transmit))) { ret = hidp_send_frame(sock, skb->data, skb->len); if (ret == -EAGAIN) { skb_queue_head(transmit, skb); break; } else if (ret < 0) { hidp_session_terminate(session); kfree_skb(skb); break; } hidp_set_timer(session); kfree_skb(skb); } } static int hidp_setup_input(struct hidp_session *session, const struct hidp_connadd_req *req) { struct input_dev *input; int i; input = input_allocate_device(); if (!input) return -ENOMEM; session->input = input; input_set_drvdata(input, session); input->name = "Bluetooth HID Boot Protocol Device"; input->id.bustype = BUS_BLUETOOTH; input->id.vendor = req->vendor; input->id.product = req->product; input->id.version = req->version; if (req->subclass & 0x40) { set_bit(EV_KEY, input->evbit); set_bit(EV_LED, input->evbit); set_bit(EV_REP, input->evbit); set_bit(LED_NUML, input->ledbit); set_bit(LED_CAPSL, input->ledbit); set_bit(LED_SCROLLL, input->ledbit); set_bit(LED_COMPOSE, input->ledbit); set_bit(LED_KANA, input->ledbit); for (i = 0; i < sizeof(hidp_keycode); i++) set_bit(hidp_keycode[i], input->keybit); clear_bit(0, input->keybit); } if (req->subclass & 0x80) { input->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); input->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_MIDDLE); input->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); input->keybit[BIT_WORD(BTN_MOUSE)] |= BIT_MASK(BTN_SIDE) | BIT_MASK(BTN_EXTRA); input->relbit[0] |= BIT_MASK(REL_WHEEL); } input->dev.parent = &session->conn->hcon->dev; input->event = hidp_input_event; return 0; } static int hidp_open(struct hid_device *hid) { return 0; } static void hidp_close(struct hid_device *hid) { } static int hidp_parse(struct hid_device *hid) { struct hidp_session *session = hid->driver_data; return hid_parse_report(session->hid, session->rd_data, session->rd_size); } static int hidp_start(struct hid_device *hid) { return 0; } static void hidp_stop(struct hid_device *hid) { struct hidp_session *session = hid->driver_data; skb_queue_purge(&session->ctrl_transmit); skb_queue_purge(&session->intr_transmit); hid->claimed = 0; } static const struct hid_ll_driver hidp_hid_driver = { .parse = hidp_parse, .start = hidp_start, .stop = hidp_stop, .open = hidp_open, .close = hidp_close, .raw_request = hidp_raw_request, .output_report = hidp_output_report, }; /* This function sets up the hid device. It does not add it to the HID system. That is done in hidp_add_connection(). */ static int hidp_setup_hid(struct hidp_session *session, const struct hidp_connadd_req *req) { struct hid_device *hid; int err; session->rd_data = memdup_user(req->rd_data, req->rd_size); if (IS_ERR(session->rd_data)) return PTR_ERR(session->rd_data); session->rd_size = req->rd_size; hid = hid_allocate_device(); if (IS_ERR(hid)) { err = PTR_ERR(hid); goto fault; } session->hid = hid; hid->driver_data = session; hid->bus = BUS_BLUETOOTH; hid->vendor = req->vendor; hid->product = req->product; hid->version = req->version; hid->country = req->country; strscpy(hid->name, req->name, sizeof(hid->name)); snprintf(hid->phys, sizeof(hid->phys), "%pMR", &l2cap_pi(session->ctrl_sock->sk)->chan->src); /* NOTE: Some device modules depend on the dst address being stored in * uniq. Please be aware of this before making changes to this behavior. */ snprintf(hid->uniq, sizeof(hid->uniq), "%pMR", &l2cap_pi(session->ctrl_sock->sk)->chan->dst); hid->dev.parent = &session->conn->hcon->dev; hid->ll_driver = &hidp_hid_driver; /* True if device is blocked in drivers/hid/hid-quirks.c */ if (hid_ignore(hid)) { hid_destroy_device(session->hid); session->hid = NULL; return -ENODEV; } return 0; fault: kfree(session->rd_data); session->rd_data = NULL; return err; } /* initialize session devices */ static int hidp_session_dev_init(struct hidp_session *session, const struct hidp_connadd_req *req) { int ret; if (req->rd_size > 0) { ret = hidp_setup_hid(session, req); if (ret && ret != -ENODEV) return ret; } if (!session->hid) { ret = hidp_setup_input(session, req); if (ret < 0) return ret; } return 0; } /* destroy session devices */ static void hidp_session_dev_destroy(struct hidp_session *session) { if (session->hid) put_device(&session->hid->dev); else if (session->input) input_put_device(session->input); kfree(session->rd_data); session->rd_data = NULL; } /* add HID/input devices to their underlying bus systems */ static int hidp_session_dev_add(struct hidp_session *session) { int ret; /* Both HID and input systems drop a ref-count when unregistering the * device but they don't take a ref-count when registering them. Work * around this by explicitly taking a refcount during registration * which is dropped automatically by unregistering the devices. */ if (session->hid) { ret = hid_add_device(session->hid); if (ret) return ret; get_device(&session->hid->dev); } else if (session->input) { ret = input_register_device(session->input); if (ret) return ret; input_get_device(session->input); } return 0; } /* remove HID/input devices from their bus systems */ static void hidp_session_dev_del(struct hidp_session *session) { if (session->hid) hid_destroy_device(session->hid); else if (session->input) input_unregister_device(session->input); } /* * Asynchronous device registration * HID device drivers might want to perform I/O during initialization to * detect device types. Therefore, call device registration in a separate * worker so the HIDP thread can schedule I/O operations. * Note that this must be called after the worker thread was initialized * successfully. This will then add the devices and increase session state * on success, otherwise it will terminate the session thread. */ static void hidp_session_dev_work(struct work_struct *work) { struct hidp_session *session = container_of(work, struct hidp_session, dev_init); int ret; ret = hidp_session_dev_add(session); if (!ret) atomic_inc(&session->state); else hidp_session_terminate(session); } /* * Create new session object * Allocate session object, initialize static fields, copy input data into the * object and take a reference to all sub-objects. * This returns 0 on success and puts a pointer to the new session object in * \out. Otherwise, an error code is returned. * The new session object has an initial ref-count of 1. */ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, struct socket *ctrl_sock, struct socket *intr_sock, const struct hidp_connadd_req *req, struct l2cap_conn *conn) { struct hidp_session *session; int ret; struct bt_sock *ctrl, *intr; ctrl = bt_sk(ctrl_sock->sk); intr = bt_sk(intr_sock->sk); session = kzalloc(sizeof(*session), GFP_KERNEL); if (!session) return -ENOMEM; /* object and runtime management */ kref_init(&session->ref); atomic_set(&session->state, HIDP_SESSION_IDLING); init_waitqueue_head(&session->state_queue); session->flags = req->flags & BIT(HIDP_BLUETOOTH_VENDOR_ID); /* connection management */ bacpy(&session->bdaddr, bdaddr); session->conn = l2cap_conn_get(conn); session->user.probe = hidp_session_probe; session->user.remove = hidp_session_remove; INIT_LIST_HEAD(&session->user.list); session->ctrl_sock = ctrl_sock; session->intr_sock = intr_sock; skb_queue_head_init(&session->ctrl_transmit); skb_queue_head_init(&session->intr_transmit); session->ctrl_mtu = min_t(uint, l2cap_pi(ctrl)->chan->omtu, l2cap_pi(ctrl)->chan->imtu); session->intr_mtu = min_t(uint, l2cap_pi(intr)->chan->omtu, l2cap_pi(intr)->chan->imtu); session->idle_to = req->idle_to; /* device management */ INIT_WORK(&session->dev_init, hidp_session_dev_work); timer_setup(&session->timer, hidp_idle_timeout, 0); /* session data */ mutex_init(&session->report_mutex); init_waitqueue_head(&session->report_queue); ret = hidp_session_dev_init(session, req); if (ret) goto err_free; get_file(session->intr_sock->file); get_file(session->ctrl_sock->file); *out = session; return 0; err_free: l2cap_conn_put(session->conn); kfree(session); return ret; } /* increase ref-count of the given session by one */ static void hidp_session_get(struct hidp_session *session) { kref_get(&session->ref); } /* release callback */ static void session_free(struct kref *ref) { struct hidp_session *session = container_of(ref, struct hidp_session, ref); hidp_session_dev_destroy(session); skb_queue_purge(&session->ctrl_transmit); skb_queue_purge(&session->intr_transmit); fput(session->intr_sock->file); fput(session->ctrl_sock->file); l2cap_conn_put(session->conn); kfree(session); } /* decrease ref-count of the given session by one */ static void hidp_session_put(struct hidp_session *session) { kref_put(&session->ref, session_free); } /* * Search the list of active sessions for a session with target address * \bdaddr. You must hold at least a read-lock on \hidp_session_sem. As long as * you do not release this lock, the session objects cannot vanish and you can * safely take a reference to the session yourself. */ static struct hidp_session *__hidp_session_find(const bdaddr_t *bdaddr) { struct hidp_session *session; list_for_each_entry(session, &hidp_session_list, list) { if (!bacmp(bdaddr, &session->bdaddr)) return session; } return NULL; } /* * Same as __hidp_session_find() but no locks must be held. This also takes a * reference of the returned session (if non-NULL) so you must drop this * reference if you no longer use the object. */ static struct hidp_session *hidp_session_find(const bdaddr_t *bdaddr) { struct hidp_session *session; down_read(&hidp_session_sem); session = __hidp_session_find(bdaddr); if (session) hidp_session_get(session); up_read(&hidp_session_sem); return session; } /* * Start session synchronously * This starts a session thread and waits until initialization * is done or returns an error if it couldn't be started. * If this returns 0 the session thread is up and running. You must call * hipd_session_stop_sync() before deleting any runtime resources. */ static int hidp_session_start_sync(struct hidp_session *session) { unsigned int vendor, product; if (session->hid) { vendor = session->hid->vendor; product = session->hid->product; } else if (session->input) { vendor = session->input->id.vendor; product = session->input->id.product; } else { vendor = 0x0000; product = 0x0000; } session->task = kthread_run(hidp_session_thread, session, "khidpd_%04x%04x", vendor, product); if (IS_ERR(session->task)) return PTR_ERR(session->task); while (atomic_read(&session->state) <= HIDP_SESSION_IDLING) wait_event(session->state_queue, atomic_read(&session->state) > HIDP_SESSION_IDLING); return 0; } /* * Terminate session thread * Wake up session thread and notify it to stop. This is asynchronous and * returns immediately. Call this whenever a runtime error occurs and you want * the session to stop. * Note: wake_up_interruptible() performs any necessary memory-barriers for us. */ static void hidp_session_terminate(struct hidp_session *session) { atomic_inc(&session->terminate); /* * See the comment preceding the call to wait_woken() * in hidp_session_run(). */ wake_up_interruptible(&hidp_session_wq); } /* * Probe HIDP session * This is called from the l2cap_conn core when our l2cap_user object is bound * to the hci-connection. We get the session via the \user object and can now * start the session thread, link it into the global session list and * schedule HID/input device registration. * The global session-list owns its own reference to the session object so you * can drop your own reference after registering the l2cap_user object. */ static int hidp_session_probe(struct l2cap_conn *conn, struct l2cap_user *user) { struct hidp_session *session = container_of(user, struct hidp_session, user); struct hidp_session *s; int ret; down_write(&hidp_session_sem); /* check that no other session for this device exists */ s = __hidp_session_find(&session->bdaddr); if (s) { ret = -EEXIST; goto out_unlock; } if (session->input) { ret = hidp_session_dev_add(session); if (ret) goto out_unlock; } ret = hidp_session_start_sync(session); if (ret) goto out_del; /* HID device registration is async to allow I/O during probe */ if (session->input) atomic_inc(&session->state); else schedule_work(&session->dev_init); hidp_session_get(session); list_add(&session->list, &hidp_session_list); ret = 0; goto out_unlock; out_del: if (session->input) hidp_session_dev_del(session); out_unlock: up_write(&hidp_session_sem); return ret; } /* * Remove HIDP session * Called from the l2cap_conn core when either we explicitly unregistered * the l2cap_user object or if the underlying connection is shut down. * We signal the hidp-session thread to shut down, unregister the HID/input * devices and unlink the session from the global list. * This drops the reference to the session that is owned by the global * session-list. * Note: We _must_ not synchronosly wait for the session-thread to shut down. * This is, because the session-thread might be waiting for an HCI lock that is * held while we are called. Therefore, we only unregister the devices and * notify the session-thread to terminate. The thread itself owns a reference * to the session object so it can safely shut down. */ static void hidp_session_remove(struct l2cap_conn *conn, struct l2cap_user *user) { struct hidp_session *session = container_of(user, struct hidp_session, user); down_write(&hidp_session_sem); hidp_session_terminate(session); cancel_work_sync(&session->dev_init); if (session->input || atomic_read(&session->state) > HIDP_SESSION_PREPARING) hidp_session_dev_del(session); list_del(&session->list); up_write(&hidp_session_sem); hidp_session_put(session); } /* * Session Worker * This performs the actual main-loop of the HIDP worker. We first check * whether the underlying connection is still alive, then parse all pending * messages and finally send all outstanding messages. */ static void hidp_session_run(struct hidp_session *session) { struct sock *ctrl_sk = session->ctrl_sock->sk; struct sock *intr_sk = session->intr_sock->sk; struct sk_buff *skb; DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&hidp_session_wq, &wait); for (;;) { /* * This thread can be woken up two ways: * - You call hidp_session_terminate() which sets the * session->terminate flag and wakes this thread up. * - Via modifying the socket state of ctrl/intr_sock. This * thread is woken up by ->sk_state_changed(). */ if (atomic_read(&session->terminate)) break; if (ctrl_sk->sk_state != BT_CONNECTED || intr_sk->sk_state != BT_CONNECTED) break; /* parse incoming intr-skbs */ while ((skb = skb_dequeue(&intr_sk->sk_receive_queue))) { skb_orphan(skb); if (!skb_linearize(skb)) hidp_recv_intr_frame(session, skb); else kfree_skb(skb); } /* send pending intr-skbs */ hidp_process_transmit(session, &session->intr_transmit, session->intr_sock); /* parse incoming ctrl-skbs */ while ((skb = skb_dequeue(&ctrl_sk->sk_receive_queue))) { skb_orphan(skb); if (!skb_linearize(skb)) hidp_recv_ctrl_frame(session, skb); else kfree_skb(skb); } /* send pending ctrl-skbs */ hidp_process_transmit(session, &session->ctrl_transmit, session->ctrl_sock); /* * wait_woken() performs the necessary memory barriers * for us; see the header comment for this primitive. */ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&hidp_session_wq, &wait); atomic_inc(&session->terminate); } static int hidp_session_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { wake_up_interruptible(&hidp_session_wq); return false; } /* * HIDP session thread * This thread runs the I/O for a single HIDP session. Startup is synchronous * which allows us to take references to ourself here instead of doing that in * the caller. * When we are ready to run we notify the caller and call hidp_session_run(). */ static int hidp_session_thread(void *arg) { struct hidp_session *session = arg; DEFINE_WAIT_FUNC(ctrl_wait, hidp_session_wake_function); DEFINE_WAIT_FUNC(intr_wait, hidp_session_wake_function); BT_DBG("session %p", session); /* initialize runtime environment */ hidp_session_get(session); __module_get(THIS_MODULE); set_user_nice(current, -15); hidp_set_timer(session); add_wait_queue(sk_sleep(session->ctrl_sock->sk), &ctrl_wait); add_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait); /* This memory barrier is paired with wq_has_sleeper(). See * sock_poll_wait() for more information why this is needed. */ smp_mb__before_atomic(); /* notify synchronous startup that we're ready */ atomic_inc(&session->state); wake_up(&session->state_queue); /* run session */ hidp_session_run(session); /* cleanup runtime environment */ remove_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait); remove_wait_queue(sk_sleep(session->ctrl_sock->sk), &ctrl_wait); wake_up_interruptible(&session->report_queue); hidp_del_timer(session); /* * If we stopped ourself due to any internal signal, we should try to * unregister our own session here to avoid having it linger until the * parent l2cap_conn dies or user-space cleans it up. * This does not deadlock as we don't do any synchronous shutdown. * Instead, this call has the same semantics as if user-space tried to * delete the session. */ l2cap_unregister_user(session->conn, &session->user); hidp_session_put(session); module_put_and_kthread_exit(0); return 0; } static int hidp_verify_sockets(struct socket *ctrl_sock, struct socket *intr_sock) { struct l2cap_chan *ctrl_chan, *intr_chan; struct bt_sock *ctrl, *intr; struct hidp_session *session; if (!l2cap_is_socket(ctrl_sock) || !l2cap_is_socket(intr_sock)) return -EINVAL; ctrl_chan = l2cap_pi(ctrl_sock->sk)->chan; intr_chan = l2cap_pi(intr_sock->sk)->chan; if (bacmp(&ctrl_chan->src, &intr_chan->src) || bacmp(&ctrl_chan->dst, &intr_chan->dst)) return -ENOTUNIQ; ctrl = bt_sk(ctrl_sock->sk); intr = bt_sk(intr_sock->sk); if (ctrl->sk.sk_state != BT_CONNECTED || intr->sk.sk_state != BT_CONNECTED) return -EBADFD; /* early session check, we check again during session registration */ session = hidp_session_find(&ctrl_chan->dst); if (session) { hidp_session_put(session); return -EEXIST; } return 0; } int hidp_connection_add(const struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock) { u32 valid_flags = BIT(HIDP_VIRTUAL_CABLE_UNPLUG) | BIT(HIDP_BOOT_PROTOCOL_MODE); struct hidp_session *session; struct l2cap_conn *conn; struct l2cap_chan *chan; int ret; ret = hidp_verify_sockets(ctrl_sock, intr_sock); if (ret) return ret; if (req->flags & ~valid_flags) return -EINVAL; chan = l2cap_pi(ctrl_sock->sk)->chan; conn = NULL; l2cap_chan_lock(chan); if (chan->conn) conn = l2cap_conn_get(chan->conn); l2cap_chan_unlock(chan); if (!conn) return -EBADFD; ret = hidp_session_new(&session, &chan->dst, ctrl_sock, intr_sock, req, conn); if (ret) goto out_conn; ret = l2cap_register_user(conn, &session->user); if (ret) goto out_session; ret = 0; out_session: hidp_session_put(session); out_conn: l2cap_conn_put(conn); return ret; } int hidp_connection_del(struct hidp_conndel_req *req) { u32 valid_flags = BIT(HIDP_VIRTUAL_CABLE_UNPLUG); struct hidp_session *session; if (req->flags & ~valid_flags) return -EINVAL; session = hidp_session_find(&req->bdaddr); if (!session) return -ENOENT; if (req->flags & BIT(HIDP_VIRTUAL_CABLE_UNPLUG)) hidp_send_ctrl_message(session, HIDP_TRANS_HID_CONTROL | HIDP_CTRL_VIRTUAL_CABLE_UNPLUG, NULL, 0); else l2cap_unregister_user(session->conn, &session->user); hidp_session_put(session); return 0; } int hidp_get_connlist(struct hidp_connlist_req *req) { struct hidp_session *session; int err = 0, n = 0; BT_DBG(""); down_read(&hidp_session_sem); list_for_each_entry(session, &hidp_session_list, list) { struct hidp_conninfo ci; hidp_copy_session(session, &ci); if (copy_to_user(req->ci, &ci, sizeof(ci))) { err = -EFAULT; break; } if (++n >= req->cnum) break; req->ci++; } req->cnum = n; up_read(&hidp_session_sem); return err; } int hidp_get_conninfo(struct hidp_conninfo *ci) { struct hidp_session *session; session = hidp_session_find(&ci->bdaddr); if (session) { hidp_copy_session(session, ci); hidp_session_put(session); } return session ? 0 : -ENOENT; } static int __init hidp_init(void) { BT_INFO("HIDP (Human Interface Emulation) ver %s", VERSION); return hidp_init_sockets(); } static void __exit hidp_exit(void) { hidp_cleanup_sockets(); } module_init(hidp_init); module_exit(hidp_exit); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); MODULE_AUTHOR("David Herrmann <dh.herrmann@gmail.com>"); MODULE_DESCRIPTION("Bluetooth HIDP ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS("bt-proto-6");
25 1 38 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 /* SPDX-License-Identifier: GPL-2.0 */ /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Definitions for the SMC module (socket related) * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #ifndef __SMC_H #define __SMC_H #include <linux/socket.h> #include <linux/types.h> #include <linux/compiler.h> /* __aligned */ #include <net/genetlink.h> #include <net/sock.h> #include "smc_ib.h" #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE_0 0 #define SMC_RELEASE_1 1 #define SMC_RELEASE SMC_RELEASE_1 /* the latest release version */ #define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ #define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ #define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */ extern struct proto smc_proto; extern struct proto smc_proto6; extern struct smc_hashinfo smc_v4_hashinfo; extern struct smc_hashinfo smc_v6_hashinfo; int smc_hash_sk(struct sock *sk); void smc_unhash_sk(struct sock *sk); void smc_release_cb(struct sock *sk); int smc_release(struct socket *sock); int smc_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int smc_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags); int smc_accept(struct socket *sock, struct socket *new_sock, struct proto_accept_arg *arg); int smc_getname(struct socket *sock, struct sockaddr *addr, int peer); __poll_t smc_poll(struct file *file, struct socket *sock, poll_table *wait); int smc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int smc_listen(struct socket *sock, int backlog); int smc_shutdown(struct socket *sock, int how); int smc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen); int smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len); int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); /* smc sock initialization */ void smc_sk_init(struct net *net, struct sock *sk, int protocol); /* clcsock initialization */ int smc_create_clcsk(struct net *net, struct sock *sk, int family); #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif enum smc_state { /* possible states of an SMC socket */ SMC_ACTIVE = 1, SMC_INIT = 2, SMC_CLOSED = 7, SMC_LISTEN = 10, /* normal close */ SMC_PEERCLOSEWAIT1 = 20, SMC_PEERCLOSEWAIT2 = 21, SMC_APPFINCLOSEWAIT = 24, SMC_APPCLOSEWAIT1 = 22, SMC_APPCLOSEWAIT2 = 23, SMC_PEERFINCLOSEWAIT = 25, /* abnormal close */ SMC_PEERABORTWAIT = 26, SMC_PROCESSABORT = 27, }; enum smc_supplemental_features { SMC_SPF_EMULATED_ISM_DEV = 0, }; #define SMC_FEATURE_MASK \ (BIT(SMC_SPF_EMULATED_ISM_DEV)) struct smc_link_group; struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */ union { u8 type; #if defined(__BIG_ENDIAN_BITFIELD) struct { u8 llc_version:4, llc_type:4; }; #elif defined(__LITTLE_ENDIAN_BITFIELD) struct { u8 llc_type:4, llc_version:4; }; #endif }; } __aligned(1); struct smc_cdc_conn_state_flags { #if defined(__BIG_ENDIAN_BITFIELD) u8 peer_done_writing : 1; /* Sending done indicator */ u8 peer_conn_closed : 1; /* Peer connection closed indicator */ u8 peer_conn_abort : 1; /* Abnormal close indicator */ u8 reserved : 5; #elif defined(__LITTLE_ENDIAN_BITFIELD) u8 reserved : 5; u8 peer_conn_abort : 1; u8 peer_conn_closed : 1; u8 peer_done_writing : 1; #endif }; struct smc_cdc_producer_flags { #if defined(__BIG_ENDIAN_BITFIELD) u8 write_blocked : 1; /* Writing Blocked, no rx buf space */ u8 urg_data_pending : 1; /* Urgent Data Pending */ u8 urg_data_present : 1; /* Urgent Data Present */ u8 cons_curs_upd_req : 1; /* cursor update requested */ u8 failover_validation : 1;/* message replay due to failover */ u8 reserved : 3; #elif defined(__LITTLE_ENDIAN_BITFIELD) u8 reserved : 3; u8 failover_validation : 1; u8 cons_curs_upd_req : 1; u8 urg_data_present : 1; u8 urg_data_pending : 1; u8 write_blocked : 1; #endif }; /* in host byte order */ union smc_host_cursor { /* SMC cursor - an offset in an RMBE */ struct { u16 reserved; u16 wrap; /* window wrap sequence number */ u32 count; /* cursor (= offset) part */ }; #ifdef KERNEL_HAS_ATOMIC64 atomic64_t acurs; /* for atomic processing */ #else u64 acurs; /* for atomic processing */ #endif } __aligned(8); /* in host byte order, except for flag bitfields in network byte order */ struct smc_host_cdc_msg { /* Connection Data Control message */ struct smc_wr_rx_hdr common; /* .type = 0xFE */ u8 len; /* length = 44 */ u16 seqno; /* connection seq # */ u32 token; /* alert_token */ union smc_host_cursor prod; /* producer cursor */ union smc_host_cursor cons; /* consumer cursor, * piggy backed "ack" */ struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */ struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/ u8 reserved[18]; } __aligned(8); enum smc_urg_state { SMC_URG_VALID = 1, /* data present */ SMC_URG_NOTYET = 2, /* data pending */ SMC_URG_READ = 3, /* data was already read */ }; struct smc_mark_woken { bool woken; void *key; wait_queue_entry_t wait_entry; }; struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ struct smc_link *lnk; /* assigned SMC-R link */ u32 alert_token_local; /* unique conn. id */ u8 peer_rmbe_idx; /* from tcp handshake */ int peer_rmbe_size; /* size of peer rx buffer */ atomic_t peer_rmbe_space;/* remaining free bytes in peer * rmbe */ int rtoken_idx; /* idx to peer RMB rkey/addr */ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ int rmbe_size_comp; /* compressed notation */ int rmbe_update_limit; /* lower limit for consumer * cursor update */ struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging * buffer for CDC msg send * .prod cf. TCP snd_nxt * .cons cf. TCP sends ack */ union smc_host_cursor local_tx_ctrl_fin; /* prod crsr - confirmed by peer */ union smc_host_cursor tx_curs_prep; /* tx - prepared data * snd_max..wmem_alloc */ union smc_host_cursor tx_curs_sent; /* tx - sent data * snd_nxt ? */ union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer * snd-wnd-begin ? */ atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ u16 tx_cdc_seq_fin; /* sequence # - tx completed */ spinlock_t send_lock; /* protect wr_sends */ atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe * - inc when post wqe, * - dec on polled tx cqe */ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. TCP rcv_nxt * .cons cf. TCP snd_una */ union smc_host_cursor rx_curs_confirmed; /* confirmed to peer * source of snd_una ? */ union smc_host_cursor urg_curs; /* points at urgent byte */ enum smc_urg_state urg_state; bool urg_tx_pend; /* urgent data staged */ bool urg_rx_skip_pend; /* indicate urgent oob data * read, but previous regular * data still pending */ char urg_rx_byte; /* urgent byte */ bool tx_in_release_sock; /* flush pending tx data in * sock release_cb() */ atomic_t bytes_to_rcv; /* arrived data, * not yet received */ atomic_t splice_pending; /* number of spliced bytes * pending processing */ #ifndef KERNEL_HAS_ATOMIC64 spinlock_t acurs_lock; /* protect cursors */ #endif struct work_struct close_work; /* peer sent some closing */ struct work_struct abort_work; /* abort the connection */ struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ u8 freed : 1; /* normal termination */ u8 out_of_sync : 1; /* out of sync with peer */ }; struct smc_sock { /* smc sock container */ union { struct sock sk; struct inet_sock icsk_inet; }; struct socket *clcsock; /* internal tcp socket */ void (*clcsk_state_change)(struct sock *sk); /* original stat_change fct. */ void (*clcsk_data_ready)(struct sock *sk); /* original data_ready fct. */ void (*clcsk_write_space)(struct sock *sk); /* original write_space fct. */ void (*clcsk_error_report)(struct sock *sk); /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool limit_smc_hs; /* put constraint on handshake */ bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ atomic_t queued_smc_hs; /* queued smc handshakes */ struct inet_connection_sock_af_ops af_ops; const struct inet_connection_sock_af_ops *ori_af_ops; /* original af ops */ int sockopt_defer_accept; /* sockopt TCP_DEFER_ACCEPT * value */ u8 wait_close_tx_prepared : 1; /* shutdown wr or close * started, waiting for unsent * data to be sent */ u8 connect_nonblock : 1; /* non-blocking connect in * flight */ struct mutex clcsock_release_lock; /* protects clcsock of a listen * socket * */ }; #define smc_sk(ptr) container_of_const(ptr, struct smc_sock, sk) static inline void smc_init_saved_callbacks(struct smc_sock *smc) { smc->clcsk_state_change = NULL; smc->clcsk_data_ready = NULL; smc->clcsk_write_space = NULL; smc->clcsk_error_report = NULL; } static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk) { return (struct smc_sock *) ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); } /* save target_cb in saved_cb, and replace target_cb with new_cb */ static inline void smc_clcsock_replace_cb(void (**target_cb)(struct sock *), void (*new_cb)(struct sock *), void (**saved_cb)(struct sock *)) { /* only save once */ if (!*saved_cb) *saved_cb = *target_cb; *target_cb = new_cb; } /* restore target_cb to saved_cb, and reset saved_cb to NULL */ static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *), void (**saved_cb)(struct sock *)) { if (!*saved_cb) return; *target_cb = *saved_cb; *saved_cb = NULL; } extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ #define SMC_SYSTEMID_LEN 8 extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ #define ntohll(x) be64_to_cpu(x) #define htonll(x) cpu_to_be64(x) /* convert an u32 value into network byte order, store it into a 3 byte field */ static inline void hton24(u8 *net, u32 host) { __be32 t; t = cpu_to_be32(host); memcpy(net, ((u8 *)&t) + 1, 3); } /* convert a received 3 byte field into host byte order*/ static inline u32 ntoh24(u8 *net) { __be32 t = 0; memcpy(((u8 *)&t) + 1, net, 3); return be32_to_cpu(t); } #ifdef CONFIG_XFRM static inline bool using_ipsec(struct smc_sock *smc) { return (smc->clcsock->sk->sk_policy[0] || smc->clcsock->sk->sk_policy[1]) ? true : false; } #else static inline bool using_ipsec(struct smc_sock *smc) { return false; } #endif struct smc_gidlist; struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); void smc_close_non_accepted(struct sock *sk); void smc_fill_gid_list(struct smc_link_group *lgr, struct smc_gidlist *gidlist, struct smc_ib_device *known_dev, u8 *known_gid); /* smc handshake limitation interface for netlink */ int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb); int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info); int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info); static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag) { set_bit(flag, &sk->sk_flags); } #endif /* __SMC_H */
4784 4784 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 // SPDX-License-Identifier: GPL-2.0-only /* * ACPI device specific properties support. * * Copyright (C) 2014 - 2023, Intel Corporation * All rights reserved. * * Authors: Mika Westerberg <mika.westerberg@linux.intel.com> * Darren Hart <dvhart@linux.intel.com> * Rafael J. Wysocki <rafael.j.wysocki@intel.com> * Sakari Ailus <sakari.ailus@linux.intel.com> */ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/acpi.h> #include <linux/device.h> #include <linux/export.h> #include "internal.h" static int acpi_data_get_property_array(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj); /* * The GUIDs here are made equivalent to each other in order to avoid extra * complexity in the properties handling code, with the caveat that the * kernel will accept certain combinations of GUID and properties that are * not defined without a warning. For instance if any of the properties * from different GUID appear in a property list of another, it will be * accepted by the kernel. Firmware validation tools should catch these. * * References: * * [1] UEFI DSD Guide. * https://github.com/UEFI/DSD-Guide/blob/main/src/dsd-guide.adoc */ static const guid_t prp_guids[] = { /* ACPI _DSD device properties GUID [1]: daffd814-6eba-4d8c-8a91-bc9bbf4aa301 */ GUID_INIT(0xdaffd814, 0x6eba, 0x4d8c, 0x8a, 0x91, 0xbc, 0x9b, 0xbf, 0x4a, 0xa3, 0x01), /* Hotplug in D3 GUID: 6211e2c0-58a3-4af3-90e1-927a4e0c55a4 */ GUID_INIT(0x6211e2c0, 0x58a3, 0x4af3, 0x90, 0xe1, 0x92, 0x7a, 0x4e, 0x0c, 0x55, 0xa4), /* External facing port GUID: efcc06cc-73ac-4bc3-bff0-76143807c389 */ GUID_INIT(0xefcc06cc, 0x73ac, 0x4bc3, 0xbf, 0xf0, 0x76, 0x14, 0x38, 0x07, 0xc3, 0x89), /* Thunderbolt GUID for IMR_VALID: c44d002f-69f9-4e7d-a904-a7baabdf43f7 */ GUID_INIT(0xc44d002f, 0x69f9, 0x4e7d, 0xa9, 0x04, 0xa7, 0xba, 0xab, 0xdf, 0x43, 0xf7), /* Thunderbolt GUID for WAKE_SUPPORTED: 6c501103-c189-4296-ba72-9bf5a26ebe5d */ GUID_INIT(0x6c501103, 0xc189, 0x4296, 0xba, 0x72, 0x9b, 0xf5, 0xa2, 0x6e, 0xbe, 0x5d), /* Storage device needs D3 GUID: 5025030f-842f-4ab4-a561-99a5189762d0 */ GUID_INIT(0x5025030f, 0x842f, 0x4ab4, 0xa5, 0x61, 0x99, 0xa5, 0x18, 0x97, 0x62, 0xd0), }; /* ACPI _DSD data subnodes GUID [1]: dbb8e3e6-5886-4ba6-8795-1319f52a966b */ static const guid_t ads_guid = GUID_INIT(0xdbb8e3e6, 0x5886, 0x4ba6, 0x87, 0x95, 0x13, 0x19, 0xf5, 0x2a, 0x96, 0x6b); /* ACPI _DSD data buffer GUID [1]: edb12dd0-363d-4085-a3d2-49522ca160c4 */ static const guid_t buffer_prop_guid = GUID_INIT(0xedb12dd0, 0x363d, 0x4085, 0xa3, 0xd2, 0x49, 0x52, 0x2c, 0xa1, 0x60, 0xc4); static bool acpi_enumerate_nondev_subnodes(acpi_handle scope, union acpi_object *desc, struct acpi_device_data *data, struct fwnode_handle *parent); static bool acpi_extract_properties(acpi_handle handle, union acpi_object *desc, struct acpi_device_data *data); static bool acpi_nondev_subnode_extract(union acpi_object *desc, acpi_handle handle, const union acpi_object *link, struct list_head *list, struct fwnode_handle *parent) { struct acpi_data_node *dn; acpi_handle scope = NULL; bool result; if (acpi_graph_ignore_port(handle)) return false; dn = kzalloc(sizeof(*dn), GFP_KERNEL); if (!dn) return false; dn->name = link->package.elements[0].string.pointer; fwnode_init(&dn->fwnode, &acpi_data_fwnode_ops); dn->parent = parent; INIT_LIST_HEAD(&dn->data.properties); INIT_LIST_HEAD(&dn->data.subnodes); /* * The scope for the completion of relative pathname segments and * subnode object lookup is the one of the namespace node (device) * containing the object that has returned the package. That is, it's * the scope of that object's parent device. */ if (handle) acpi_get_parent(handle, &scope); /* * Extract properties from the _DSD-equivalent package pointed to by * desc and use scope (if not NULL) for the completion of relative * pathname segments. * * The extracted properties will be held in the new data node dn. */ result = acpi_extract_properties(scope, desc, &dn->data); /* * Look for subnodes in the _DSD-equivalent package pointed to by desc * and create child nodes of dn if there are any. */ if (acpi_enumerate_nondev_subnodes(scope, desc, &dn->data, &dn->fwnode)) result = true; if (!result) { kfree(dn); acpi_handle_debug(handle, "Invalid properties/subnodes data, skipping\n"); return false; } /* * This will be NULL if the desc package is embedded in an outer * _DSD-equivalent package and its scope cannot be determined. */ dn->handle = handle; dn->data.pointer = desc; list_add_tail(&dn->sibling, list); return true; } static bool acpi_nondev_subnode_ok(acpi_handle scope, const union acpi_object *link, struct list_head *list, struct fwnode_handle *parent) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; acpi_handle handle; acpi_status status; /* * If the scope is unknown, the _DSD-equivalent package being parsed * was embedded in an outer _DSD-equivalent package as a result of * direct evaluation of an object pointed to by a reference. In that * case, using a pathname as the target object pointer is invalid. */ if (!scope) return false; status = acpi_get_handle(scope, link->package.elements[1].string.pointer, &handle); if (ACPI_FAILURE(status)) return false; status = acpi_evaluate_object_typed(handle, NULL, NULL, &buf, ACPI_TYPE_PACKAGE); if (ACPI_FAILURE(status)) return false; if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list, parent)) return true; ACPI_FREE(buf.pointer); return false; } static bool acpi_add_nondev_subnodes(acpi_handle scope, union acpi_object *links, struct list_head *list, struct fwnode_handle *parent) { bool ret = false; int i; /* * Every element in the links package is expected to represent a link * to a non-device node in a tree containing device-specific data. */ for (i = 0; i < links->package.count; i++) { union acpi_object *link, *desc; bool result; link = &links->package.elements[i]; /* Only two elements allowed. */ if (link->package.count != 2) continue; /* The first one (the key) must be a string. */ if (link->package.elements[0].type != ACPI_TYPE_STRING) continue; /* The second one (the target) may be a string or a package. */ switch (link->package.elements[1].type) { case ACPI_TYPE_STRING: /* * The string is expected to be a full pathname or a * pathname segment relative to the given scope. That * pathname is expected to point to an object returning * a package that contains _DSD-equivalent information. */ result = acpi_nondev_subnode_ok(scope, link, list, parent); break; case ACPI_TYPE_PACKAGE: /* * This happens when a reference is used in AML to * point to the target. Since the target is expected * to be a named object, a reference to it will cause it * to be avaluated in place and its return package will * be embedded in the links package at the location of * the reference. * * The target package is expected to contain _DSD- * equivalent information, but the scope in which it * is located in the original AML is unknown. Thus * it cannot contain pathname segments represented as * strings because there is no way to build full * pathnames out of them. */ acpi_handle_debug(scope, "subnode %s: Unknown scope\n", link->package.elements[0].string.pointer); desc = &link->package.elements[1]; result = acpi_nondev_subnode_extract(desc, NULL, link, list, parent); break; case ACPI_TYPE_LOCAL_REFERENCE: /* * It is not expected to see any local references in * the links package because referencing a named object * should cause it to be evaluated in place. */ acpi_handle_info(scope, "subnode %s: Unexpected reference\n", link->package.elements[0].string.pointer); fallthrough; default: result = false; break; } ret = ret || result; } return ret; } static bool acpi_enumerate_nondev_subnodes(acpi_handle scope, union acpi_object *desc, struct acpi_device_data *data, struct fwnode_handle *parent) { int i; /* Look for the ACPI data subnodes GUID. */ for (i = 0; i < desc->package.count; i += 2) { const union acpi_object *guid; union acpi_object *links; guid = &desc->package.elements[i]; links = &desc->package.elements[i + 1]; /* * The first element must be a GUID and the second one must be * a package. */ if (guid->type != ACPI_TYPE_BUFFER || guid->buffer.length != 16 || links->type != ACPI_TYPE_PACKAGE) break; if (!guid_equal((guid_t *)guid->buffer.pointer, &ads_guid)) continue; return acpi_add_nondev_subnodes(scope, links, &data->subnodes, parent); } return false; } static bool acpi_property_value_ok(const union acpi_object *value) { int j; /* * The value must be an integer, a string, a reference, or a package * whose every element must be an integer, a string, or a reference. */ switch (value->type) { case ACPI_TYPE_INTEGER: case ACPI_TYPE_STRING: case ACPI_TYPE_LOCAL_REFERENCE: return true; case ACPI_TYPE_PACKAGE: for (j = 0; j < value->package.count; j++) switch (value->package.elements[j].type) { case ACPI_TYPE_INTEGER: case ACPI_TYPE_STRING: case ACPI_TYPE_LOCAL_REFERENCE: continue; default: return false; } return true; } return false; } static bool acpi_properties_format_valid(const union acpi_object *properties) { int i; for (i = 0; i < properties->package.count; i++) { const union acpi_object *property; property = &properties->package.elements[i]; /* * Only two elements allowed, the first one must be a string and * the second one has to satisfy certain conditions. */ if (property->package.count != 2 || property->package.elements[0].type != ACPI_TYPE_STRING || !acpi_property_value_ok(&property->package.elements[1])) return false; } return true; } static void acpi_init_of_compatible(struct acpi_device *adev) { const union acpi_object *of_compatible; int ret; ret = acpi_data_get_property_array(&adev->data, "compatible", ACPI_TYPE_STRING, &of_compatible); if (ret) { ret = acpi_dev_get_property(adev, "compatible", ACPI_TYPE_STRING, &of_compatible); if (ret) { struct acpi_device *parent; parent = acpi_dev_parent(adev); if (parent && parent->flags.of_compatible_ok) goto out; return; } } adev->data.of_compatible = of_compatible; out: adev->flags.of_compatible_ok = 1; } static bool acpi_is_property_guid(const guid_t *guid) { int i; for (i = 0; i < ARRAY_SIZE(prp_guids); i++) { if (guid_equal(guid, &prp_guids[i])) return true; } return false; } struct acpi_device_properties * acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, union acpi_object *properties) { struct acpi_device_properties *props; props = kzalloc(sizeof(*props), GFP_KERNEL); if (props) { INIT_LIST_HEAD(&props->list); props->guid = guid; props->properties = properties; list_add_tail(&props->list, &data->properties); } return props; } static void acpi_nondev_subnode_tag(acpi_handle handle, void *context) { } static void acpi_untie_nondev_subnodes(struct acpi_device_data *data) { struct acpi_data_node *dn; list_for_each_entry(dn, &data->subnodes, sibling) { if (!dn->handle) continue; acpi_detach_data(dn->handle, acpi_nondev_subnode_tag); acpi_untie_nondev_subnodes(&dn->data); } } static bool acpi_tie_nondev_subnodes(struct acpi_device_data *data) { struct acpi_data_node *dn; list_for_each_entry(dn, &data->subnodes, sibling) { acpi_status status; bool ret; if (!dn->handle) continue; status = acpi_attach_data(dn->handle, acpi_nondev_subnode_tag, dn); if (ACPI_FAILURE(status) && status != AE_ALREADY_EXISTS) { acpi_handle_err(dn->handle, "Can't tag data node\n"); return false; } ret = acpi_tie_nondev_subnodes(&dn->data); if (!ret) return ret; } return true; } static void acpi_data_add_buffer_props(acpi_handle handle, struct acpi_device_data *data, union acpi_object *properties) { struct acpi_device_properties *props; union acpi_object *package; size_t alloc_size; unsigned int i; u32 *count; if (check_mul_overflow((size_t)properties->package.count, sizeof(*package) + sizeof(void *), &alloc_size) || check_add_overflow(sizeof(*props) + sizeof(*package), alloc_size, &alloc_size)) { acpi_handle_warn(handle, "can't allocate memory for %u buffer props", properties->package.count); return; } props = kvzalloc(alloc_size, GFP_KERNEL); if (!props) return; props->guid = &buffer_prop_guid; props->bufs = (void *)(props + 1); props->properties = (void *)(props->bufs + properties->package.count); /* Outer package */ package = props->properties; package->type = ACPI_TYPE_PACKAGE; package->package.elements = package + 1; count = &package->package.count; *count = 0; /* Inner packages */ package++; for (i = 0; i < properties->package.count; i++) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; union acpi_object *property = &properties->package.elements[i]; union acpi_object *prop, *obj, *buf_obj; acpi_status status; if (property->type != ACPI_TYPE_PACKAGE || property->package.count != 2) { acpi_handle_warn(handle, "buffer property %u has %u entries\n", i, property->package.count); continue; } prop = &property->package.elements[0]; obj = &property->package.elements[1]; if (prop->type != ACPI_TYPE_STRING || obj->type != ACPI_TYPE_STRING) { acpi_handle_warn(handle, "wrong object types %u and %u\n", prop->type, obj->type); continue; } status = acpi_evaluate_object_typed(handle, obj->string.pointer, NULL, &buf, ACPI_TYPE_BUFFER); if (ACPI_FAILURE(status)) { acpi_handle_warn(handle, "can't evaluate \"%*pE\" as buffer\n", obj->string.length, obj->string.pointer); continue; } package->type = ACPI_TYPE_PACKAGE; package->package.elements = prop; package->package.count = 2; buf_obj = buf.pointer; /* Replace the string object with a buffer object */ obj->type = ACPI_TYPE_BUFFER; obj->buffer.length = buf_obj->buffer.length; obj->buffer.pointer = buf_obj->buffer.pointer; props->bufs[i] = buf.pointer; package++; (*count)++; } if (*count) list_add(&props->list, &data->properties); else kvfree(props); } static bool acpi_extract_properties(acpi_handle scope, union acpi_object *desc, struct acpi_device_data *data) { int i; if (desc->package.count % 2) return false; /* Look for the device properties GUID. */ for (i = 0; i < desc->package.count; i += 2) { const union acpi_object *guid; union acpi_object *properties; guid = &desc->package.elements[i]; properties = &desc->package.elements[i + 1]; /* * The first element must be a GUID and the second one must be * a package. */ if (guid->type != ACPI_TYPE_BUFFER || guid->buffer.length != 16 || properties->type != ACPI_TYPE_PACKAGE) break; if (guid_equal((guid_t *)guid->buffer.pointer, &buffer_prop_guid)) { acpi_data_add_buffer_props(scope, data, properties); continue; } if (!acpi_is_property_guid((guid_t *)guid->buffer.pointer)) continue; /* * We found the matching GUID. Now validate the format of the * package immediately following it. */ if (!acpi_properties_format_valid(properties)) continue; acpi_data_add_props(data, (const guid_t *)guid->buffer.pointer, properties); } return !list_empty(&data->properties); } void acpi_init_properties(struct acpi_device *adev) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; struct acpi_hardware_id *hwid; acpi_status status; bool acpi_of = false; INIT_LIST_HEAD(&adev->data.properties); INIT_LIST_HEAD(&adev->data.subnodes); if (!adev->handle) return; /* * Check if ACPI_DT_NAMESPACE_HID is present and inthat case we fill in * Device Tree compatible properties for this device. */ list_for_each_entry(hwid, &adev->pnp.ids, list) { if (!strcmp(hwid->id, ACPI_DT_NAMESPACE_HID)) { acpi_of = true; break; } } status = acpi_evaluate_object_typed(adev->handle, "_DSD", NULL, &buf, ACPI_TYPE_PACKAGE); if (ACPI_FAILURE(status)) goto out; if (acpi_extract_properties(adev->handle, buf.pointer, &adev->data)) { adev->data.pointer = buf.pointer; if (acpi_of) acpi_init_of_compatible(adev); } if (acpi_enumerate_nondev_subnodes(adev->handle, buf.pointer, &adev->data, acpi_fwnode_handle(adev))) adev->data.pointer = buf.pointer; if (!adev->data.pointer) { acpi_handle_debug(adev->handle, "Invalid _DSD data, skipping\n"); ACPI_FREE(buf.pointer); } else { if (!acpi_tie_nondev_subnodes(&adev->data)) acpi_untie_nondev_subnodes(&adev->data); } out: if (acpi_of && !adev->flags.of_compatible_ok) acpi_handle_info(adev->handle, ACPI_DT_NAMESPACE_HID " requires 'compatible' property\n"); if (!adev->data.pointer) acpi_extract_apple_properties(adev); } static void acpi_free_device_properties(struct list_head *list) { struct acpi_device_properties *props, *tmp; list_for_each_entry_safe(props, tmp, list, list) { u32 i; list_del(&props->list); /* Buffer data properties were separately allocated */ if (props->bufs) for (i = 0; i < props->properties->package.count; i++) ACPI_FREE(props->bufs[i]); kvfree(props); } } static void acpi_destroy_nondev_subnodes(struct list_head *list) { struct acpi_data_node *dn, *next; if (list_empty(list)) return; list_for_each_entry_safe_reverse(dn, next, list, sibling) { acpi_destroy_nondev_subnodes(&dn->data.subnodes); wait_for_completion(&dn->kobj_done); list_del(&dn->sibling); ACPI_FREE((void *)dn->data.pointer); acpi_free_device_properties(&dn->data.properties); kfree(dn); } } void acpi_free_properties(struct acpi_device *adev) { acpi_untie_nondev_subnodes(&adev->data); acpi_destroy_nondev_subnodes(&adev->data.subnodes); ACPI_FREE((void *)adev->data.pointer); adev->data.of_compatible = NULL; adev->data.pointer = NULL; acpi_free_device_properties(&adev->data.properties); } /** * acpi_data_get_property - return an ACPI property with given name * @data: ACPI device deta object to get the property from * @name: Name of the property * @type: Expected property type * @obj: Location to store the property value (if not %NULL) * * Look up a property with @name and store a pointer to the resulting ACPI * object at the location pointed to by @obj if found. * * Callers must not attempt to free the returned objects. These objects will be * freed by the ACPI core automatically during the removal of @data. * * Return: %0 if property with @name has been found (success), * %-EINVAL if the arguments are invalid, * %-EINVAL if the property doesn't exist, * %-EPROTO if the property value type doesn't match @type. */ static int acpi_data_get_property(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj) { const struct acpi_device_properties *props; if (!data || !name) return -EINVAL; if (!data->pointer || list_empty(&data->properties)) return -EINVAL; list_for_each_entry(props, &data->properties, list) { const union acpi_object *properties; unsigned int i; properties = props->properties; for (i = 0; i < properties->package.count; i++) { const union acpi_object *propname, *propvalue; const union acpi_object *property; property = &properties->package.elements[i]; propname = &property->package.elements[0]; propvalue = &property->package.elements[1]; if (!strcmp(name, propname->string.pointer)) { if (type != ACPI_TYPE_ANY && propvalue->type != type) return -EPROTO; if (obj) *obj = propvalue; return 0; } } } return -EINVAL; } /** * acpi_dev_get_property - return an ACPI property with given name. * @adev: ACPI device to get the property from. * @name: Name of the property. * @type: Expected property type. * @obj: Location to store the property value (if not %NULL). */ int acpi_dev_get_property(const struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj) { return adev ? acpi_data_get_property(&adev->data, name, type, obj) : -EINVAL; } EXPORT_SYMBOL_GPL(acpi_dev_get_property); static const struct acpi_device_data * acpi_device_data_of_node(const struct fwnode_handle *fwnode) { if (is_acpi_device_node(fwnode)) { const struct acpi_device *adev = to_acpi_device_node(fwnode); return &adev->data; } if (is_acpi_data_node(fwnode)) { const struct acpi_data_node *dn = to_acpi_data_node(fwnode); return &dn->data; } return NULL; } /** * acpi_node_prop_get - return an ACPI property with given name. * @fwnode: Firmware node to get the property from. * @propname: Name of the property. * @valptr: Location to store a pointer to the property value (if not %NULL). */ int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr) { return acpi_data_get_property(acpi_device_data_of_node(fwnode), propname, ACPI_TYPE_ANY, (const union acpi_object **)valptr); } /** * acpi_data_get_property_array - return an ACPI array property with given name * @data: ACPI data object to get the property from * @name: Name of the property * @type: Expected type of array elements * @obj: Location to store a pointer to the property value (if not NULL) * * Look up an array property with @name and store a pointer to the resulting * ACPI object at the location pointed to by @obj if found. * * Callers must not attempt to free the returned objects. Those objects will be * freed by the ACPI core automatically during the removal of @data. * * Return: %0 if array property (package) with @name has been found (success), * %-EINVAL if the arguments are invalid, * %-EINVAL if the property doesn't exist, * %-EPROTO if the property is not a package or the type of its elements * doesn't match @type. */ static int acpi_data_get_property_array(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj) { const union acpi_object *prop; int ret, i; ret = acpi_data_get_property(data, name, ACPI_TYPE_PACKAGE, &prop); if (ret) return ret; if (type != ACPI_TYPE_ANY) { /* Check that all elements are of correct type. */ for (i = 0; i < prop->package.count; i++) if (prop->package.elements[i].type != type) return -EPROTO; } if (obj) *obj = prop; return 0; } static struct fwnode_handle * acpi_fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { struct fwnode_handle *child; fwnode_for_each_child_node(fwnode, child) { if (is_acpi_data_node(child)) { if (acpi_data_node_match(child, childname)) return child; continue; } if (!strncmp(acpi_device_bid(to_acpi_device_node(child)), childname, ACPI_NAMESEG_SIZE)) return child; } return NULL; } static unsigned int acpi_fwnode_get_args_count(struct fwnode_handle *fwnode, const char *nargs_prop) { const struct acpi_device_data *data; const union acpi_object *obj; int ret; data = acpi_device_data_of_node(fwnode); if (!data) return 0; ret = acpi_data_get_property(data, nargs_prop, ACPI_TYPE_INTEGER, &obj); if (ret) return 0; return obj->integer.value; } static int acpi_get_ref_args(struct fwnode_reference_args *args, struct fwnode_handle *ref_fwnode, const char *nargs_prop, const union acpi_object **element, const union acpi_object *end, size_t num_args) { u32 nargs = 0, i; if (nargs_prop) num_args = acpi_fwnode_get_args_count(ref_fwnode, nargs_prop); /* * Assume the following integer elements are all args. Stop counting on * the first reference (possibly represented as a string) or end of the * package arguments. In case of neither reference, nor integer, return * an error, we can't parse it. */ for (i = 0; (*element) + i < end && i < num_args; i++) { acpi_object_type type = (*element)[i].type; if (type == ACPI_TYPE_LOCAL_REFERENCE || type == ACPI_TYPE_STRING) break; if (type == ACPI_TYPE_INTEGER) nargs++; else return -EINVAL; } if (nargs > NR_FWNODE_REFERENCE_ARGS) return -EINVAL; if (args) { args->fwnode = ref_fwnode; args->nargs = nargs; for (i = 0; i < nargs; i++) args->args[i] = (*element)[i].integer.value; } (*element) += nargs; return 0; } static struct fwnode_handle *acpi_parse_string_ref(const struct fwnode_handle *fwnode, const char *refstring) { acpi_handle scope, handle; struct acpi_data_node *dn; struct acpi_device *device; acpi_status status; if (is_acpi_device_node(fwnode)) { scope = to_acpi_device_node(fwnode)->handle; } else if (is_acpi_data_node(fwnode)) { scope = to_acpi_data_node(fwnode)->handle; } else { pr_debug("Bad node type for node %pfw\n", fwnode); return NULL; } status = acpi_get_handle(scope, refstring, &handle); if (ACPI_FAILURE(status)) { acpi_handle_debug(scope, "Unable to get an ACPI handle for %s\n", refstring); return NULL; } device = acpi_fetch_acpi_dev(handle); if (device) return acpi_fwnode_handle(device); status = acpi_get_data_full(handle, acpi_nondev_subnode_tag, (void **)&dn, NULL); if (ACPI_FAILURE(status) || !dn) { acpi_handle_debug(handle, "Subnode not found\n"); return NULL; } return &dn->fwnode; } static int acpi_fwnode_get_reference_args(const struct fwnode_handle *fwnode, const char *propname, const char *nargs_prop, unsigned int args_count, unsigned int index, struct fwnode_reference_args *args) { const union acpi_object *element, *end; const union acpi_object *obj; const struct acpi_device_data *data; struct fwnode_handle *ref_fwnode; struct acpi_device *device; int ret, idx = 0; data = acpi_device_data_of_node(fwnode); if (!data) return -ENOENT; ret = acpi_data_get_property(data, propname, ACPI_TYPE_ANY, &obj); if (ret) return ret == -EINVAL ? -ENOENT : -EINVAL; switch (obj->type) { case ACPI_TYPE_LOCAL_REFERENCE: /* Plain single reference without arguments. */ if (index) return -ENOENT; device = acpi_fetch_acpi_dev(obj->reference.handle); if (!device) return -EINVAL; if (!args) return 0; args->fwnode = acpi_fwnode_handle(device); args->nargs = 0; return 0; case ACPI_TYPE_STRING: if (index) return -ENOENT; ref_fwnode = acpi_parse_string_ref(fwnode, obj->string.pointer); if (!ref_fwnode) return -EINVAL; args->fwnode = ref_fwnode; args->nargs = 0; return 0; case ACPI_TYPE_PACKAGE: /* * If it is not a single reference, then it is a package of * references, followed by number of ints as follows: * * Package () { REF, INT, REF, INT, INT } * * Here, REF may be either a local reference or a string. The * index argument is then used to determine which reference the * caller wants (along with the arguments). */ break; default: return -EINVAL; } if (index >= obj->package.count) return -ENOENT; element = obj->package.elements; end = element + obj->package.count; while (element < end) { switch (element->type) { case ACPI_TYPE_LOCAL_REFERENCE: device = acpi_fetch_acpi_dev(element->reference.handle); if (!device) return -EINVAL; element++; ret = acpi_get_ref_args(idx == index ? args : NULL, acpi_fwnode_handle(device), nargs_prop, &element, end, args_count); if (ret < 0) return ret; if (idx == index) return 0; break; case ACPI_TYPE_STRING: ref_fwnode = acpi_parse_string_ref(fwnode, element->string.pointer); if (!ref_fwnode) return -EINVAL; element++; ret = acpi_get_ref_args(idx == index ? args : NULL, ref_fwnode, nargs_prop, &element, end, args_count); if (ret < 0) return ret; if (idx == index) return 0; break; case ACPI_TYPE_INTEGER: if (idx == index) return -ENOENT; element++; break; default: return -EINVAL; } idx++; } return -ENOENT; } /** * __acpi_node_get_property_reference - returns handle to the referenced object * @fwnode: Firmware node to get the property from * @propname: Name of the property * @index: Index of the reference to return * @num_args: Maximum number of arguments after each reference * @args: Location to store the returned reference with optional arguments * (may be NULL) * * Find property with @name, verifify that it is a package containing at least * one object reference and if so, store the ACPI device object pointer to the * target object in @args->adev. If the reference includes arguments, store * them in the @args->args[] array. * * If there's more than one reference in the property value package, @index is * used to select the one to return. * * It is possible to leave holes in the property value set like in the * example below: * * Package () { * "cs-gpios", * Package () { * ^GPIO, 19, 0, 0, * ^GPIO, 20, 0, 0, * 0, * ^GPIO, 21, 0, 0, * } * } * * Calling this function with index %2 or index %3 return %-ENOENT. If the * property does not contain any more values %-ENOENT is returned. The NULL * entry must be single integer and preferably contain value %0. * * Return: %0 on success, negative error code on failure. */ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *propname, size_t index, size_t num_args, struct fwnode_reference_args *args) { return acpi_fwnode_get_reference_args(fwnode, propname, NULL, num_args, index, args); } EXPORT_SYMBOL_GPL(__acpi_node_get_property_reference); static int acpi_data_prop_read_single(const struct acpi_device_data *data, const char *propname, enum dev_prop_type proptype, void *val) { const union acpi_object *obj; int ret = 0; if (proptype >= DEV_PROP_U8 && proptype <= DEV_PROP_U64) ret = acpi_data_get_property(data, propname, ACPI_TYPE_INTEGER, &obj); else if (proptype == DEV_PROP_STRING) ret = acpi_data_get_property(data, propname, ACPI_TYPE_STRING, &obj); if (ret) return ret; switch (proptype) { case DEV_PROP_U8: if (obj->integer.value > U8_MAX) return -EOVERFLOW; if (val) *(u8 *)val = obj->integer.value; break; case DEV_PROP_U16: if (obj->integer.value > U16_MAX) return -EOVERFLOW; if (val) *(u16 *)val = obj->integer.value; break; case DEV_PROP_U32: if (obj->integer.value > U32_MAX) return -EOVERFLOW; if (val) *(u32 *)val = obj->integer.value; break; case DEV_PROP_U64: if (val) *(u64 *)val = obj->integer.value; break; case DEV_PROP_STRING: if (val) *(char **)val = obj->string.pointer; return 1; default: return -EINVAL; } /* When no storage provided return number of available values */ return val ? 0 : 1; } #define acpi_copy_property_array_uint(items, val, nval) \ ({ \ typeof(items) __items = items; \ typeof(val) __val = val; \ typeof(nval) __nval = nval; \ size_t i; \ int ret = 0; \ \ for (i = 0; i < __nval; i++) { \ if (__items->type == ACPI_TYPE_BUFFER) { \ __val[i] = __items->buffer.pointer[i]; \ continue; \ } \ if (__items[i].type != ACPI_TYPE_INTEGER) { \ ret = -EPROTO; \ break; \ } \ if (__items[i].integer.value > _Generic(__val, \ u8 *: U8_MAX, \ u16 *: U16_MAX, \ u32 *: U32_MAX, \ u64 *: U64_MAX)) { \ ret = -EOVERFLOW; \ break; \ } \ \ __val[i] = __items[i].integer.value; \ } \ ret; \ }) static int acpi_copy_property_array_string(const union acpi_object *items, char **val, size_t nval) { int i; for (i = 0; i < nval; i++) { if (items[i].type != ACPI_TYPE_STRING) return -EPROTO; val[i] = items[i].string.pointer; } return nval; } static int acpi_data_prop_read(const struct acpi_device_data *data, const char *propname, enum dev_prop_type proptype, void *val, size_t nval) { const union acpi_object *obj; const union acpi_object *items; int ret; if (nval == 1 || !val) { ret = acpi_data_prop_read_single(data, propname, proptype, val); /* * The overflow error means that the property is there and it is * single-value, but its type does not match, so return. */ if (ret >= 0 || ret == -EOVERFLOW) return ret; /* * Reading this property as a single-value one failed, but its * value may still be represented as one-element array, so * continue. */ } ret = acpi_data_get_property_array(data, propname, ACPI_TYPE_ANY, &obj); if (ret && proptype >= DEV_PROP_U8 && proptype <= DEV_PROP_U64) ret = acpi_data_get_property(data, propname, ACPI_TYPE_BUFFER, &obj); if (ret) return ret; if (!val) { if (obj->type == ACPI_TYPE_BUFFER) return obj->buffer.length; return obj->package.count; } switch (proptype) { case DEV_PROP_STRING: break; default: if (obj->type == ACPI_TYPE_BUFFER) { if (nval > obj->buffer.length) return -EOVERFLOW; } else { if (nval > obj->package.count) return -EOVERFLOW; } break; } if (obj->type == ACPI_TYPE_BUFFER) { if (proptype != DEV_PROP_U8) return -EPROTO; items = obj; } else { items = obj->package.elements; } switch (proptype) { case DEV_PROP_U8: ret = acpi_copy_property_array_uint(items, (u8 *)val, nval); break; case DEV_PROP_U16: ret = acpi_copy_property_array_uint(items, (u16 *)val, nval); break; case DEV_PROP_U32: ret = acpi_copy_property_array_uint(items, (u32 *)val, nval); break; case DEV_PROP_U64: ret = acpi_copy_property_array_uint(items, (u64 *)val, nval); break; case DEV_PROP_STRING: nval = min_t(u32, nval, obj->package.count); if (nval == 0) return -ENODATA; ret = acpi_copy_property_array_string(items, (char **)val, nval); break; default: ret = -EINVAL; break; } return ret; } /** * acpi_node_prop_read - retrieve the value of an ACPI property with given name. * @fwnode: Firmware node to get the property from. * @propname: Name of the property. * @proptype: Expected property type. * @val: Location to store the property value (if not %NULL). * @nval: Size of the array pointed to by @val. * * If @val is %NULL, return the number of array elements comprising the value * of the property. Otherwise, read at most @nval values to the array at the * location pointed to by @val. */ static int acpi_node_prop_read(const struct fwnode_handle *fwnode, const char *propname, enum dev_prop_type proptype, void *val, size_t nval) { return acpi_data_prop_read(acpi_device_data_of_node(fwnode), propname, proptype, val, nval); } static int stop_on_next(struct acpi_device *adev, void *data) { struct acpi_device **ret_p = data; if (!*ret_p) { *ret_p = adev; return 1; } /* Skip until the "previous" object is found. */ if (*ret_p == adev) *ret_p = NULL; return 0; } /** * acpi_get_next_subnode - Return the next child node handle for a fwnode * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the device's child nodes or a null handle. */ struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { struct acpi_device *adev = to_acpi_device_node(fwnode); if ((!child || is_acpi_device_node(child)) && adev) { struct acpi_device *child_adev = to_acpi_device_node(child); acpi_dev_for_each_child(adev, stop_on_next, &child_adev); if (child_adev) return acpi_fwnode_handle(child_adev); child = NULL; } if (!child || is_acpi_data_node(child)) { const struct acpi_data_node *data = to_acpi_data_node(fwnode); const struct list_head *head; struct list_head *next; struct acpi_data_node *dn; /* * We can have a combination of device and data nodes, e.g. with * hierarchical _DSD properties. Make sure the adev pointer is * restored before going through data nodes, otherwise we will * be looking for data_nodes below the last device found instead * of the common fwnode shared by device_nodes and data_nodes. */ adev = to_acpi_device_node(fwnode); if (adev) head = &adev->data.subnodes; else if (data) head = &data->data.subnodes; else return NULL; if (list_empty(head)) return NULL; if (child) { dn = to_acpi_data_node(child); next = dn->sibling.next; if (next == head) return NULL; dn = list_entry(next, struct acpi_data_node, sibling); } else { dn = list_first_entry(head, struct acpi_data_node, sibling); } return &dn->fwnode; } return NULL; } /* * acpi_get_next_present_subnode - Return the next present child node handle * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the device's child nodes or a null handle. * * Like acpi_get_next_subnode(), but the device nodes returned by * acpi_get_next_present_subnode() are guaranteed to be present. * * Returns: The fwnode handle of the next present sub-node. */ static struct fwnode_handle * acpi_get_next_present_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { do { child = acpi_get_next_subnode(fwnode, child); } while (is_acpi_device_node(child) && !acpi_device_is_present(to_acpi_device_node(child))); return child; } /** * acpi_node_get_parent - Return parent fwnode of this fwnode * @fwnode: Firmware node whose parent to get * * Returns parent node of an ACPI device or data firmware node or %NULL if * not available. */ static struct fwnode_handle * acpi_node_get_parent(const struct fwnode_handle *fwnode) { if (is_acpi_data_node(fwnode)) { /* All data nodes have parent pointer so just return that */ return to_acpi_data_node(fwnode)->parent; } if (is_acpi_device_node(fwnode)) { struct acpi_device *parent; parent = acpi_dev_parent(to_acpi_device_node(fwnode)); if (parent) return acpi_fwnode_handle(parent); } return NULL; } /* * Return true if the node is an ACPI graph node. Called on either ports * or endpoints. */ static bool is_acpi_graph_node(struct fwnode_handle *fwnode, const char *str) { unsigned int len = strlen(str); const char *name; if (!len || !is_acpi_data_node(fwnode)) return false; name = to_acpi_data_node(fwnode)->name; return (fwnode_property_present(fwnode, "reg") && !strncmp(name, str, len) && name[len] == '@') || fwnode_property_present(fwnode, str); } /** * acpi_graph_get_next_endpoint - Get next endpoint ACPI firmware node * @fwnode: Pointer to the parent firmware node * @prev: Previous endpoint node or %NULL to get the first * * Looks up next endpoint ACPI firmware node below a given @fwnode. Returns * %NULL if there is no next endpoint or in case of error. In case of success * the next endpoint is returned. */ static struct fwnode_handle *acpi_graph_get_next_endpoint( const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { struct fwnode_handle *port = NULL; struct fwnode_handle *endpoint; if (!prev) { do { port = fwnode_get_next_child_node(fwnode, port); /* * The names of the port nodes begin with "port@" * followed by the number of the port node and they also * have a "reg" property that also has the number of the * port node. For compatibility reasons a node is also * recognised as a port node from the "port" property. */ if (is_acpi_graph_node(port, "port")) break; } while (port); } else { port = fwnode_get_parent(prev); } if (!port) return NULL; endpoint = fwnode_get_next_child_node(port, prev); while (!endpoint) { port = fwnode_get_next_child_node(fwnode, port); if (!port) break; if (is_acpi_graph_node(port, "port")) endpoint = fwnode_get_next_child_node(port, NULL); } /* * The names of the endpoint nodes begin with "endpoint@" followed by * the number of the endpoint node and they also have a "reg" property * that also has the number of the endpoint node. For compatibility * reasons a node is also recognised as an endpoint node from the * "endpoint" property. */ if (!is_acpi_graph_node(endpoint, "endpoint")) return NULL; return endpoint; } /** * acpi_graph_get_child_prop_value - Return a child with a given property value * @fwnode: device fwnode * @prop_name: The name of the property to look for * @val: the desired property value * * Return the port node corresponding to a given port number. Returns * the child node on success, NULL otherwise. */ static struct fwnode_handle *acpi_graph_get_child_prop_value( const struct fwnode_handle *fwnode, const char *prop_name, unsigned int val) { struct fwnode_handle *child; fwnode_for_each_child_node(fwnode, child) { u32 nr; if (fwnode_property_read_u32(child, prop_name, &nr)) continue; if (val == nr) return child; } return NULL; } /** * acpi_graph_get_remote_endpoint - Parses and returns remote end of an endpoint * @__fwnode: Endpoint firmware node pointing to a remote device * * Returns the remote endpoint corresponding to @__fwnode. NULL on error. */ static struct fwnode_handle * acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode) { struct fwnode_handle *fwnode; unsigned int port_nr, endpoint_nr; struct fwnode_reference_args args; int ret; memset(&args, 0, sizeof(args)); ret = acpi_node_get_property_reference(__fwnode, "remote-endpoint", 0, &args); if (ret) return NULL; /* Direct endpoint reference? */ if (!is_acpi_device_node(args.fwnode)) return args.nargs ? NULL : args.fwnode; /* * Always require two arguments with the reference: port and * endpoint indices. */ if (args.nargs != 2) return NULL; fwnode = args.fwnode; port_nr = args.args[0]; endpoint_nr = args.args[1]; fwnode = acpi_graph_get_child_prop_value(fwnode, "port", port_nr); return acpi_graph_get_child_prop_value(fwnode, "endpoint", endpoint_nr); } static bool acpi_fwnode_device_is_available(const struct fwnode_handle *fwnode) { if (!is_acpi_device_node(fwnode)) return true; return acpi_device_is_present(to_acpi_device_node(fwnode)); } static const void * acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode, const struct device *dev) { return acpi_device_get_match_data(dev); } static bool acpi_fwnode_device_dma_supported(const struct fwnode_handle *fwnode) { return acpi_dma_supported(to_acpi_device_node(fwnode)); } static enum dev_dma_attr acpi_fwnode_device_get_dma_attr(const struct fwnode_handle *fwnode) { return acpi_get_dma_attr(to_acpi_device_node(fwnode)); } static bool acpi_fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { return !acpi_node_prop_get(fwnode, propname, NULL); } static int acpi_fwnode_property_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { enum dev_prop_type type; switch (elem_size) { case sizeof(u8): type = DEV_PROP_U8; break; case sizeof(u16): type = DEV_PROP_U16; break; case sizeof(u32): type = DEV_PROP_U32; break; case sizeof(u64): type = DEV_PROP_U64; break; default: return -ENXIO; } return acpi_node_prop_read(fwnode, propname, type, val, nval); } static int acpi_fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { return acpi_node_prop_read(fwnode, propname, DEV_PROP_STRING, val, nval); } static const char *acpi_fwnode_get_name(const struct fwnode_handle *fwnode) { const struct acpi_device *adev; struct fwnode_handle *parent; /* Is this the root node? */ parent = fwnode_get_parent(fwnode); if (!parent) return "\\"; fwnode_handle_put(parent); if (is_acpi_data_node(fwnode)) { const struct acpi_data_node *dn = to_acpi_data_node(fwnode); return dn->name; } adev = to_acpi_device_node(fwnode); if (WARN_ON(!adev)) return NULL; return acpi_device_bid(adev); } static const char * acpi_fwnode_get_name_prefix(const struct fwnode_handle *fwnode) { struct fwnode_handle *parent; /* Is this the root node? */ parent = fwnode_get_parent(fwnode); if (!parent) return ""; /* Is this 2nd node from the root? */ parent = fwnode_get_next_parent(parent); if (!parent) return ""; fwnode_handle_put(parent); /* ACPI device or data node. */ return "."; } static struct fwnode_handle * acpi_fwnode_get_parent(struct fwnode_handle *fwnode) { return acpi_node_get_parent(fwnode); } static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { struct fwnode_handle *port_fwnode = fwnode_get_parent(fwnode); endpoint->local_fwnode = fwnode; if (fwnode_property_read_u32(port_fwnode, "reg", &endpoint->port)) fwnode_property_read_u32(port_fwnode, "port", &endpoint->port); if (fwnode_property_read_u32(fwnode, "reg", &endpoint->id)) fwnode_property_read_u32(fwnode, "endpoint", &endpoint->id); return 0; } static int acpi_fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index) { struct resource res; int ret; ret = acpi_irq_get(ACPI_HANDLE_FWNODE(fwnode), index, &res); if (ret) return ret; return res.start; } #define DECLARE_ACPI_FWNODE_OPS(ops) \ const struct fwnode_operations ops = { \ .device_is_available = acpi_fwnode_device_is_available, \ .device_get_match_data = acpi_fwnode_device_get_match_data, \ .device_dma_supported = \ acpi_fwnode_device_dma_supported, \ .device_get_dma_attr = acpi_fwnode_device_get_dma_attr, \ .property_present = acpi_fwnode_property_present, \ .property_read_bool = acpi_fwnode_property_present, \ .property_read_int_array = \ acpi_fwnode_property_read_int_array, \ .property_read_string_array = \ acpi_fwnode_property_read_string_array, \ .get_parent = acpi_node_get_parent, \ .get_next_child_node = acpi_get_next_present_subnode, \ .get_named_child_node = acpi_fwnode_get_named_child_node, \ .get_name = acpi_fwnode_get_name, \ .get_name_prefix = acpi_fwnode_get_name_prefix, \ .get_reference_args = acpi_fwnode_get_reference_args, \ .graph_get_next_endpoint = \ acpi_graph_get_next_endpoint, \ .graph_get_remote_endpoint = \ acpi_graph_get_remote_endpoint, \ .graph_get_port_parent = acpi_fwnode_get_parent, \ .graph_parse_endpoint = acpi_fwnode_graph_parse_endpoint, \ .irq_get = acpi_fwnode_irq_get, \ }; \ EXPORT_SYMBOL_GPL(ops) DECLARE_ACPI_FWNODE_OPS(acpi_device_fwnode_ops); DECLARE_ACPI_FWNODE_OPS(acpi_data_fwnode_ops); const struct fwnode_operations acpi_static_fwnode_ops; bool is_acpi_device_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_device_fwnode_ops; } EXPORT_SYMBOL(is_acpi_device_node); bool is_acpi_data_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops; } EXPORT_SYMBOL(is_acpi_data_node);
5 1 4 2 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel module to match various things tied to sockets associated with * locally generated outgoing packets. * * (C) 2000 Marc Boucher <marc@mbsi.ca> * * Copyright © CC Computer Consultants GmbH, 2007 - 2008 */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/file.h> #include <linux/cred.h> #include <net/sock.h> #include <net/inet_sock.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_owner.h> static int owner_check(const struct xt_mtchk_param *par) { struct xt_owner_match_info *info = par->matchinfo; struct net *net = par->net; if (info->match & ~XT_OWNER_MASK) return -EINVAL; /* Only allow the common case where the userns of the writer * matches the userns of the network namespace. */ if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) && (current_user_ns() != net->user_ns)) return -EINVAL; /* Ensure the uids are valid */ if (info->match & XT_OWNER_UID) { kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); if (!uid_valid(uid_min) || !uid_valid(uid_max) || (info->uid_max < info->uid_min) || uid_lt(uid_max, uid_min)) { return -EINVAL; } } /* Ensure the gids are valid */ if (info->match & XT_OWNER_GID) { kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); if (!gid_valid(gid_min) || !gid_valid(gid_max) || (info->gid_max < info->gid_min) || gid_lt(gid_max, gid_min)) { return -EINVAL; } } return 0; } static bool owner_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_owner_match_info *info = par->matchinfo; const struct file *filp; struct sock *sk = skb_to_full_sk(skb); struct net *net = xt_net(par); if (!sk || !sk->sk_socket || !net_eq(net, sock_net(sk))) return (info->match ^ info->invert) == 0; else if (info->match & info->invert & XT_OWNER_SOCKET) /* * Socket exists but user wanted ! --socket-exists. * (Single ampersands intended.) */ return false; read_lock_bh(&sk->sk_callback_lock); filp = sk->sk_socket ? sk->sk_socket->file : NULL; if (filp == NULL) { read_unlock_bh(&sk->sk_callback_lock); return ((info->match ^ info->invert) & (XT_OWNER_UID | XT_OWNER_GID)) == 0; } if (info->match & XT_OWNER_UID) { kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); if ((uid_gte(filp->f_cred->fsuid, uid_min) && uid_lte(filp->f_cred->fsuid, uid_max)) ^ !(info->invert & XT_OWNER_UID)) { read_unlock_bh(&sk->sk_callback_lock); return false; } } if (info->match & XT_OWNER_GID) { unsigned int i, match = false; kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); struct group_info *gi = filp->f_cred->group_info; if (gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) match = true; if (!match && (info->match & XT_OWNER_SUPPL_GROUPS) && gi) { for (i = 0; i < gi->ngroups; ++i) { kgid_t group = gi->gid[i]; if (gid_gte(group, gid_min) && gid_lte(group, gid_max)) { match = true; break; } } } if (match ^ !(info->invert & XT_OWNER_GID)) { read_unlock_bh(&sk->sk_callback_lock); return false; } } read_unlock_bh(&sk->sk_callback_lock); return true; } static struct xt_match owner_mt_reg __read_mostly = { .name = "owner", .revision = 1, .family = NFPROTO_UNSPEC, .checkentry = owner_check, .match = owner_mt, .matchsize = sizeof(struct xt_owner_match_info), .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING), .me = THIS_MODULE, }; static int __init owner_mt_init(void) { return xt_register_match(&owner_mt_reg); } static void __exit owner_mt_exit(void) { xt_unregister_match(&owner_mt_reg); } module_init(owner_mt_init); module_exit(owner_mt_exit); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: socket owner matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_owner"); MODULE_ALIAS("ip6t_owner");
535 535 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright 2020 NXP */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <net/act_api.h> #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_gate.h> #include <net/tc_wrapper.h> static struct tc_action_ops act_gate_ops; static ktime_t gate_get_time(struct tcf_gate *gact) { ktime_t mono = ktime_get(); switch (gact->tk_offset) { case TK_OFFS_MAX: return mono; default: return ktime_mono_to_any(mono, gact->tk_offset); } return KTIME_MAX; } static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start) { struct tcf_gate_params *param = &gact->param; ktime_t now, base, cycle; u64 n; base = ns_to_ktime(param->tcfg_basetime); now = gate_get_time(gact); if (ktime_after(base, now)) { *start = base; return; } cycle = param->tcfg_cycletime; n = div64_u64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); } static void gate_start_timer(struct tcf_gate *gact, ktime_t start) { ktime_t expires; expires = hrtimer_get_expires(&gact->hitimer); if (expires == 0) expires = KTIME_MAX; start = min_t(ktime_t, start, expires); hrtimer_start(&gact->hitimer, start, HRTIMER_MODE_ABS_SOFT); } static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) { struct tcf_gate *gact = container_of(timer, struct tcf_gate, hitimer); struct tcf_gate_params *p = &gact->param; struct tcfg_gate_entry *next; ktime_t close_time, now; spin_lock(&gact->tcf_lock); next = gact->next_entry; /* cycle start, clear pending bit, clear total octets */ gact->current_gate_status = next->gate_state ? GATE_ACT_GATE_OPEN : 0; gact->current_entry_octets = 0; gact->current_max_octets = next->maxoctets; gact->current_close_time = ktime_add_ns(gact->current_close_time, next->interval); close_time = gact->current_close_time; if (list_is_last(&next->list, &p->entries)) next = list_first_entry(&p->entries, struct tcfg_gate_entry, list); else next = list_next_entry(next, list); now = gate_get_time(gact); if (ktime_after(now, close_time)) { ktime_t cycle, base; u64 n; cycle = p->tcfg_cycletime; base = ns_to_ktime(p->tcfg_basetime); n = div64_u64(ktime_sub_ns(now, base), cycle); close_time = ktime_add_ns(base, (n + 1) * cycle); } gact->next_entry = next; hrtimer_set_expires(&gact->hitimer, close_time); spin_unlock(&gact->tcf_lock); return HRTIMER_RESTART; } TC_INDIRECT_SCOPE int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_gate *gact = to_gate(a); int action = READ_ONCE(gact->tcf_action); tcf_lastuse_update(&gact->tcf_tm); tcf_action_update_bstats(&gact->common, skb); spin_lock(&gact->tcf_lock); if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) { spin_unlock(&gact->tcf_lock); return action; } if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) { spin_unlock(&gact->tcf_lock); goto drop; } if (gact->current_max_octets >= 0) { gact->current_entry_octets += qdisc_pkt_len(skb); if (gact->current_entry_octets > gact->current_max_octets) { spin_unlock(&gact->tcf_lock); goto overlimit; } } spin_unlock(&gact->tcf_lock); return action; overlimit: tcf_action_inc_overlimit_qstats(&gact->common); drop: tcf_action_inc_drop_qstats(&gact->common); return TC_ACT_SHOT; } static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = { [TCA_GATE_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_GATE_ENTRY_GATE] = { .type = NLA_FLAG }, [TCA_GATE_ENTRY_INTERVAL] = { .type = NLA_U32 }, [TCA_GATE_ENTRY_IPV] = { .type = NLA_S32 }, [TCA_GATE_ENTRY_MAX_OCTETS] = { .type = NLA_S32 }, }; static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = { [TCA_GATE_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_gate)), [TCA_GATE_PRIORITY] = { .type = NLA_S32 }, [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_GATE_BASE_TIME] = { .type = NLA_U64 }, [TCA_GATE_CYCLE_TIME] = { .type = NLA_U64 }, [TCA_GATE_CYCLE_TIME_EXT] = { .type = NLA_U64 }, [TCA_GATE_FLAGS] = { .type = NLA_U32 }, [TCA_GATE_CLOCKID] = { .type = NLA_S32 }, }; static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry, struct netlink_ext_ack *extack) { u32 interval = 0; entry->gate_state = nla_get_flag(tb[TCA_GATE_ENTRY_GATE]); if (tb[TCA_GATE_ENTRY_INTERVAL]) interval = nla_get_u32(tb[TCA_GATE_ENTRY_INTERVAL]); if (interval == 0) { NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL; } entry->interval = interval; entry->ipv = nla_get_s32_default(tb[TCA_GATE_ENTRY_IPV], -1); entry->maxoctets = nla_get_s32_default(tb[TCA_GATE_ENTRY_MAX_OCTETS], -1); return 0; } static int parse_gate_entry(struct nlattr *n, struct tcfg_gate_entry *entry, int index, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_GATE_ENTRY_MAX + 1] = { }; int err; err = nla_parse_nested(tb, TCA_GATE_ENTRY_MAX, n, entry_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; } entry->index = index; return fill_gate_entry(tb, entry, extack); } static void release_entry_list(struct list_head *entries) { struct tcfg_gate_entry *entry, *e; list_for_each_entry_safe(entry, e, entries, list) { list_del(&entry->list); kfree(entry); } } static int parse_gate_list(struct nlattr *list_attr, struct tcf_gate_params *sched, struct netlink_ext_ack *extack) { struct tcfg_gate_entry *entry; struct nlattr *n; int err, rem; int i = 0; if (!list_attr) return -EINVAL; nla_for_each_nested(n, list_attr, rem) { if (nla_type(n) != TCA_GATE_ONE_ENTRY) { NL_SET_ERR_MSG(extack, "Attribute isn't type 'entry'"); continue; } entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); err = -ENOMEM; goto release_list; } err = parse_gate_entry(n, entry, i, extack); if (err < 0) { kfree(entry); goto release_list; } list_add_tail(&entry->list, &sched->entries); i++; } sched->num_entries = i; return i; release_list: release_entry_list(&sched->entries); return err; } static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, enum tk_offsets tko, s32 clockid, bool do_init) { if (!do_init) { if (basetime == gact->param.tcfg_basetime && tko == gact->tk_offset && clockid == gact->param.tcfg_clockid) return; spin_unlock_bh(&gact->tcf_lock); hrtimer_cancel(&gact->hitimer); spin_lock_bh(&gact->tcf_lock); } gact->param.tcfg_basetime = basetime; gact->param.tcfg_clockid = clockid; gact->tk_offset = tko; hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, HRTIMER_MODE_ABS_SOFT); } static int tcf_gate_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); enum tk_offsets tk_offset = TK_OFFS_TAI; bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GATE_MAX + 1]; struct tcf_chain *goto_ch = NULL; u64 cycletime = 0, basetime = 0; struct tcf_gate_params *p; s32 clockid = CLOCK_TAI; struct tcf_gate *gact; struct tc_gate *parm; int ret = 0, err; u32 gflags = 0; s32 prio = -1; ktime_t start; u32 index; if (!nla) return -EINVAL; err = nla_parse_nested(tb, TCA_GATE_MAX, nla, gate_policy, extack); if (err < 0) return err; if (!tb[TCA_GATE_PARMS]) return -EINVAL; if (tb[TCA_GATE_CLOCKID]) { clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); switch (clockid) { case CLOCK_REALTIME: tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); return -EINVAL; } } parm = nla_data(tb[TCA_GATE_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; if (err && bind) return ACT_P_BOUND; if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_gate_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); if (tb[TCA_GATE_BASE_TIME]) basetime = nla_get_u64(tb[TCA_GATE_BASE_TIME]); if (tb[TCA_GATE_FLAGS]) gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); gact = to_gate(*a); if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&gact->param.entries); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; spin_lock_bh(&gact->tcf_lock); p = &gact->param; if (tb[TCA_GATE_CYCLE_TIME]) cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); if (tb[TCA_GATE_ENTRY_LIST]) { err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); if (err < 0) goto chain_put; } if (!cycletime) { struct tcfg_gate_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &p->entries, list) cycle = ktime_add_ns(cycle, entry->interval); cycletime = cycle; if (!cycletime) { err = -EINVAL; goto chain_put; } } p->tcfg_cycletime = cycletime; if (tb[TCA_GATE_CYCLE_TIME_EXT]) p->tcfg_cycletime_ext = nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); gate_setup_timer(gact, basetime, tk_offset, clockid, ret == ACT_P_CREATED); p->tcfg_priority = prio; p->tcfg_flags = gflags; gate_get_start_time(gact, &start); gact->current_close_time = start; gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; gact->next_entry = list_first_entry(&p->entries, struct tcfg_gate_entry, list); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); gate_start_timer(gact, start); spin_unlock_bh(&gact->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; chain_put: spin_unlock_bh(&gact->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: /* action is not inserted in any list: it's safe to init hitimer * without taking tcf_lock. */ if (ret == ACT_P_CREATED) gate_setup_timer(gact, gact->param.tcfg_basetime, gact->tk_offset, gact->param.tcfg_clockid, true); tcf_idr_release(*a, bind); return err; } static void tcf_gate_cleanup(struct tc_action *a) { struct tcf_gate *gact = to_gate(a); struct tcf_gate_params *p; p = &gact->param; hrtimer_cancel(&gact->hitimer); release_entry_list(&p->entries); } static int dumping_entry(struct sk_buff *skb, struct tcfg_gate_entry *entry) { struct nlattr *item; item = nla_nest_start_noflag(skb, TCA_GATE_ONE_ENTRY); if (!item) return -ENOSPC; if (nla_put_u32(skb, TCA_GATE_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (entry->gate_state && nla_put_flag(skb, TCA_GATE_ENTRY_GATE)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GATE_ENTRY_INTERVAL, entry->interval)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_ENTRY_MAX_OCTETS, entry->maxoctets)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_ENTRY_IPV, entry->ipv)) goto nla_put_failure; return nla_nest_end(skb, item); nla_put_failure: nla_nest_cancel(skb, item); return -1; } static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_gate *gact = to_gate(a); struct tc_gate opt = { .index = gact->tcf_index, .refcnt = refcount_read(&gact->tcf_refcnt) - ref, .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, }; struct tcfg_gate_entry *entry; struct tcf_gate_params *p; struct nlattr *entry_list; struct tcf_t t; spin_lock_bh(&gact->tcf_lock); opt.action = gact->tcf_action; p = &gact->param; if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_BASE_TIME, p->tcfg_basetime, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME, p->tcfg_cycletime, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME_EXT, p->tcfg_cycletime_ext, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_CLOCKID, p->tcfg_clockid)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GATE_FLAGS, p->tcfg_flags)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_PRIORITY, p->tcfg_priority)) goto nla_put_failure; entry_list = nla_nest_start_noflag(skb, TCA_GATE_ENTRY_LIST); if (!entry_list) goto nla_put_failure; list_for_each_entry(entry, &p->entries, list) { if (dumping_entry(skb, entry) < 0) goto nla_put_failure; } nla_nest_end(skb, entry_list); tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) goto nla_put_failure; spin_unlock_bh(&gact->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&gact->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_gate *gact = to_gate(a); struct tcf_t *tm = &gact->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static size_t tcf_gate_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_gate)); } static void tcf_gate_entry_destructor(void *priv) { struct action_gate_entry *oe = priv; kfree(oe); } static int tcf_gate_get_entries(struct flow_action_entry *entry, const struct tc_action *act) { entry->gate.entries = tcf_gate_get_list(act); if (!entry->gate.entries) return -EINVAL; entry->destructor = tcf_gate_entry_destructor; entry->destructor_priv = entry->gate.entries; return 0; } static int tcf_gate_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { int err; if (bind) { struct flow_action_entry *entry = entry_data; entry->id = FLOW_ACTION_GATE; entry->gate.prio = tcf_gate_prio(act); entry->gate.basetime = tcf_gate_basetime(act); entry->gate.cycletime = tcf_gate_cycletime(act); entry->gate.cycletimeext = tcf_gate_cycletimeext(act); entry->gate.num_entries = tcf_gate_num_entries(act); err = tcf_gate_get_entries(entry, act); if (err) return err; *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; fl_action->id = FLOW_ACTION_GATE; } return 0; } static struct tc_action_ops act_gate_ops = { .kind = "gate", .id = TCA_ID_GATE, .owner = THIS_MODULE, .act = tcf_gate_act, .dump = tcf_gate_dump, .init = tcf_gate_init, .cleanup = tcf_gate_cleanup, .stats_update = tcf_gate_stats_update, .get_fill_size = tcf_gate_get_fill_size, .offload_act_setup = tcf_gate_offload_act_setup, .size = sizeof(struct tcf_gate), }; MODULE_ALIAS_NET_ACT("gate"); static __net_init int gate_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); return tc_action_net_init(net, tn, &act_gate_ops); } static void __net_exit gate_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_gate_ops.net_id); } static struct pernet_operations gate_net_ops = { .init = gate_init_net, .exit_batch = gate_exit_net, .id = &act_gate_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init gate_init_module(void) { return tcf_register_action(&act_gate_ops, &gate_net_ops); } static void __exit gate_cleanup_module(void) { tcf_unregister_action(&act_gate_ops, &gate_net_ops); } module_init(gate_init_module); module_exit(gate_cleanup_module); MODULE_DESCRIPTION("TC gate action"); MODULE_LICENSE("GPL v2");
40 4 4 44 44 44 44 44 44 44 33 33 43 36 36 36 26 1 1 17 5 7 7 7 7 5 7 7 24 1 23 19 19 4 4 4 4 5 2 3 1 1 1 1 1 1 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_prio.c Simple 3-band priority "scheduler". * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>: * Init -- EINVAL when opt undefined */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> struct prio_sched_data { int bands; struct tcf_proto __rcu *filter_list; struct tcf_block *block; u8 prio2band[TC_PRIO_MAX+1]; struct Qdisc *queues[TCQ_PRIO_BANDS]; }; static struct Qdisc * prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct prio_sched_data *q = qdisc_priv(sch); u32 band = skb->priority; struct tcf_result res; struct tcf_proto *fl; int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif if (!fl || err < 0) { if (TC_H_MAJ(band)) band = 0; return q->queues[q->prio2band[band & TC_PRIO_MAX]]; } band = res.classid; } band = TC_H_MIN(band) - 1; if (band >= q->bands) return q->queues[q->prio2band[0]]; return q->queues[band]; } static int prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct Qdisc *qdisc; int ret; qdisc = prio_classify(skb, sch, &ret); #ifdef CONFIG_NET_CLS_ACT if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } #endif ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); return ret; } static struct sk_buff *prio_peek(struct Qdisc *sch) { struct prio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < q->bands; prio++) { struct Qdisc *qdisc = q->queues[prio]; struct sk_buff *skb = qdisc->ops->peek(qdisc); if (skb) return skb; } return NULL; } static struct sk_buff *prio_dequeue(struct Qdisc *sch) { struct prio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < q->bands; prio++) { struct Qdisc *qdisc = q->queues[prio]; struct sk_buff *skb = qdisc_dequeue_peeked(qdisc); if (skb) { qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } } return NULL; } static void prio_reset(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); for (prio = 0; prio < q->bands; prio++) qdisc_reset(q->queues[prio]); } static int prio_offload(struct Qdisc *sch, struct tc_prio_qopt *qopt) { struct net_device *dev = qdisc_dev(sch); struct tc_prio_qopt_offload opt = { .handle = sch->handle, .parent = sch->parent, }; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; if (qopt) { opt.command = TC_PRIO_REPLACE; opt.replace_params.bands = qopt->bands; memcpy(&opt.replace_params.priomap, qopt->priomap, TC_PRIO_MAX + 1); opt.replace_params.qstats = &sch->qstats; } else { opt.command = TC_PRIO_DESTROY; } return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, &opt); } static void prio_destroy(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); prio_offload(sch, NULL); for (prio = 0; prio < q->bands; prio++) qdisc_put(q->queues[prio]); } static int prio_tune(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); struct Qdisc *queues[TCQ_PRIO_BANDS]; int oldbands = q->bands, i; struct tc_prio_qopt *qopt; if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < TCQ_MIN_PRIO_BANDS) return -EINVAL; for (i = 0; i <= TC_PRIO_MAX; i++) { if (qopt->priomap[i] >= qopt->bands) return -EINVAL; } /* Before commit, make sure we can allocate all new qdiscs */ for (i = oldbands; i < qopt->bands; i++) { queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, i + 1), extack); if (!queues[i]) { while (i > oldbands) qdisc_put(queues[--i]); return -ENOMEM; } } prio_offload(sch, qopt); sch_tree_lock(sch); q->bands = qopt->bands; memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); for (i = q->bands; i < oldbands; i++) qdisc_purge_queue(q->queues[i]); for (i = oldbands; i < q->bands; i++) { q->queues[i] = queues[i]; if (q->queues[i] != &noop_qdisc) qdisc_hash_add(q->queues[i], true); } sch_tree_unlock(sch); for (i = q->bands; i < oldbands; i++) qdisc_put(q->queues[i]); return 0; } static int prio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); int err; if (!opt) return -EINVAL; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; return prio_tune(sch, opt, extack); } static int prio_dump_offload(struct Qdisc *sch) { struct tc_prio_qopt_offload hw_stats = { .command = TC_PRIO_STATS, .handle = sch->handle, .parent = sch->parent, { .stats = { .bstats = &sch->bstats, .qstats = &sch->qstats, }, }, }; return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_PRIO, &hw_stats); } static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct prio_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_prio_qopt opt; int err; opt.bands = q->bands; memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1); err = prio_dump_offload(sch); if (err) goto nla_put_failure; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); struct tc_prio_qopt_offload graft_offload; unsigned long band = arg - 1; if (!new) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, arg), extack); if (!new) new = &noop_qdisc; else qdisc_hash_add(new, true); } *old = qdisc_replace(sch, new, &q->queues[band]); graft_offload.handle = sch->handle; graft_offload.parent = sch->parent; graft_offload.graft_params.band = band; graft_offload.graft_params.child_handle = new->handle; graft_offload.command = TC_PRIO_GRAFT; qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old, TC_SETUP_QDISC_PRIO, &graft_offload, extack); return 0; } static struct Qdisc * prio_leaf(struct Qdisc *sch, unsigned long arg) { struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; return q->queues[band]; } static unsigned long prio_find(struct Qdisc *sch, u32 classid) { struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = TC_H_MIN(classid); if (band - 1 >= q->bands) return 0; return band; } static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return prio_find(sch, classid); } static void prio_unbind(struct Qdisc *q, unsigned long cl) { } static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct prio_sched_data *q = qdisc_priv(sch); tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = q->queues[cl-1]->handle; return 0; } static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct prio_sched_data *q = qdisc_priv(sch); struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; return 0; } static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct prio_sched_data *q = qdisc_priv(sch); int prio; if (arg->stop) return; for (prio = 0; prio < q->bands; prio++) { if (!tc_qdisc_stats_dump(sch, prio + 1, arg)) break; } } static struct tcf_block *prio_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static const struct Qdisc_class_ops prio_class_ops = { .graft = prio_graft, .leaf = prio_leaf, .find = prio_find, .walk = prio_walk, .tcf_block = prio_tcf_block, .bind_tcf = prio_bind, .unbind_tcf = prio_unbind, .dump = prio_dump_class, .dump_stats = prio_dump_class_stats, }; static struct Qdisc_ops prio_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &prio_class_ops, .id = "prio", .priv_size = sizeof(struct prio_sched_data), .enqueue = prio_enqueue, .dequeue = prio_dequeue, .peek = prio_peek, .init = prio_init, .reset = prio_reset, .destroy = prio_destroy, .change = prio_tune, .dump = prio_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("prio"); static int __init prio_module_init(void) { return register_qdisc(&prio_qdisc_ops); } static void __exit prio_module_exit(void) { unregister_qdisc(&prio_qdisc_ops); } module_init(prio_module_init) module_exit(prio_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Simple 3-band priority qdisc");
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 /* SPDX-License-Identifier: GPL-2.0 */ /* * x86 TSC related functions */ #ifndef _ASM_X86_TSC_H #define _ASM_X86_TSC_H #include <asm/asm.h> #include <asm/cpufeature.h> #include <asm/processor.h> #include <asm/msr.h> /** * rdtsc() - returns the current TSC without ordering constraints * * rdtsc() returns the result of RDTSC as a 64-bit integer. The * only ordering constraint it supplies is the ordering implied by * "asm volatile": it will put the RDTSC in the place you expect. The * CPU can and will speculatively execute that RDTSC, though, so the * results can be non-monotonic if compared on different CPUs. */ static __always_inline u64 rdtsc(void) { EAX_EDX_DECLARE_ARGS(val, low, high); asm volatile("rdtsc" : EAX_EDX_RET(val, low, high)); return EAX_EDX_VAL(val, low, high); } /** * rdtsc_ordered() - read the current TSC in program order * * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. * It is ordered like a load to a global in-memory counter. It should * be impossible to observe non-monotonic rdtsc_unordered() behavior * across multiple CPUs as long as the TSC is synced. */ static __always_inline u64 rdtsc_ordered(void) { EAX_EDX_DECLARE_ARGS(val, low, high); /* * The RDTSC instruction is not ordered relative to memory * access. The Intel SDM and the AMD APM are both vague on this * point, but empirically an RDTSC instruction can be * speculatively executed before prior loads. An RDTSC * immediately after an appropriate barrier appears to be * ordered as a normal load, that is, it provides the same * ordering guarantees as reading from a global memory location * that some other imaginary CPU is updating continuously with a * time stamp. * * Thus, use the preferred barrier on the respective CPU, aiming for * RDTSCP as the default. */ asm volatile(ALTERNATIVE_2("rdtsc", "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC, "rdtscp", X86_FEATURE_RDTSCP) : EAX_EDX_RET(val, low, high) /* RDTSCP clobbers ECX with MSR_TSC_AUX. */ :: "ecx"); return EAX_EDX_VAL(val, low, high); } /* * Standard way to access the cycle counter. */ typedef unsigned long long cycles_t; extern unsigned int cpu_khz; extern unsigned int tsc_khz; extern void disable_TSC(void); static inline cycles_t get_cycles(void) { if (!IS_ENABLED(CONFIG_X86_TSC) && !cpu_feature_enabled(X86_FEATURE_TSC)) return 0; return rdtsc(); } #define get_cycles get_cycles extern void tsc_early_init(void); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern void mark_tsc_async_resets(char *reason); extern unsigned long native_calibrate_cpu_early(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); extern int tsc_clocksource_reliable; #ifdef CONFIG_X86_TSC extern bool tsc_async_resets; #else # define tsc_async_resets false #endif /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: */ #ifdef CONFIG_X86_TSC extern bool tsc_store_and_check_tsc_adjust(bool bootcpu); extern void tsc_verify_tsc_adjust(bool resume); extern void check_tsc_sync_target(void); #else static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; } static inline void tsc_verify_tsc_adjust(bool resume) { } static inline void check_tsc_sync_target(void) { } #endif extern int notsc_setup(char *); extern void tsc_save_sched_clock_state(void); extern void tsc_restore_sched_clock_state(void); unsigned long cpu_khz_from_msr(void); #endif /* _ASM_X86_TSC_H */
17 13 4 2 6 9 9 2 6 1 1 3 3 15 15 15 15 15 15 15 14 15 14 1 19 19 1 3 9 4 2 2 11 21 1 1 3 11 5 2 10 6 6 3 2 1 3 3 1 2 1 2 1 3 3 6 1 5 54 54 1 53 1 49 42 6 46 2 3 27 18 2 4 18 32 12 26 10 5 22 23 3 26 19 8 21 4 26 8 4 20 15 4 6 6 26 26 9 9 7 7 1 1 2 1 4 1 4 3 1 4 1 3 392 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 // SPDX-License-Identifier: GPL-2.0-or-later /* L2TPv3 IP encapsulation support for IPv6 * * Copyright (c) 2012 Katalix Systems Ltd */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/icmp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/socket.h> #include <linux/l2tp.h> #include <linux/in.h> #include <linux/in6.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/udp.h> #include <net/inet_common.h> #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include "l2tp_core.h" /* per-net private data for this module */ static unsigned int l2tp_ip6_net_id; struct l2tp_ip6_net { rwlock_t l2tp_ip6_lock; struct hlist_head l2tp_ip6_table; struct hlist_head l2tp_ip6_bind_table; }; struct l2tp_ip6_sock { /* inet_sock has to be the first member of l2tp_ip6_sock */ struct inet_sock inet; u32 conn_id; u32 peer_conn_id; struct ipv6_pinfo inet6; }; static struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk) { return (struct l2tp_ip6_sock *)sk; } static struct l2tp_ip6_net *l2tp_ip6_pernet(const struct net *net) { return net_generic(net, l2tp_ip6_net_id); } static struct sock *__l2tp_ip6_bind_lookup(const struct net *net, const struct in6_addr *laddr, const struct in6_addr *raddr, int dif, u32 tunnel_id) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net); struct sock *sk; sk_for_each_bound(sk, &pn->l2tp_ip6_bind_table) { const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk); const struct in6_addr *sk_raddr = &sk->sk_v6_daddr; const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); int bound_dev_if; if (!net_eq(sock_net(sk), net)) continue; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && dif && bound_dev_if != dif) continue; if (sk_laddr && !ipv6_addr_any(sk_laddr) && !ipv6_addr_any(laddr) && !ipv6_addr_equal(sk_laddr, laddr)) continue; if (!ipv6_addr_any(sk_raddr) && raddr && !ipv6_addr_any(raddr) && !ipv6_addr_equal(sk_raddr, raddr)) continue; if (l2tp->conn_id != tunnel_id) continue; goto found; } sk = NULL; found: return sk; } /* When processing receive frames, there are two cases to * consider. Data frames consist of a non-zero session-id and an * optional cookie. Control frames consist of a regular L2TP header * preceded by 32-bits of zeros. * * L2TPv3 Session Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cookie (optional, maximum 64 bits)... * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * L2TPv3 Control Message Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | (32 bits of zeros) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |T|L|x|x|S|x|x|x|x|x|x|x| Ver | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Control Connection ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ns | Nr | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * All control frames are passed to userspace. */ static int l2tp_ip6_recv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct l2tp_ip6_net *pn; struct sock *sk; u32 session_id; u32 tunnel_id; unsigned char *ptr, *optr; struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; struct ipv6hdr *iph; pn = l2tp_ip6_pernet(net); if (!pskb_may_pull(skb, 4)) goto discard; /* Point to L2TP header */ optr = skb->data; ptr = skb->data; session_id = ntohl(*((__be32 *)ptr)); ptr += 4; /* RFC3931: L2TP/IP packets have the first 4 bytes containing * the session_id. If it is 0, the packet is a L2TP control * frame and the session_id value can be discarded. */ if (session_id == 0) { __skb_pull(skb, 4); goto pass_up; } /* Ok, this is a data packet. Lookup the session. */ session = l2tp_v3_session_get(net, NULL, session_id); if (!session) goto discard; tunnel = session->tunnel; if (!tunnel) goto discard_sess; if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) goto discard_sess; l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_put(session); return 0; pass_up: /* Get the tunnel_id from the L2TP header */ if (!pskb_may_pull(skb, 12)) goto discard; if ((skb->data[0] & 0xc0) != 0xc0) goto discard; tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = ipv6_hdr(skb); read_lock_bh(&pn->l2tp_ip6_lock); sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr, inet6_iif(skb), tunnel_id); if (!sk) { read_unlock_bh(&pn->l2tp_ip6_lock); goto discard; } sock_hold(sk); read_unlock_bh(&pn->l2tp_ip6_lock); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); discard_sess: l2tp_session_put(session); goto discard; discard_put: sock_put(sk); discard: kfree_skb(skb); return 0; } static int l2tp_ip6_hash(struct sock *sk) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); if (sk_unhashed(sk)) { write_lock_bh(&pn->l2tp_ip6_lock); sk_add_node(sk, &pn->l2tp_ip6_table); write_unlock_bh(&pn->l2tp_ip6_lock); } return 0; } static void l2tp_ip6_unhash(struct sock *sk) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); if (sk_unhashed(sk)) return; write_lock_bh(&pn->l2tp_ip6_lock); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); } static int l2tp_ip6_open(struct sock *sk) { /* Prevent autobind. We don't have ports. */ inet_sk(sk)->inet_num = IPPROTO_L2TP; l2tp_ip6_hash(sk); return 0; } static void l2tp_ip6_close(struct sock *sk, long timeout) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); write_lock_bh(&pn->l2tp_ip6_lock); hlist_del_init(&sk->sk_bind_node); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); sk_common_release(sk); } static void l2tp_ip6_destroy_sock(struct sock *sk) { struct l2tp_tunnel *tunnel; lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); tunnel = l2tp_sk_to_tunnel(sk); if (tunnel) { l2tp_tunnel_delete(tunnel); l2tp_tunnel_put(tunnel); } } static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *)uaddr; struct net *net = sock_net(sk); struct l2tp_ip6_net *pn; __be32 v4addr = 0; int bound_dev_if; int addr_type; int err; pn = l2tp_ip6_pernet(net); if (addr->l2tp_family != AF_INET6) return -EINVAL; if (addr_len < sizeof(*addr)) return -EINVAL; addr_type = ipv6_addr_type(&addr->l2tp_addr); /* l2tp_ip6 sockets are IPv6 only */ if (addr_type == IPV6_ADDR_MAPPED) return -EADDRNOTAVAIL; /* L2TP is point-point, not multicast */ if (addr_type & IPV6_ADDR_MULTICAST) return -EADDRNOTAVAIL; lock_sock(sk); err = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out_unlock; if (sk->sk_state != TCP_CLOSE) goto out_unlock; bound_dev_if = sk->sk_bound_dev_if; /* Check if the address belongs to the host. */ rcu_read_lock(); if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; if (addr_type & IPV6_ADDR_LINKLOCAL) { if (addr->l2tp_scope_id) bound_dev_if = addr->l2tp_scope_id; /* Binding to link-local address requires an * interface. */ if (!bound_dev_if) goto out_unlock_rcu; err = -ENODEV; dev = dev_get_by_index_rcu(sock_net(sk), bound_dev_if); if (!dev) goto out_unlock_rcu; } /* ipv4 addr of the socket is invalid. Only the * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; err = -EADDRNOTAVAIL; if (!ipv6_chk_addr(sock_net(sk), &addr->l2tp_addr, dev, 0)) goto out_unlock_rcu; } rcu_read_unlock(); write_lock_bh(&pn->l2tp_ip6_lock); if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, NULL, bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&pn->l2tp_ip6_lock); err = -EADDRINUSE; goto out_unlock; } inet->inet_saddr = v4addr; inet->inet_rcv_saddr = v4addr; sk->sk_bound_dev_if = bound_dev_if; sk->sk_v6_rcv_saddr = addr->l2tp_addr; np->saddr = addr->l2tp_addr; l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id; sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); sock_reset_flag(sk, SOCK_ZAPPED); release_sock(sk); return 0; out_unlock_rcu: rcu_read_unlock(); out_unlock: release_sock(sk); return err; } static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; struct in6_addr *daddr; int addr_type; int rc; struct l2tp_ip6_net *pn; if (addr_len < sizeof(*lsa)) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EINVAL; addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -EINVAL; if (addr_type & IPV6_ADDR_MAPPED) { daddr = &usin->sin6_addr; if (ipv4_is_multicast(daddr->s6_addr32[3])) return -EINVAL; } lock_sock(sk); /* Must bind first - autobinding does not work */ if (sock_flag(sk, SOCK_ZAPPED)) { rc = -EINVAL; goto out_sk; } rc = __ip6_datagram_connect(sk, uaddr, addr_len); if (rc < 0) goto out_sk; l2tp_ip6_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; pn = l2tp_ip6_pernet(sock_net(sk)); write_lock_bh(&pn->l2tp_ip6_lock); hlist_del_init(&sk->sk_bind_node); sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table); write_unlock_bh(&pn->l2tp_ip6_lock); out_sk: release_sock(sk); return rc; } static int l2tp_ip6_disconnect(struct sock *sk, int flags) { if (sock_flag(sk, SOCK_ZAPPED)) return 0; return __udp_disconnect(sk, flags); } static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; struct sock *sk = sock->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct l2tp_ip6_sock *lsk = l2tp_ip6_sk(sk); lsa->l2tp_family = AF_INET6; lsa->l2tp_flowinfo = 0; lsa->l2tp_scope_id = 0; lsa->l2tp_unused = 0; if (peer) { if (!lsk->peer_conn_id) return -ENOTCONN; lsa->l2tp_conn_id = lsk->peer_conn_id; lsa->l2tp_addr = sk->sk_v6_daddr; if (inet6_test_bit(SNDFLOW, sk)) lsa->l2tp_flowinfo = np->flow_label; } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) lsa->l2tp_addr = np->saddr; else lsa->l2tp_addr = sk->sk_v6_rcv_saddr; lsa->l2tp_conn_id = lsk->conn_id; } if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) lsa->l2tp_scope_id = READ_ONCE(sk->sk_bound_dev_if); return sizeof(*lsa); } static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; /* Charge it to the socket, dropping if the queue is full. */ rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) goto drop; return 0; drop: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); return -1; } static int l2tp_ip6_push_pending_frames(struct sock *sk) { struct sk_buff *skb; __be32 *transhdr = NULL; int err = 0; skb = skb_peek(&sk->sk_write_queue); if (!skb) goto out; transhdr = (__be32 *)skb_transport_header(skb); *transhdr = 0; err = ip6_push_pending_frames(sk); out: return err; } /* Userspace will call sendmsg() on the tunnel socket to send L2TP * control frames. */ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct flowi6 fl6; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int transhdrlen = 4; /* zero session-id */ int ulen; int err; /* Rough check on arithmetic overflow, * better check is made in ip6_append_data(). */ if (len > INT_MAX - transhdrlen) return -EMSGSIZE; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* Get and verify the address */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk_uid(sk); ipcm6_init_sk(&ipc6, sk); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (lsa->l2tp_family && lsa->l2tp_family != AF_INET6) return -EAFNOSUPPORT; daddr = &lsa->l2tp_addr; if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK; if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } } /* Otherwise it will be difficult to maintain * sk->sk_dst_cache. */ if (sk->sk_state == TCP_ESTABLISHED && ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && lsa->l2tp_scope_id && ipv6_addr_type(daddr) & IPV6_ADDR_LINKLOCAL) fl6.flowi6_oif = lsa->l2tp_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; fl6.flowlabel = np->flow_label; } if (fl6.flowi6_oif == 0) fl6.flowi6_oif = READ_ONCE(sk->sk_bound_dev_if); if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; } if ((fl6.flowlabel & IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen | opt->opt_flen)) opt = NULL; } if (!opt) { opt = txopt_get(np); opt_to_free = opt; } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; if (!ipv6_addr_any(daddr)) fl6.daddr = *daddr; else fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; final_p = fl6_update_dst(&fl6, opt, &final); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: lock_sock(sk); ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0); err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) err = l2tp_ip6_push_pending_frames(sk); release_sock(sk); done: dst_release(dst); out: fl6_sock_release(flowlabel); txopt_put(opt_to_free); return err < 0 ? err : len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ if (lsa) { lsa->l2tp_family = AF_INET6; lsa->l2tp_unused = 0; lsa->l2tp_addr = ipv6_hdr(skb)->saddr; lsa->l2tp_flowinfo = 0; lsa->l2tp_scope_id = 0; lsa->l2tp_conn_id = 0; if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) lsa->l2tp_scope_id = inet6_iif(skb); *addr_len = sizeof(*lsa); } if (np->rxopt.all) ip6_datagram_recv_ctl(sk, msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: return err ? err : copied; } static struct proto l2tp_ip6_prot = { .name = "L2TP/IPv6", .owner = THIS_MODULE, .init = l2tp_ip6_open, .close = l2tp_ip6_close, .bind = l2tp_ip6_bind, .connect = l2tp_ip6_connect, .disconnect = l2tp_ip6_disconnect, .ioctl = l2tp_ioctl, .destroy = l2tp_ip6_destroy_sock, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .sendmsg = l2tp_ip6_sendmsg, .recvmsg = l2tp_ip6_recvmsg, .backlog_rcv = l2tp_ip6_backlog_recv, .hash = l2tp_ip6_hash, .unhash = l2tp_ip6_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), .ipv6_pinfo_offset = offsetof(struct l2tp_ip6_sock, inet6), }; static const struct proto_ops l2tp_ip6_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip6_getname, .poll = datagram_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static struct inet_protosw l2tp_ip6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_L2TP, .prot = &l2tp_ip6_prot, .ops = &l2tp_ip6_ops, }; static struct inet6_protocol l2tp_ip6_protocol __read_mostly = { .handler = l2tp_ip6_recv, }; static __net_init int l2tp_ip6_init_net(struct net *net) { struct l2tp_ip6_net *pn = net_generic(net, l2tp_ip6_net_id); rwlock_init(&pn->l2tp_ip6_lock); INIT_HLIST_HEAD(&pn->l2tp_ip6_table); INIT_HLIST_HEAD(&pn->l2tp_ip6_bind_table); return 0; } static __net_exit void l2tp_ip6_exit_net(struct net *net) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net); write_lock_bh(&pn->l2tp_ip6_lock); WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_table) != 0); WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_bind_table) != 0); write_unlock_bh(&pn->l2tp_ip6_lock); } static struct pernet_operations l2tp_ip6_net_ops = { .init = l2tp_ip6_init_net, .exit = l2tp_ip6_exit_net, .id = &l2tp_ip6_net_id, .size = sizeof(struct l2tp_ip6_net), }; static int __init l2tp_ip6_init(void) { int err; pr_info("L2TP IP encapsulation support for IPv6 (L2TPv3)\n"); err = register_pernet_device(&l2tp_ip6_net_ops); if (err) goto out; err = proto_register(&l2tp_ip6_prot, 1); if (err != 0) goto out1; err = inet6_add_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP); if (err) goto out2; inet6_register_protosw(&l2tp_ip6_protosw); return 0; out2: proto_unregister(&l2tp_ip6_prot); out1: unregister_pernet_device(&l2tp_ip6_net_ops); out: return err; } static void __exit l2tp_ip6_exit(void) { inet6_unregister_protosw(&l2tp_ip6_protosw); inet6_del_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP); proto_unregister(&l2tp_ip6_prot); unregister_pernet_device(&l2tp_ip6_net_ops); } module_init(l2tp_ip6_init); module_exit(l2tp_ip6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Chris Elston <celston@katalix.com>"); MODULE_DESCRIPTION("L2TP IP encapsulation for IPv6"); MODULE_VERSION("1.0"); /* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol, * because __stringify doesn't like enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 115, 2); MODULE_ALIAS_NET_PF_PROTO(PF_INET6, 115);
6 55 25 1 70 41 29 70 16 14 2 65 65 65 27 27 15 5 3 2 2 7 18 1 7 19 8 6 2 6 1 1 6 1 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/xfrm.h> #include <net/busy_poll.h> #include <net/rstreason.h> #include <net/psp.h> static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) return true; if (after(end_seq, s_win) && before(seq, e_win)) return true; return seq == e_win && seq == end_seq; } static enum tcp_tw_status tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, const struct sk_buff *skb, int mib_idx) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx, &tcptw->tw_last_oow_ack_time)) { /* Send ACK. Note, we do not put the bucket, * it will be released by caller. */ return TCP_TW_ACK_OOW; } /* We are rate-limiting, so just release the tw sock and drop skb. */ inet_twsk_put(tw); return TCP_TW_SUCCESS; } static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq, u32 rcv_nxt) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao; ao = rcu_dereference(tcptw->ao_info); if (unlikely(ao && seq < rcv_nxt)) WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1); #endif WRITE_ONCE(tcptw->tw_rcv_nxt, seq); } /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN * (and, probably, tail of data) and one or more our ACKs are lost. * * What is TIME-WAIT timeout? It is associated with maximal packet * lifetime in the internet, which results in wrong conclusion, that * it is set to catch "old duplicate segments" wandering out of their path. * It is not quite correct. This timeout is calculated so that it exceeds * maximal retransmission timeout enough to allow to lose one (or more) * segments sent by peer and our ACKs. This time may be calculated from RTO. * * When TIME-WAIT socket receives RST, it means that another end * finally closed and we are allowed to kill TIME-WAIT too. * * Second purpose of TIME-WAIT is catching old duplicate segments. * Well, certainly it is pure paranoia, but if we load TIME-WAIT * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. * * If we invented some more clever way to catch duplicates * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. * * The algorithm below is based on FORMAL INTERPRETATION of RFCs. * When you compare it to RFCs, please, read section SEGMENT ARRIVES * from the very beginning. * * NOTE. With recycling (and later with fin-wait-2) TW bucket * is _not_ stateless. It means, that strictly speaking we must * spinlock it. I do not want! Well, probability of misbehaviour * is ridiculously low and, seems, we could use some mb() tricks * to avoid misread sequence numbers, states etc. --ANK * * We don't need to initialize tmp_out.sack_ok as we don't use the results */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, u32 *tw_isn, enum skb_drop_reason *drop_reason) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); struct tcp_options_received tmp_opt; enum skb_drop_reason psp_drop; bool paws_reject = false; int ts_recent_stamp; /* Instead of dropping immediately, wait to see what value is * returned. We will accept a non psp-encapsulated syn in the * case where TCP_TW_SYN is returned. */ psp_drop = psp_twsk_rx_policy_check(tw, skb); tmp_opt.saw_tstamp = 0; ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; tmp_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); tmp_opt.ts_recent_stamp = ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ if (psp_drop) goto out_put; /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, rcv_nxt, rcv_nxt + tcptw->tw_rcv_wnd)) return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); if (th->rst) goto kill; if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt)) return TCP_TW_RST; /* Dup ACK? */ if (!th->ack || !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* New data or FIN. If new data arrive after half-duplex close, * reset. */ if (!th->fin || TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1) return TCP_TW_RST; /* FIN arrived, enter true time-wait state. */ WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT); twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq, rcv_nxt); if (tmp_opt.saw_tstamp) { u64 ts = tcp_clock_ms(); WRITE_ONCE(tw->tw_entry_stamp, ts); WRITE_ONCE(tcptw->tw_ts_recent_stamp, div_u64(ts, MSEC_PER_SEC)); WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); } inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } /* * Now real TIME-WAIT state. * * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: * * (1) assigns its initial sequence number for the new * connection to be larger than the largest sequence * number it used on the previous connection incarnation, * and * * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ if (!paws_reject && (TCP_SKB_CB(skb)->seq == rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ if (psp_drop) goto out_put; if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } } else { inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); } if (tmp_opt.saw_tstamp) { WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); WRITE_ONCE(tcptw->tw_ts_recent_stamp, ktime_get_seconds()); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* Out of window segment. All the segments are ACKed immediately. The only exception is new SYN. We accept it, if it is not old duplicate and we are not in danger to be killed by delayed old duplicates. RFC check is that it has newer sequence number works at rates <40Mbit/sec. However, if paws works, it is reliable AND even more, we even may relax silly seq space cutoff. RED-PEN: we violate main RFC requirement, if this SYN will appear old duplicate (i.e. we receive RST in reply to SYN-ACK), we must return socket to time-wait state. It is not good, but not fatal yet. */ if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, rcv_nxt) || (tmp_opt.saw_tstamp && (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) { u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; *tw_isn = isn; return TCP_TW_SYN; } if (psp_drop) goto out_put; if (paws_reject) { *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); } if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. * * If it is ACKless SYN it may be both old duplicate * and new good SYN with random sequence number <rcv_nxt. * Do not reschedule in the last case. */ if (paws_reject || th->ack) inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } out_put: inet_twsk_put(tw); return TCP_TW_SUCCESS; } EXPORT_IPV6_MOD(tcp_timewait_state_process); static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) { #ifdef CONFIG_TCP_MD5SIG const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; /* * The timewait bucket does not have the key DB from the * sock structure. We just make a quick copy of the * md5 key being used (if indeed we are using one) * so the timewait ack generating code has the key. */ tcptw->tw_md5_key = NULL; if (!static_branch_unlikely(&tcp_md5_needed.key)) return; key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); if (!tcptw->tw_md5_key) return; if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) goto out_free; tcp_md5_add_sigpool(); } return; out_free: WARN_ON_ONCE(1); kfree(tcptw->tw_md5_key); tcptw->tw_md5_key = NULL; #endif } /* * Move a socket to time-wait or dead fin-wait-2 state. */ void tcp_time_wait(struct sock *sk, int state, int timeo) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_timewait_sock *tw; tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); tw->tw_mark = sk->sk_mark; tw->tw_priority = READ_ONCE(sk->sk_priority); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; /* refreshed when we enter true TIME-WAIT state */ tw->tw_entry_stamp = tcp_time_stamp_ms(tp); tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tw->tw_usec_ts = tp->tcp_usec_ts; tcptw->tw_last_oow_ack_time = 0; tcptw->tw_tx_delay = tp->tcp_tx_delay; tw->tw_txhash = sk->sk_txhash; tw->tw_tx_queue_mapping = sk->sk_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING tw->tw_rx_queue_mapping = sk->sk_rx_queue_mapping; #endif #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); tw->tw_ipv6only = sk->sk_ipv6only; } #endif tcp_time_wait_init(sk, tcptw); tcp_ao_time_wait(tcptw, tp); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; /* Linkage updates. * Note that access to tw after this point is illegal. */ inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); tcp_done(sk); } EXPORT_SYMBOL(tcp_time_wait); void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); if (twsk->tw_md5_key) { kfree(twsk->tw_md5_key); static_branch_slow_dec_deferred(&tcp_md5_needed); tcp_md5_release_sigpool(); } } #endif tcp_ao_destroy_sock(sk, true); psp_twsk_assoc_free(inet_twsk(sk)); } void tcp_twsk_purge(struct list_head *net_exit_list) { bool purged_once = false; struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { if (net->ipv4.tcp_death_row.hashinfo->pernet) { /* Even if tw_refcount == 1, we must clean up kernel reqsk */ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo); } else if (!purged_once) { inet_twsk_purge(&tcp_hashinfo); purged_once = true; } } } /* Warning : This function is called without sk_listener being locked. * Be sure to read socket fields once, as their value could change under us. */ void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk_listener); int full_space = tcp_full_space(sk_listener); u32 window_clamp; __u8 rcv_wscale; u32 rcv_wnd; int mss; mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); else if (full_space < rcv_wnd * mss) full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, rcv_wnd); ireq->rcv_wscale = rcv_wscale; } static void tcp_ecn_openreq_child(struct sock *sk, const struct request_sock *req, const struct sk_buff *skb) { const struct tcp_request_sock *treq = tcp_rsk(req); struct tcp_sock *tp = tcp_sk(sk); if (treq->accecn_ok) { tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); tp->saw_accecn_opt = treq->saw_accecn_opt; tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? TCP_ECN_MODE_RFC3168 : TCP_ECN_DISABLED); } } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; if (ca_key != TCP_CA_UNSPEC) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; } rcu_read_unlock(); } /* If no valid choice made yet, assign current system default ca. */ if (!ca_got_dst && (!icsk->icsk_ca_setsockopt || !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); } EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child); static void smc_check_reset_syn_req(const struct tcp_sock *oldtp, struct request_sock *req, struct tcp_sock *newtp) { #if IS_ENABLED(CONFIG_SMC) struct inet_request_sock *ireq; if (static_branch_unlikely(&tcp_have_smc)) { ireq = inet_rsk(req); if (oldtp->syn_smc && !ireq->smc_ok) newtp->syn_smc = 0; } #endif } /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * * Actually, we could lots of memory writes here. tp of listening * socket contains all necessary default parameters. */ struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk; const struct tcp_sock *oldtp; struct tcp_sock *newtp; u32 seq; if (!newsk) return NULL; newicsk = inet_csk(newsk); newtp = tcp_sk(newsk); oldtp = tcp_sk(sk); smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; seq = treq->rcv_isn + 1; newtp->rcv_wup = seq; WRITE_ONCE(newtp->copied_seq, seq); WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; seq = treq->snt_isn + 1; newtp->snd_sml = newtp->snd_una = seq; WRITE_ONCE(newtp->snd_nxt, seq); newtp->snd_up = seq; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); newicsk->icsk_ack.lrcvtime = tcp_jiffies32; newtp->lsndtime = tcp_jiffies32; newsk->sk_txhash = READ_ONCE(treq->txhash); newtp->total_retrans = req->num_retrans; tcp_init_xmit_timers(newsk); WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; } else { newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp, 65535U); } newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; newtp->max_window = newtp->snd_wnd; if (newtp->rx_opt.tstamp_ok) { newtp->tcp_usec_ts = treq->req_usec_ts; newtp->rx_opt.ts_recent = req->ts_recent; newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { newtp->tcp_usec_ts = 0; newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } if (req->num_timeout) { newtp->total_rto = req->num_timeout; newtp->undo_marker = treq->snt_isn; if (newtp->tcp_usec_ts) { newtp->retrans_stamp = treq->snt_synack; newtp->total_rto_time = (u32)(tcp_clock_us() - newtp->retrans_stamp) / USEC_PER_MSEC; } else { newtp->retrans_stamp = div_u64(treq->snt_synack, USEC_PER_SEC / TCP_TS_HZ); newtp->total_rto_time = tcp_clock_ms() - newtp->retrans_stamp; } newtp->total_rto_recoveries = 1; } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ #endif #ifdef CONFIG_TCP_AO newtp->ao_info = NULL; if (tcp_rsk_used_ao(req)) { struct tcp_ao_key *ao_key; ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1); if (ao_key) newtp->tcp_header_len += tcp_ao_len_aligned(ao_key); } #endif if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newsk, req, skb); newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); newtp->bpf_chg_cc_inprogress = 0; tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1); return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); /* * Process an incoming packet for SYN_RECV sockets represented as a * request_sock. Normally sk is the listener socket but for TFO it * points to the child socket. * * XXX (TFO) - The current impl contains a special check for ack * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? * * We don't need to initialize tmp_opt.sack_ok as we don't use the results * * Note: If @fastopen is true, this can be called from process context. * Otherwise, this is from BH context. */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *req_stolen, enum skb_drop_reason *drop_reason) { struct tcp_options_received tmp_opt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool tsecr_reject = false; bool paws_reject = false; bool own_req; tmp_opt.saw_tstamp = 0; tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; if (tmp_opt.rcv_tsecr) { if (inet_rsk(req)->tstamp_ok && !fastopen) tsecr_reject = !between(tmp_opt.rcv_tsecr, tcp_rsk(req)->snt_tsval_first, READ_ONCE(tcp_rsk(req)->snt_tsval_last)); tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; } /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. */ tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } /* Check for pure retransmitted SYN. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && flg == TCP_FLAG_SYN && !paws_reject) { /* * RFC793 draws (Incorrectly! It was fixed in RFC1122) * this case on figure 6 and figure 8, but formal * protocol description says NOTHING. * To be more exact, it says that we should send ACK, * because this segment (at least, if it has no data) * is out of window. * * CONCLUSION: RFC793 (even with RFC1122) DOES NOT * describe SYN-RECV state. All the description * is wrong, we cannot believe to it and should * rely only on common sense and implementation * experience. * * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. * * Note that even if there is new data in the SYN packet * they will be thrown away too. * * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery. */ if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time) && !tcp_rtx_synack(sk, req)) { unsigned long expires = jiffies; expires += reqsk_timeout(req, TCP_RTO_MAX); if (!fastopen) mod_timer_pending(&req->rsk_timer, expires); else req->rsk_timer.expires = expires; } return NULL; } /* Further reproduces section "SEGMENT ARRIVES" for state SYN-RECEIVED of RFC793. It is broken, however, it does not work only when SYNs are crossed. You would think that SYN crossing is impossible here, since we should have a SYN_SENT socket (from connect()) on our end, but this is not true if the crossed SYNs were sent to both ends by a malicious third party. We must defend against this, and to do that we first verify the ACK (as per RFC793, page 36) and reset if it is invalid. Is this a true full defense? To convince ourselves, let us consider a way in which the ACK test can still pass in this 'malicious crossed SYNs' case. Malicious sender sends identical SYNs (and thus identical sequence numbers) to both A and B: A: gets SYN, seq=7 B: gets SYN, seq=7 By our good fortune, both A and B select the same initial send sequence number of seven :-) A: sends SYN|ACK, seq=7, ack_seq=8 B: sends SYN|ACK, seq=7, ack_seq=8 So we are now A eating this SYN|ACK, ACK test passes. So does sequence test, SYN is truncated, and thus we consider it a bare ACK. If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this bare ACK. Otherwise, we create an established connection. Both ends (listening sockets) accept the new incoming connection and try to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. But generally, we should (RFC lies!) to accept ACK from SYNACK both here and in tcp_rcv_state_process(). tcp_rcv_state_process() does not, hence, we do not too. Note that the case is absolutely generic: we cannot optimize anything here without violating protocol. All the checks must be made before attempt to create socket. */ /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * * Invalid ACK: reset will be sent by listening socket. * Note that the ACK validity check for a Fast Open socket is done * elsewhere and is checked directly against the child socket rather * than req because user data may have been sent out. */ if ((flg & TCP_FLAG_ACK) && !fastopen && (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) return sk; /* RFC793: "first check sequence number". */ if (paws_reject || tsecr_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + tcp_synack_window(req))) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) { SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS); NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); } else if (tsecr_reject) { SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR); NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED); } else { SKB_DR_SET(*drop_reason, TCP_OVERWINDOW); } return NULL; } /* In sequence, PAWS is OK. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ flg &= ~TCP_FLAG_SYN; } /* RFC793: "second check the RST bit" and * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } /* ACK sequence verified above, just make sure ACK is * set. If ACK not set, just silently drop the packet. * * XXX (TFO) - if we ever allow "data after SYN", the * following check needs to be removed. */ if (!(flg & TCP_FLAG_ACK)) return NULL; if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); tcp_rsk(req)->saw_accecn_opt = saw_opt; if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; tcp_rsk(req)->accecn_fail_mode |= fail_mode; } } /* For Fast Open no more processing is needed (sk is the * child socket). */ if (fastopen) return sk; /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, req, &own_req); if (!child) goto listen_overflow; if (own_req && tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval; if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); return child; } sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); *req_stolen = !own_req; return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW); if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) { inet_rsk(req)->acked = 1; return NULL; } embryonic_reset: if (!(flg & TCP_FLAG_RST)) { /* Received a bad SYN pkt - for TFO We try not to reset * the local connection unless it's really necessary to * avoid becoming vulnerable to outside attack aiming at * resetting legit local connections. */ req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); tcp_reset(sk, skb); } if (!fastopen) { bool unlinked = inet_csk_reqsk_queue_drop(sk, req); if (unlinked) __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); *req_stolen = !unlinked; } return NULL; } EXPORT_IPV6_MOD(tcp_check_req); /* * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with * the new socket. * * For the vast majority of cases child->sk_state will be TCP_SYN_RECV * when entering. But other states are possible due to a race condition * where after __inet_lookup_established() fails but before the listener * locked is obtained, other packets cause the same connection to * be created. */ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb) __releases(&((child)->sk_lock.slock)) { enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; int state = child->sk_state; /* record sk_napi_id and sk_rx_queue_mapping of child. */ sk_mark_napi_id_set(child, skb); tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { reason = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening * socket does not protect us more. */ __sk_add_backlog(child, skb); } bh_unlock_sock(child); sock_put(child); return reason; } EXPORT_IPV6_MOD(tcp_child_process);
46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Dynamic loading of modules into the kernel. * * Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996 * Rewritten again by Rusty Russell, 2002 */ #ifndef _LINUX_MODULE_H #define _LINUX_MODULE_H #include <linux/list.h> #include <linux/stat.h> #include <linux/buildid.h> #include <linux/compiler.h> #include <linux/cache.h> #include <linux/cleanup.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/elf.h> #include <linux/stringify.h> #include <linux/kobject.h> #include <linux/moduleparam.h> #include <linux/jump_label.h> #include <linux/export.h> #include <linux/rbtree_latch.h> #include <linux/error-injection.h> #include <linux/tracepoint-defs.h> #include <linux/srcu.h> #include <linux/static_call_types.h> #include <linux/dynamic_debug.h> #include <linux/percpu.h> #include <asm/module.h> #define MODULE_NAME_LEN __MODULE_NAME_LEN struct modversion_info { unsigned long crc; char name[MODULE_NAME_LEN]; }; struct module; struct exception_table_entry; struct module_kobject { struct kobject kobj; struct module *mod; struct kobject *drivers_dir; struct module_param_attrs *mp; struct completion *kobj_completion; } __randomize_layout; struct module_attribute { struct attribute attr; ssize_t (*show)(const struct module_attribute *, struct module_kobject *, char *); ssize_t (*store)(const struct module_attribute *, struct module_kobject *, const char *, size_t count); void (*setup)(struct module *, const char *); int (*test)(struct module *); void (*free)(struct module *); }; struct module_version_attribute { struct module_attribute mattr; const char *module_name; const char *version; }; extern ssize_t __modver_version_show(const struct module_attribute *, struct module_kobject *, char *); extern const struct module_attribute module_uevent; /* These are either module local, or the kernel's dummy ones. */ extern int init_module(void); extern void cleanup_module(void); #ifndef MODULE /** * module_init() - driver initialization entry point * @x: function to be run at kernel boot time or module insertion * * module_init() will either be called during do_initcalls() (if * builtin) or at module insertion time (if a module). There can only * be one per module. */ #define module_init(x) __initcall(x); /** * module_exit() - driver exit entry point * @x: function to be run when driver is removed * * module_exit() will wrap the driver clean-up code * with cleanup_module() when used with rmmod when * the driver is a module. If the driver is statically * compiled into the kernel, module_exit() has no effect. * There can only be one per module. */ #define module_exit(x) __exitcall(x); #else /* MODULE */ /* * In most cases loadable modules do not need custom * initcall levels. There are still some valid cases where * a driver may be needed early if built in, and does not * matter when built as a loadable module. Like bus * snooping debug drivers. */ #define early_initcall(fn) module_init(fn) #define core_initcall(fn) module_init(fn) #define core_initcall_sync(fn) module_init(fn) #define postcore_initcall(fn) module_init(fn) #define postcore_initcall_sync(fn) module_init(fn) #define arch_initcall(fn) module_init(fn) #define subsys_initcall(fn) module_init(fn) #define subsys_initcall_sync(fn) module_init(fn) #define fs_initcall(fn) module_init(fn) #define fs_initcall_sync(fn) module_init(fn) #define rootfs_initcall(fn) module_init(fn) #define device_initcall(fn) module_init(fn) #define device_initcall_sync(fn) module_init(fn) #define late_initcall(fn) module_init(fn) #define late_initcall_sync(fn) module_init(fn) #define console_initcall(fn) module_init(fn) /* Each module must use one module_init(). */ #define module_init(initfn) \ static inline initcall_t __maybe_unused __inittest(void) \ { return initfn; } \ int init_module(void) __copy(initfn) \ __attribute__((alias(#initfn))); \ ___ADDRESSABLE(init_module, __initdata); /* This is only required if you want to be unloadable. */ #define module_exit(exitfn) \ static inline exitcall_t __maybe_unused __exittest(void) \ { return exitfn; } \ void cleanup_module(void) __copy(exitfn) \ __attribute__((alias(#exitfn))); \ ___ADDRESSABLE(cleanup_module, __exitdata); #endif /* This means "can be init if no module support, otherwise module load may call it." */ #ifdef CONFIG_MODULES #define __init_or_module #define __initdata_or_module #define __initconst_or_module #define __INIT_OR_MODULE .text #define __INITDATA_OR_MODULE .data #define __INITRODATA_OR_MODULE .section ".rodata","a",%progbits #else #define __init_or_module __init #define __initdata_or_module __initdata #define __initconst_or_module __initconst #define __INIT_OR_MODULE __INIT #define __INITDATA_OR_MODULE __INITDATA #define __INITRODATA_OR_MODULE __INITRODATA #endif /*CONFIG_MODULES*/ struct module_kobject *lookup_or_create_module_kobject(const char *name); /* For userspace: you can also call me... */ #define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias) /* Soft module dependencies. See man modprobe.d for details. * Example: MODULE_SOFTDEP("pre: module-foo module-bar post: module-baz") */ #define MODULE_SOFTDEP(_softdep) MODULE_INFO(softdep, _softdep) /* * Weak module dependencies. See man modprobe.d for details. * Example: MODULE_WEAKDEP("module-foo") */ #define MODULE_WEAKDEP(_weakdep) MODULE_INFO(weakdep, _weakdep) /* * MODULE_FILE is used for generating modules.builtin * So, make it no-op when this is being built as a module */ #ifdef MODULE #define MODULE_FILE #else #define MODULE_FILE MODULE_INFO(file, KBUILD_MODFILE); #endif /* * The following license idents are currently accepted as indicating free * software modules * * "GPL" [GNU Public License v2] * "GPL v2" [GNU Public License v2] * "GPL and additional rights" [GNU Public License v2 rights and more] * "Dual BSD/GPL" [GNU Public License v2 * or BSD license choice] * "Dual MIT/GPL" [GNU Public License v2 * or MIT license choice] * "Dual MPL/GPL" [GNU Public License v2 * or Mozilla license choice] * * The following other idents are available * * "Proprietary" [Non free products] * * Both "GPL v2" and "GPL" (the latter also in dual licensed strings) are * merely stating that the module is licensed under the GPL v2, but are not * telling whether "GPL v2 only" or "GPL v2 or later". The reason why there * are two variants is a historic and failed attempt to convey more * information in the MODULE_LICENSE string. For module loading the * "only/or later" distinction is completely irrelevant and does neither * replace the proper license identifiers in the corresponding source file * nor amends them in any way. The sole purpose is to make the * 'Proprietary' flagging work and to refuse to bind symbols which are * exported with EXPORT_SYMBOL_GPL when a non free module is loaded. * * In the same way "BSD" is not a clear license information. It merely * states, that the module is licensed under one of the compatible BSD * license variants. The detailed and correct license information is again * to be found in the corresponding source files. * * There are dual licensed components, but when running with Linux it is the * GPL that is relevant so this is a non issue. Similarly LGPL linked with GPL * is a GPL combined work. * * This exists for several reasons * 1. So modinfo can show license info for users wanting to vet their setup * is free * 2. So the community can ignore bug reports including proprietary modules * 3. So vendors can do likewise based on their own policies */ #define MODULE_LICENSE(_license) MODULE_FILE MODULE_INFO(license, _license) /* * Author(s), use "Name <email>" or just "Name", for multiple * authors use multiple MODULE_AUTHOR() statements/lines. */ #define MODULE_AUTHOR(_author) MODULE_INFO(author, _author) /* What your module does. */ #define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description) /* * Format: __mod_device_table__kmod_<modname>__<type>__<name> * Parts of the string `__kmod_` and `__` are used as delimiters when parsing * a symbol in file2alias.c */ #define __mod_device_table(type, name) \ __PASTE(__mod_device_table__, \ __PASTE(kmod_, \ __PASTE(__KBUILD_MODNAME, \ __PASTE(__, \ __PASTE(type, \ __PASTE(__, name)))))) /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ static typeof(name) __mod_device_table(type, name) \ __attribute__ ((used, alias(__stringify(name)))) /* Version of form [<epoch>:]<version>[-<extra-version>]. * Or for CVS/RCS ID version, everything but the number is stripped. * <epoch>: A (small) unsigned integer which allows you to start versions * anew. If not mentioned, it's zero. eg. "2:1.0" is after * "1:2.0". * <version>: The <version> may contain only alphanumerics and the * character `.'. Ordered by numeric sort for numeric parts, * ascii sort for ascii parts (as per RPM or DEB algorithm). * <extraversion>: Like <version>, but inserted for local * customizations, eg "rh3" or "rusty1". * Using this automatically adds a checksum of the .c files and the * local headers in "srcversion". */ #if defined(MODULE) || !defined(CONFIG_SYSFS) #define MODULE_VERSION(_version) MODULE_INFO(version, _version) #else #define MODULE_VERSION(_version) \ MODULE_INFO(version, _version); \ static const struct module_version_attribute __modver_attr \ __used __section("__modver") \ __aligned(__alignof__(struct module_version_attribute)) \ = { \ .mattr = { \ .attr = { \ .name = "version", \ .mode = S_IRUGO, \ }, \ .show = __modver_version_show, \ }, \ .module_name = KBUILD_MODNAME, \ .version = _version, \ } #endif /* Optional firmware file (or files) needed by the module * format is simply firmware file name. Multiple firmware * files require multiple MODULE_FIRMWARE() specifiers */ #define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware) #define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, ns) struct notifier_block; enum module_state { MODULE_STATE_LIVE, /* Normal state. */ MODULE_STATE_COMING, /* Full formed, running module_init. */ MODULE_STATE_GOING, /* Going away. */ MODULE_STATE_UNFORMED, /* Still setting it up. */ }; struct mod_tree_node { struct module *mod; struct latch_tree_node node; }; enum mod_mem_type { MOD_TEXT = 0, MOD_DATA, MOD_RODATA, MOD_RO_AFTER_INIT, MOD_INIT_TEXT, MOD_INIT_DATA, MOD_INIT_RODATA, MOD_MEM_NUM_TYPES, MOD_INVALID = -1, }; #define mod_mem_type_is_init(type) \ ((type) == MOD_INIT_TEXT || \ (type) == MOD_INIT_DATA || \ (type) == MOD_INIT_RODATA) #define mod_mem_type_is_core(type) (!mod_mem_type_is_init(type)) #define mod_mem_type_is_text(type) \ ((type) == MOD_TEXT || \ (type) == MOD_INIT_TEXT) #define mod_mem_type_is_data(type) (!mod_mem_type_is_text(type)) #define mod_mem_type_is_core_data(type) \ (mod_mem_type_is_core(type) && \ mod_mem_type_is_data(type)) #define for_each_mod_mem_type(type) \ for (enum mod_mem_type (type) = 0; \ (type) < MOD_MEM_NUM_TYPES; (type)++) #define for_class_mod_mem_type(type, class) \ for_each_mod_mem_type(type) \ if (mod_mem_type_is_##class(type)) struct module_memory { void *base; bool is_rox; unsigned int size; #ifdef CONFIG_MODULES_TREE_LOOKUP struct mod_tree_node mtn; #endif }; #ifdef CONFIG_MODULES_TREE_LOOKUP /* Only touch one cacheline for common rbtree-for-core-layout case. */ #define __module_memory_align ____cacheline_aligned #else #define __module_memory_align #endif struct mod_kallsyms { Elf_Sym *symtab; unsigned int num_symtab; char *strtab; char *typetab; }; #ifdef CONFIG_LIVEPATCH /** * struct klp_modinfo - ELF information preserved from the livepatch module * * @hdr: ELF header * @sechdrs: Section header table * @secstrings: String table for the section headers * @symndx: The symbol table section index */ struct klp_modinfo { Elf_Ehdr hdr; Elf_Shdr *sechdrs; char *secstrings; unsigned int symndx; }; #endif struct module { enum module_state state; /* Member of list of modules */ struct list_head list; /* Unique handle for this module */ char name[MODULE_NAME_LEN]; #ifdef CONFIG_STACKTRACE_BUILD_ID /* Module build ID */ unsigned char build_id[BUILD_ID_SIZE_MAX]; #endif /* Sysfs stuff. */ struct module_kobject mkobj; struct module_attribute *modinfo_attrs; const char *version; const char *srcversion; struct kobject *holders_dir; /* Exported symbols */ const struct kernel_symbol *syms; const u32 *crcs; unsigned int num_syms; #ifdef CONFIG_ARCH_USES_CFI_TRAPS s32 *kcfi_traps; s32 *kcfi_traps_end; #endif /* Kernel parameters. */ #ifdef CONFIG_SYSFS struct mutex param_lock; #endif struct kernel_param *kp; unsigned int num_kp; /* GPL-only exported symbols. */ unsigned int num_gpl_syms; const struct kernel_symbol *gpl_syms; const u32 *gpl_crcs; bool using_gplonly_symbols; #ifdef CONFIG_MODULE_SIG /* Signature was verified. */ bool sig_ok; #endif bool async_probe_requested; /* Exception table */ unsigned int num_exentries; struct exception_table_entry *extable; /* Startup function. */ int (*init)(void); struct module_memory mem[MOD_MEM_NUM_TYPES] __module_memory_align; /* Arch-specific module values */ struct mod_arch_specific arch; unsigned long taints; /* same bits as kernel:taint_flags */ #ifdef CONFIG_GENERIC_BUG /* Support for BUG */ unsigned num_bugs; struct list_head bug_list; struct bug_entry *bug_table; #endif #ifdef CONFIG_KALLSYMS /* Protected by RCU and/or module_mutex: use rcu_dereference() */ struct mod_kallsyms __rcu *kallsyms; struct mod_kallsyms core_kallsyms; /* Section attributes */ struct module_sect_attrs *sect_attrs; /* Notes attributes */ struct module_notes_attrs *notes_attrs; #endif /* The command line arguments (may be mangled). People like keeping pointers to this stuff */ char *args; #ifdef CONFIG_SMP /* Per-cpu data. */ void __percpu *percpu; unsigned int percpu_size; #endif void *noinstr_text_start; unsigned int noinstr_text_size; #ifdef CONFIG_TRACEPOINTS unsigned int num_tracepoints; tracepoint_ptr_t *tracepoints_ptrs; #endif #ifdef CONFIG_TREE_SRCU unsigned int num_srcu_structs; struct srcu_struct **srcu_struct_ptrs; #endif #ifdef CONFIG_BPF_EVENTS unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; #endif #ifdef CONFIG_DEBUG_INFO_BTF_MODULES unsigned int btf_data_size; unsigned int btf_base_data_size; void *btf_data; void *btf_base_data; #endif #ifdef CONFIG_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; #endif #ifdef CONFIG_TRACING unsigned int num_trace_bprintk_fmt; const char **trace_bprintk_fmt_start; #endif #ifdef CONFIG_EVENT_TRACING struct trace_event_call **trace_events; unsigned int num_trace_events; struct trace_eval_map **trace_evals; unsigned int num_trace_evals; #endif #ifdef CONFIG_DYNAMIC_FTRACE unsigned int num_ftrace_callsites; unsigned long *ftrace_callsites; #endif #ifdef CONFIG_KPROBES void *kprobes_text_start; unsigned int kprobes_text_size; unsigned long *kprobe_blacklist; unsigned int num_kprobe_blacklist; #endif #ifdef CONFIG_HAVE_STATIC_CALL_INLINE int num_static_call_sites; struct static_call_site *static_call_sites; #endif #if IS_ENABLED(CONFIG_KUNIT) int num_kunit_init_suites; struct kunit_suite **kunit_init_suites; int num_kunit_suites; struct kunit_suite **kunit_suites; #endif #ifdef CONFIG_LIVEPATCH bool klp; /* Is this a livepatch module? */ bool klp_alive; /* ELF information */ struct klp_modinfo *klp_info; #endif #ifdef CONFIG_PRINTK_INDEX unsigned int printk_index_size; struct pi_entry **printk_index_start; #endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ struct list_head source_list; /* What modules do I depend on? */ struct list_head target_list; /* Destruction function. */ void (*exit)(void); atomic_t refcnt; #endif #ifdef CONFIG_CONSTRUCTORS /* Constructor functions. */ ctor_fn_t *ctors; unsigned int num_ctors; #endif #ifdef CONFIG_FUNCTION_ERROR_INJECTION struct error_injection_entry *ei_funcs; unsigned int num_ei_funcs; #endif #ifdef CONFIG_DYNAMIC_DEBUG_CORE struct _ddebug_info dyndbg_info; #endif } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} #endif #ifdef CONFIG_MODULES /* Get/put a kernel symbol (calls must be symmetric) */ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); #define symbol_get(x) ({ \ static const char __notrim[] \ __used __section(".no_trim_symbol") = __stringify(x); \ (typeof(&x))(__symbol_get(__stringify(x))); }) #ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym) { return sym->st_value; } #endif /* FIXME: It'd be nice to isolate modules during init, too, so they aren't used before they (may) fail. But presently too much code (IDE & SCSI) require entry into the module during init.*/ static inline bool module_is_live(struct module *mod) { return mod->state != MODULE_STATE_GOING; } static inline bool module_is_coming(struct module *mod) { return mod->state == MODULE_STATE_COMING; } struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr); bool is_module_percpu_address(unsigned long addr); bool is_module_text_address(unsigned long addr); static inline bool within_module_mem_type(unsigned long addr, const struct module *mod, enum mod_mem_type type) { unsigned long base, size; base = (unsigned long)mod->mem[type].base; size = mod->mem[type].size; return addr - base < size; } static inline bool within_module_core(unsigned long addr, const struct module *mod) { for_class_mod_mem_type(type, core) { if (within_module_mem_type(addr, mod, type)) return true; } return false; } static inline bool within_module_init(unsigned long addr, const struct module *mod) { for_class_mod_mem_type(type, init) { if (within_module_mem_type(addr, mod, type)) return true; } return false; } static inline bool within_module(unsigned long addr, const struct module *mod) { return within_module_init(addr, mod) || within_module_core(addr, mod); } /* Search for module by name: must be in a RCU critical section. */ struct module *find_module(const char *name); extern void __noreturn __module_put_and_kthread_exit(struct module *mod, long code); #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code) #ifdef CONFIG_MODULE_UNLOAD int module_refcount(struct module *mod); void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(__stringify(x)) void symbol_put_addr(void *addr); /* Sometimes we know we already have a refcount, and it's easier not to handle the error case (which only happens with rmmod --wait). */ extern void __module_get(struct module *module); /** * try_module_get() - take module refcount unless module is being removed * @module: the module we should check for * * Only try to get a module reference count if the module is not being removed. * This call will fail if the module is in the process of being removed. * * Care must also be taken to ensure the module exists and is alive prior to * usage of this call. This can be gauranteed through two means: * * 1) Direct protection: you know an earlier caller must have increased the * module reference through __module_get(). This can typically be achieved * by having another entity other than the module itself increment the * module reference count. * * 2) Implied protection: there is an implied protection against module * removal. An example of this is the implied protection used by kernfs / * sysfs. The sysfs store / read file operations are guaranteed to exist * through the use of kernfs's active reference (see kernfs_active()) and a * sysfs / kernfs file removal cannot happen unless the same file is not * active. Therefore, if a sysfs file is being read or written to the module * which created it must still exist. It is therefore safe to use * try_module_get() on module sysfs store / read ops. * * One of the real values to try_module_get() is the module_is_live() check * which ensures that the caller of try_module_get() can yield to userspace * module removal requests and gracefully fail if the module is on its way out. * * Returns true if the reference count was successfully incremented. */ extern bool try_module_get(struct module *module); /** * module_put() - release a reference count to a module * @module: the module we should release a reference count for * * If you successfully bump a reference count to a module with try_module_get(), * when you are finished you must call module_put() to release that reference * count. */ extern void module_put(struct module *module); #else /*!CONFIG_MODULE_UNLOAD*/ static inline bool try_module_get(struct module *module) { return !module || module_is_live(module); } static inline void module_put(struct module *module) { } static inline void __module_get(struct module *module) { } #define symbol_put(x) do { } while (0) #define symbol_put_addr(p) do { } while (0) #endif /* CONFIG_MODULE_UNLOAD */ /* This is a #define so the string doesn't get put in every .o file */ #define module_name(mod) \ ({ \ struct module *__mod = (mod); \ __mod ? __mod->name : "kernel"; \ }) /* Dereference module function descriptor */ void *dereference_module_function_descriptor(struct module *mod, void *ptr); int register_module_notifier(struct notifier_block *nb); int unregister_module_notifier(struct notifier_block *nb); extern void print_modules(void); static inline bool module_requested_async_probing(struct module *module) { return module && module->async_probe_requested; } static inline bool is_livepatch_module(struct module *mod) { #ifdef CONFIG_LIVEPATCH return mod->klp; #else return false; #endif } void set_module_sig_enforced(void); void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data); #else /* !CONFIG_MODULES... */ static inline struct module *__module_address(unsigned long addr) { return NULL; } static inline struct module *__module_text_address(unsigned long addr) { return NULL; } static inline bool is_module_address(unsigned long addr) { return false; } static inline bool is_module_percpu_address(unsigned long addr) { return false; } static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) { return false; } static inline bool is_module_text_address(unsigned long addr) { return false; } static inline bool within_module_core(unsigned long addr, const struct module *mod) { return false; } static inline bool within_module_init(unsigned long addr, const struct module *mod) { return false; } static inline bool within_module(unsigned long addr, const struct module *mod) { return false; } /* Get/put a kernel symbol (calls should be symmetric) */ #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); }) #define symbol_put(x) do { } while (0) #define symbol_put_addr(x) do { } while (0) static inline void __module_get(struct module *module) { } static inline bool try_module_get(struct module *module) { return true; } static inline void module_put(struct module *module) { } #define module_name(mod) "kernel" static inline int register_module_notifier(struct notifier_block *nb) { /* no events will happen anyway, so this can always succeed */ return 0; } static inline int unregister_module_notifier(struct notifier_block *nb) { return 0; } #define module_put_and_kthread_exit(code) kthread_exit(code) static inline void print_modules(void) { } static inline bool module_requested_async_probing(struct module *module) { return false; } static inline void set_module_sig_enforced(void) { } /* Dereference module function descriptor */ static inline void *dereference_module_function_descriptor(struct module *mod, void *ptr) { return ptr; } static inline bool module_is_coming(struct module *mod) { return false; } static inline void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data) { } #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS extern struct kset *module_kset; extern const struct kobj_type module_ktype; #endif /* CONFIG_SYSFS */ #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x) /* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */ #define __MODULE_STRING(x) __stringify(x) #ifdef CONFIG_GENERIC_BUG void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, struct module *); void module_bug_cleanup(struct module *); #else /* !CONFIG_GENERIC_BUG */ static inline void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) { } static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ #ifdef CONFIG_MITIGATION_RETPOLINE extern bool retpoline_module_ok(bool has_retpoline); #else static inline bool retpoline_module_ok(bool has_retpoline) { return true; } #endif #ifdef CONFIG_MODULE_SIG bool is_module_sig_enforced(void); static inline bool module_sig_ok(struct module *module) { return module->sig_ok; } #else /* !CONFIG_MODULE_SIG */ static inline bool is_module_sig_enforced(void) { return false; } static inline bool module_sig_ok(struct module *module) { return true; } #endif /* CONFIG_MODULE_SIG */ #if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS) int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, unsigned long), void *data); /* For kallsyms to ask for address resolution. namebuf should be at * least KSYM_NAME_LEN long: a pointer to namebuf is returned if * found, otherwise NULL. */ int module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf); int lookup_module_symbol_name(unsigned long addr, char *symname); int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if * symnum out of range. */ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported); /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name); unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); #else /* CONFIG_MODULES && CONFIG_KALLSYMS */ static inline int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, unsigned long), void *data) { return -EOPNOTSUPP; } /* For kallsyms to ask for address resolution. NULL means not found. */ static inline int module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { return 0; } static inline int lookup_module_symbol_name(unsigned long addr, char *symname) { return -ERANGE; } static inline int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported) { return -ERANGE; } static inline unsigned long module_kallsyms_lookup_name(const char *name) { return 0; } static inline unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) { return 0; } #endif /* CONFIG_MODULES && CONFIG_KALLSYMS */ /* Define __free(module_put) macro for struct module *. */ DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T)) #endif /* _LINUX_MODULE_H */
1 1 1 1 5 5 5 5 5 20 20 27 27 20 20 20 19 19 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #include <linux/hardirq.h> #include <linux/init.h> #include <linux/module.h> #include <linux/device.h> #include <linux/types.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/tty.h> #include <linux/file.h> #include <linux/if_arp.h> #include <net/caif/caif_device.h> #include <net/caif/cfcnfg.h> #include <linux/err.h> #include <linux/debugfs.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sjur Brendeland"); MODULE_DESCRIPTION("CAIF serial device TTY line discipline"); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_CAIF); #define SEND_QUEUE_LOW 10 #define SEND_QUEUE_HIGH 100 #define CAIF_SENDING 1 /* Bit 1 = 0x02*/ #define CAIF_FLOW_OFF_SENT 4 /* Bit 4 = 0x10 */ #define MAX_WRITE_CHUNK 4096 #define ON 1 #define OFF 0 #define CAIF_MAX_MTU 4096 static DEFINE_SPINLOCK(ser_lock); static LIST_HEAD(ser_list); static LIST_HEAD(ser_release_list); static bool ser_loop; module_param(ser_loop, bool, 0444); MODULE_PARM_DESC(ser_loop, "Run in simulated loopback mode."); static bool ser_use_stx = true; module_param(ser_use_stx, bool, 0444); MODULE_PARM_DESC(ser_use_stx, "STX enabled or not."); static bool ser_use_fcs = true; module_param(ser_use_fcs, bool, 0444); MODULE_PARM_DESC(ser_use_fcs, "FCS enabled or not."); static int ser_write_chunk = MAX_WRITE_CHUNK; module_param(ser_write_chunk, int, 0444); MODULE_PARM_DESC(ser_write_chunk, "Maximum size of data written to UART."); static struct dentry *debugfsdir; static int caif_net_open(struct net_device *dev); static int caif_net_close(struct net_device *dev); struct ser_device { struct caif_dev_common common; struct list_head node; struct net_device *dev; struct sk_buff_head head; struct tty_struct *tty; bool tx_started; unsigned long state; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_tty_dir; struct debugfs_blob_wrapper tx_blob; struct debugfs_blob_wrapper rx_blob; u8 rx_data[128]; u8 tx_data[128]; u8 tty_status; #endif }; static void caifdev_setup(struct net_device *dev); static void ldisc_tx_wakeup(struct tty_struct *tty); #ifdef CONFIG_DEBUG_FS static inline void update_tty_status(struct ser_device *ser) { ser->tty_status = ser->tty->flow.stopped << 5 | ser->tty->flow.tco_stopped << 3 | ser->tty->ctrl.packet << 2; } static inline void debugfs_init(struct ser_device *ser, struct tty_struct *tty) { ser->debugfs_tty_dir = debugfs_create_dir(tty->name, debugfsdir); debugfs_create_blob("last_tx_msg", 0400, ser->debugfs_tty_dir, &ser->tx_blob); debugfs_create_blob("last_rx_msg", 0400, ser->debugfs_tty_dir, &ser->rx_blob); debugfs_create_xul("ser_state", 0400, ser->debugfs_tty_dir, &ser->state); debugfs_create_x8("tty_status", 0400, ser->debugfs_tty_dir, &ser->tty_status); ser->tx_blob.data = ser->tx_data; ser->tx_blob.size = 0; ser->rx_blob.data = ser->rx_data; ser->rx_blob.size = 0; } static inline void debugfs_deinit(struct ser_device *ser) { debugfs_remove_recursive(ser->debugfs_tty_dir); } static inline void debugfs_rx(struct ser_device *ser, const u8 *data, int size) { if (size > sizeof(ser->rx_data)) size = sizeof(ser->rx_data); memcpy(ser->rx_data, data, size); ser->rx_blob.data = ser->rx_data; ser->rx_blob.size = size; } #else static inline void debugfs_init(struct ser_device *ser, struct tty_struct *tty) { } static inline void debugfs_deinit(struct ser_device *ser) { } static inline void update_tty_status(struct ser_device *ser) { } static inline void debugfs_rx(struct ser_device *ser, const u8 *data, int size) { } #endif static void ldisc_receive(struct tty_struct *tty, const u8 *data, const u8 *flags, size_t count) { struct sk_buff *skb = NULL; struct ser_device *ser; int ret; ser = tty->disc_data; /* * NOTE: flags may contain information about break or overrun. * This is not yet handled. */ /* * Workaround for garbage at start of transmission, * only enable if STX handling is not enabled. */ if (!ser->common.use_stx && !ser->tx_started) { dev_info(&ser->dev->dev, "Bytes received before initial transmission -" "bytes discarded.\n"); return; } BUG_ON(ser->dev == NULL); /* Get a suitable caif packet and copy in data. */ skb = netdev_alloc_skb(ser->dev, count+1); if (skb == NULL) return; skb_put_data(skb, data, count); skb->protocol = htons(ETH_P_CAIF); skb_reset_mac_header(skb); debugfs_rx(ser, data, count); /* Push received packet up the stack. */ ret = netif_rx(skb); if (!ret) { ser->dev->stats.rx_packets++; ser->dev->stats.rx_bytes += count; } else ++ser->dev->stats.rx_dropped; update_tty_status(ser); } static int handle_tx(struct ser_device *ser) { struct tty_struct *tty; struct sk_buff *skb; int tty_wr, len, room; tty = ser->tty; ser->tx_started = true; /* Enter critical section */ if (test_and_set_bit(CAIF_SENDING, &ser->state)) return 0; /* skb_peek is safe because handle_tx is called after skb_queue_tail */ while ((skb = skb_peek(&ser->head)) != NULL) { /* Make sure you don't write too much */ len = skb->len; room = tty_write_room(tty); if (!room) break; if (room > ser_write_chunk) room = ser_write_chunk; if (len > room) len = room; /* Write to tty or loopback */ if (!ser_loop) { tty_wr = tty->ops->write(tty, skb->data, len); update_tty_status(ser); } else { tty_wr = len; ldisc_receive(tty, skb->data, NULL, len); } ser->dev->stats.tx_packets++; ser->dev->stats.tx_bytes += tty_wr; /* Error on TTY ?! */ if (tty_wr < 0) goto error; /* Reduce buffer written, and discard if empty */ skb_pull(skb, tty_wr); if (skb->len == 0) { struct sk_buff *tmp = skb_dequeue(&ser->head); WARN_ON(tmp != skb); dev_consume_skb_any(skb); } } /* Send flow off if queue is empty */ if (ser->head.qlen <= SEND_QUEUE_LOW && test_and_clear_bit(CAIF_FLOW_OFF_SENT, &ser->state) && ser->common.flowctrl != NULL) ser->common.flowctrl(ser->dev, ON); clear_bit(CAIF_SENDING, &ser->state); return 0; error: clear_bit(CAIF_SENDING, &ser->state); return tty_wr; } static netdev_tx_t caif_xmit(struct sk_buff *skb, struct net_device *dev) { struct ser_device *ser; ser = netdev_priv(dev); /* Send flow off once, on high water mark */ if (ser->head.qlen > SEND_QUEUE_HIGH && !test_and_set_bit(CAIF_FLOW_OFF_SENT, &ser->state) && ser->common.flowctrl != NULL) ser->common.flowctrl(ser->dev, OFF); skb_queue_tail(&ser->head, skb); return handle_tx(ser); } static void ldisc_tx_wakeup(struct tty_struct *tty) { struct ser_device *ser; ser = tty->disc_data; BUG_ON(ser == NULL); WARN_ON(ser->tty != tty); handle_tx(ser); } static void ser_release(struct work_struct *work) { struct list_head list; struct ser_device *ser, *tmp; spin_lock(&ser_lock); list_replace_init(&ser_release_list, &list); spin_unlock(&ser_lock); if (!list_empty(&list)) { rtnl_lock(); list_for_each_entry_safe(ser, tmp, &list, node) { dev_close(ser->dev); unregister_netdevice(ser->dev); debugfs_deinit(ser); } rtnl_unlock(); } } static DECLARE_WORK(ser_release_work, ser_release); static int ldisc_open(struct tty_struct *tty) { struct ser_device *ser; struct net_device *dev; char name[64]; int result; /* No write no play */ if (tty->ops->write == NULL) return -EOPNOTSUPP; if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_TTY_CONFIG)) return -EPERM; /* release devices to avoid name collision */ ser_release(NULL); result = snprintf(name, sizeof(name), "cf%s", tty->name); if (result >= IFNAMSIZ) return -EINVAL; dev = alloc_netdev(sizeof(*ser), name, NET_NAME_UNKNOWN, caifdev_setup); if (!dev) return -ENOMEM; ser = netdev_priv(dev); ser->tty = tty_kref_get(tty); ser->dev = dev; debugfs_init(ser, tty); tty->receive_room = 4096; tty->disc_data = ser; set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); rtnl_lock(); result = register_netdevice(dev); if (result) { tty_kref_put(tty); rtnl_unlock(); free_netdev(dev); return -ENODEV; } spin_lock(&ser_lock); list_add(&ser->node, &ser_list); spin_unlock(&ser_lock); rtnl_unlock(); netif_stop_queue(dev); update_tty_status(ser); return 0; } static void ldisc_close(struct tty_struct *tty) { struct ser_device *ser = tty->disc_data; tty_kref_put(ser->tty); spin_lock(&ser_lock); list_move(&ser->node, &ser_release_list); spin_unlock(&ser_lock); schedule_work(&ser_release_work); } /* The line discipline structure. */ static struct tty_ldisc_ops caif_ldisc = { .owner = THIS_MODULE, .num = N_CAIF, .name = "n_caif", .open = ldisc_open, .close = ldisc_close, .receive_buf = ldisc_receive, .write_wakeup = ldisc_tx_wakeup }; static const struct net_device_ops netdev_ops = { .ndo_open = caif_net_open, .ndo_stop = caif_net_close, .ndo_start_xmit = caif_xmit }; static void caifdev_setup(struct net_device *dev) { struct ser_device *serdev = netdev_priv(dev); dev->features = 0; dev->netdev_ops = &netdev_ops; dev->type = ARPHRD_CAIF; dev->flags = IFF_POINTOPOINT | IFF_NOARP; dev->mtu = CAIF_MAX_MTU; dev->priv_flags |= IFF_NO_QUEUE; dev->needs_free_netdev = true; skb_queue_head_init(&serdev->head); serdev->common.link_select = CAIF_LINK_LOW_LATENCY; serdev->common.use_frag = true; serdev->common.use_stx = ser_use_stx; serdev->common.use_fcs = ser_use_fcs; serdev->dev = dev; } static int caif_net_open(struct net_device *dev) { netif_wake_queue(dev); return 0; } static int caif_net_close(struct net_device *dev) { netif_stop_queue(dev); return 0; } static int __init caif_ser_init(void) { int ret; ret = tty_register_ldisc(&caif_ldisc); if (ret < 0) pr_err("cannot register CAIF ldisc=%d err=%d\n", N_CAIF, ret); debugfsdir = debugfs_create_dir("caif_serial", NULL); return ret; } static void __exit caif_ser_exit(void) { spin_lock(&ser_lock); list_splice(&ser_list, &ser_release_list); spin_unlock(&ser_lock); ser_release(NULL); cancel_work_sync(&ser_release_work); tty_unregister_ldisc(&caif_ldisc); debugfs_remove_recursive(debugfsdir); } module_init(caif_ser_init); module_exit(caif_ser_exit);
30 9 30 8 191 190 1 188 191 188 191 20 6 17 20 25 23 25 24 25 45 25 20 45 136 137 48 184 48 136 48 184 190 46 190 190 191 1 190 191 190 190 190 1 189 190 1 48 190 190 138 52 51 6 48 137 184 189 163 29 26 46 1 45 12 209 140 102 2 2 194 1 1 1 3 3 3 2 3 2 1 6 4 98 1 265 265 1 9 192 1 9 4 8 191 191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 // SPDX-License-Identifier: GPL-2.0-or-later /* AF_RXRPC sendmsg() implementation. * * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/net.h> #include <linux/gfp.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" /* * Propose an abort to be made in the I/O thread. */ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, enum rxrpc_abort_reason why) { _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); if (!call->send_abort && !rxrpc_call_is_complete(call)) { call->send_abort_why = why; call->send_abort_err = error; call->send_abort_seq = 0; trace_rxrpc_abort_call(call, abort_code); /* Request abort locklessly vs rxrpc_input_call_event(). */ smp_store_release(&call->send_abort, abort_code); rxrpc_poke_call(call, rxrpc_call_poke_abort); return true; } return false; } /* * Wait for a call to become connected. Interruption here doesn't cause the * call to be aborted. */ static int rxrpc_wait_to_be_connected(struct rxrpc_call *call, long *timeo) { DECLARE_WAITQUEUE(myself, current); int ret = 0; _enter("%d", call->debug_id); if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) goto no_wait; add_wait_queue_exclusive(&call->waitq, &myself); for (;;) { switch (call->interruptibility) { case RXRPC_INTERRUPTIBLE: case RXRPC_PREINTERRUPTIBLE: set_current_state(TASK_INTERRUPTIBLE); break; case RXRPC_UNINTERRUPTIBLE: default: set_current_state(TASK_UNINTERRUPTIBLE); break; } if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) break; if ((call->interruptibility == RXRPC_INTERRUPTIBLE || call->interruptibility == RXRPC_PREINTERRUPTIBLE) && signal_pending(current)) { ret = sock_intr_errno(*timeo); break; } *timeo = schedule_timeout(*timeo); } remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); no_wait: if (ret == 0 && rxrpc_call_is_complete(call)) ret = call->error; _leave(" = %d", ret); return ret; } /* * Return true if there's sufficient Tx queue space. */ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) { rxrpc_seq_t tx_bottom = READ_ONCE(call->tx_bottom); if (_tx_win) *_tx_win = tx_bottom; return call->send_top - tx_bottom < 256; } /* * Wait for space to appear in the Tx queue or a signal to occur. */ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, struct rxrpc_call *call, long *timeo) { for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (rxrpc_check_tx_space(call, NULL)) return 0; if (rxrpc_call_is_complete(call)) return call->error; if (signal_pending(current)) return sock_intr_errno(*timeo); trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); *timeo = schedule_timeout(*timeo); } } /* * Wait for space to appear in the Tx queue uninterruptibly, but with * a timeout of 2*RTT if no progress was made and a signal occurred. */ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, struct rxrpc_call *call) { rxrpc_seq_t tx_start, tx_win; signed long rtt, timeout; rtt = READ_ONCE(call->srtt_us) >> 3; rtt = usecs_to_jiffies(rtt) * 2; if (rtt < 2) rtt = 2; timeout = rtt; tx_start = READ_ONCE(call->tx_bottom); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (rxrpc_check_tx_space(call, &tx_win)) return 0; if (rxrpc_call_is_complete(call)) return call->error; if (timeout == 0 && tx_win == tx_start && signal_pending(current)) return -EINTR; if (tx_win != tx_start) { timeout = rtt; tx_start = tx_win; } trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); timeout = schedule_timeout(timeout); } } /* * Wait for space to appear in the Tx queue uninterruptibly. */ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, struct rxrpc_call *call, long *timeo) { for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (rxrpc_check_tx_space(call, NULL)) return 0; if (rxrpc_call_is_complete(call)) return call->error; trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); *timeo = schedule_timeout(*timeo); } } /* * wait for space to appear in the transmit/ACK window * - caller holds the socket locked */ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, struct rxrpc_call *call, long *timeo, bool waitall) { DECLARE_WAITQUEUE(myself, current); int ret; _enter(",{%u,%u,%u}", call->tx_bottom, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); switch (call->interruptibility) { case RXRPC_INTERRUPTIBLE: if (waitall) ret = rxrpc_wait_for_tx_window_waitall(rx, call); else ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo); break; case RXRPC_PREINTERRUPTIBLE: case RXRPC_UNINTERRUPTIBLE: default: ret = rxrpc_wait_for_tx_window_nonintr(rx, call, timeo); break; } remove_wait_queue(&call->waitq, &myself); set_current_state(TASK_RUNNING); _leave(" = %d", ret); return ret; } /* * Notify the owner of the call that the transmit phase is ended and the last * packet has been queued. */ static void rxrpc_notify_end_tx(struct rxrpc_sock *rx, struct rxrpc_call *call, rxrpc_notify_end_tx_t notify_end_tx) { if (notify_end_tx) notify_end_tx(&rx->sk, call, call->user_call_ID); } /* * Queue a DATA packet for transmission, set the resend timeout and send * the packet immediately. Returns the error from rxrpc_send_data_packet() * in case the caller wants to do something with it. */ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_notify_end_tx_t notify_end_tx) { struct rxrpc_txqueue *sq = call->send_queue; rxrpc_seq_t seq = txb->seq; bool poke, last = txb->flags & RXRPC_LAST_PACKET; int ix = seq & RXRPC_TXQ_MASK; rxrpc_inc_stat(call->rxnet, stat_tx_data); ASSERTCMP(txb->seq, ==, call->send_top + 1); if (last) trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last); else trace_rxrpc_txqueue(call, rxrpc_txqueue_queue); if (WARN_ON_ONCE(sq->bufs[ix])) trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue_dup); else trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue); /* Add the packet to the call's output buffer */ poke = (READ_ONCE(call->tx_bottom) == call->send_top); sq->bufs[ix] = txb; /* Order send_top after the queue->next pointer and txb content. */ smp_store_release(&call->send_top, seq); if (last) { set_bit(RXRPC_CALL_TX_NO_MORE, &call->flags); rxrpc_notify_end_tx(rx, call, notify_end_tx); call->send_queue = NULL; } if (poke) rxrpc_poke_call(call, rxrpc_call_poke_start); } /* * Allocate a new txqueue unit and add it to the transmission queue. */ static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call) { struct rxrpc_txqueue *tq; tq = kzalloc(sizeof(*tq), sk->sk_allocation); if (!tq) return -ENOMEM; tq->xmit_ts_base = KTIME_MIN; for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) tq->segment_xmit_ts[i] = UINT_MAX; if (call->send_queue) { tq->qbase = call->send_top + 1; call->send_queue->next = tq; call->send_queue = tq; } else if (WARN_ON(call->tx_queue)) { kfree(tq); return -ENOMEM; } else { /* We start at seq 1, so pretend seq 0 is hard-acked. */ tq->nr_reported_acks = 1; tq->segment_acked = 1UL; tq->qbase = 0; call->tx_qbase = 0; call->send_queue = tq; call->tx_qtail = tq; call->tx_queue = tq; } trace_rxrpc_tq(call, tq, call->send_top, rxrpc_tq_alloc); return 0; } /* * send data through a socket * - must be called in process context * - The caller holds the call user access mutex, but not the socket lock. */ static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, struct msghdr *msg, size_t len, rxrpc_notify_end_tx_t notify_end_tx, bool *_dropped_lock) { struct rxrpc_txbuf *txb; struct sock *sk = &rx->sk; enum rxrpc_call_state state; long timeo; bool more = msg->msg_flags & MSG_MORE; int ret, copied = 0; if (test_bit(RXRPC_CALL_TX_NO_MORE, &call->flags)) { trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, call->cid, call->call_id, call->rx_consumed, 0, -EPROTO); return -EPROTO; } timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); ret = rxrpc_wait_to_be_connected(call, &timeo); if (ret < 0) return ret; if (call->conn->state == RXRPC_CONN_CLIENT_UNSECURED) { ret = rxrpc_init_client_conn_security(call->conn); if (ret < 0) return ret; } /* this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); reload: txb = call->tx_pending; call->tx_pending = NULL; if (txb) rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more); ret = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) goto maybe_error; state = rxrpc_call_state(call); ret = -ESHUTDOWN; if (state >= RXRPC_CALL_COMPLETE) goto maybe_error; ret = -EPROTO; if (state != RXRPC_CALL_CLIENT_SEND_REQUEST && state != RXRPC_CALL_SERVER_ACK_REQUEST && state != RXRPC_CALL_SERVER_SEND_REPLY) { /* Request phase complete for this client call */ trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, call->cid, call->call_id, call->rx_consumed, 0, -EPROTO); goto maybe_error; } ret = -EMSGSIZE; if (call->tx_total_len != -1) { if (len - copied > call->tx_total_len) goto maybe_error; if (!more && len - copied != call->tx_total_len) goto maybe_error; } do { if (!txb) { size_t remain; _debug("alloc"); if (!rxrpc_check_tx_space(call, NULL)) goto wait_for_space; /* See if we need to begin/extend the Tx queue. */ if (!call->send_queue || !((call->send_top + 1) & RXRPC_TXQ_MASK)) { ret = rxrpc_alloc_txqueue(sk, call); if (ret < 0) goto maybe_error; } /* Work out the maximum size of a packet. Assume that * the security header is going to be in the padded * region (enc blocksize), but the trailer is not. */ remain = more ? INT_MAX : msg_data_left(msg); txb = call->conn->security->alloc_txbuf(call, remain, sk->sk_allocation); if (!txb) { ret = -ENOMEM; goto maybe_error; } } _debug("append"); /* append next segment of data to the current buffer */ if (msg_data_left(msg) > 0) { size_t copy = umin(txb->space, msg_data_left(msg)); _debug("add %zu", copy); if (!copy_from_iter_full(txb->data + txb->offset, copy, &msg->msg_iter)) goto efault; _debug("added"); txb->space -= copy; txb->len += copy; txb->offset += copy; copied += copy; if (call->tx_total_len != -1) call->tx_total_len -= copy; } /* check for the far side aborting the call or a network error * occurring */ if (rxrpc_call_is_complete(call)) goto call_terminated; /* add the packet to the send queue if it's now full */ if (!txb->space || (msg_data_left(msg) == 0 && !more)) { if (msg_data_left(msg) == 0 && !more) txb->flags |= RXRPC_LAST_PACKET; ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } } while (msg_data_left(msg) > 0); success: ret = copied; if (rxrpc_call_is_complete(call) && call->error < 0) ret = call->error; out: call->tx_pending = txb; _leave(" = %d", ret); return ret; call_terminated: rxrpc_put_txbuf(txb, rxrpc_txbuf_put_send_aborted); _leave(" = %d", call->error); return call->error; maybe_error: if (copied) goto success; goto out; efault: ret = -EFAULT; goto out; wait_for_space: ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; mutex_unlock(&call->user_mutex); *_dropped_lock = true; ret = rxrpc_wait_for_tx_window(rx, call, &timeo, msg->msg_flags & MSG_WAITALL); if (ret < 0) goto maybe_error; if (call->interruptibility == RXRPC_INTERRUPTIBLE) { if (mutex_lock_interruptible(&call->user_mutex) < 0) { ret = sock_intr_errno(timeo); goto maybe_error; } } else { mutex_lock(&call->user_mutex); } *_dropped_lock = false; goto reload; } /* * extract control messages from the sendmsg() control buffer */ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) { struct cmsghdr *cmsg; bool got_user_ID = false; int len; if (msg->msg_controllen == 0) return -EINVAL; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; len = cmsg->cmsg_len - sizeof(struct cmsghdr); _debug("CMSG %d, %d, %d", cmsg->cmsg_level, cmsg->cmsg_type, len); if (cmsg->cmsg_level != SOL_RXRPC) continue; switch (cmsg->cmsg_type) { case RXRPC_USER_CALL_ID: if (msg->msg_flags & MSG_CMSG_COMPAT) { if (len != sizeof(u32)) return -EINVAL; p->call.user_call_ID = *(u32 *)CMSG_DATA(cmsg); } else { if (len != sizeof(unsigned long)) return -EINVAL; p->call.user_call_ID = *(unsigned long *) CMSG_DATA(cmsg); } got_user_ID = true; break; case RXRPC_ABORT: if (p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; p->command = RXRPC_CMD_SEND_ABORT; if (len != sizeof(p->abort_code)) return -EINVAL; p->abort_code = *(unsigned int *)CMSG_DATA(cmsg); if (p->abort_code == 0) return -EINVAL; break; case RXRPC_CHARGE_ACCEPT: if (p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; p->command = RXRPC_CMD_CHARGE_ACCEPT; if (len != 0) return -EINVAL; break; case RXRPC_EXCLUSIVE_CALL: p->exclusive = true; if (len != 0) return -EINVAL; break; case RXRPC_UPGRADE_SERVICE: p->upgrade = true; if (len != 0) return -EINVAL; break; case RXRPC_TX_LENGTH: if (p->call.tx_total_len != -1 || len != sizeof(__s64)) return -EINVAL; p->call.tx_total_len = *(__s64 *)CMSG_DATA(cmsg); if (p->call.tx_total_len < 0) return -EINVAL; break; case RXRPC_SET_CALL_TIMEOUT: if (len & 3 || len < 4 || len > 12) return -EINVAL; memcpy(&p->call.timeouts, CMSG_DATA(cmsg), len); p->call.nr_timeouts = len / 4; if (p->call.timeouts.hard > INT_MAX / HZ) return -ERANGE; if (p->call.nr_timeouts >= 2 && p->call.timeouts.idle > 60 * 60 * 1000) return -ERANGE; if (p->call.nr_timeouts >= 3 && p->call.timeouts.normal > 60 * 60 * 1000) return -ERANGE; break; default: return -EINVAL; } } if (!got_user_ID) return -EINVAL; if (p->call.tx_total_len != -1 && p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; _leave(" = 0"); return 0; } /* * Create a new client call for sendmsg(). * - Called with the socket lock held, which it must release. * - If it returns a call, the call's lock will need releasing by the caller. */ static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, struct rxrpc_send_params *p) __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex) { struct rxrpc_conn_parameters cp; struct rxrpc_peer *peer; struct rxrpc_call *call; struct key *key; DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); _enter(""); if (!msg->msg_name) { release_sock(&rx->sk); return ERR_PTR(-EDESTADDRREQ); } peer = rxrpc_lookup_peer(rx->local, srx, GFP_KERNEL); if (!peer) { release_sock(&rx->sk); return ERR_PTR(-ENOMEM); } key = rx->key; if (key && !rx->key->payload.data[0]) key = NULL; memset(&cp, 0, sizeof(cp)); cp.local = rx->local; cp.peer = peer; cp.key = rx->key; cp.security_level = rx->min_sec_level; cp.exclusive = rx->exclusive | p->exclusive; cp.upgrade = p->upgrade; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, &p->call, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); /* The socket is now unlocked */ rxrpc_put_peer(peer, rxrpc_peer_put_application); _leave(" = %p\n", call); return call; } /* * send a message forming part of a client call through an RxRPC socket * - caller holds the socket locked * - the socket may be either a client socket or a server socket */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) { struct rxrpc_call *call; bool dropped_lock = false; int ret; struct rxrpc_send_params p = { .call.tx_total_len = -1, .call.user_call_ID = 0, .call.nr_timeouts = 0, .call.interruptibility = RXRPC_INTERRUPTIBLE, .abort_code = 0, .command = RXRPC_CMD_SEND_DATA, .exclusive = false, .upgrade = false, }; _enter(""); ret = rxrpc_sendmsg_cmsg(msg, &p); if (ret < 0) goto error_release_sock; if (p.command == RXRPC_CMD_CHARGE_ACCEPT) { ret = -EINVAL; if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) goto error_release_sock; ret = rxrpc_user_charge_accept(rx, p.call.user_call_ID); goto error_release_sock; } call = rxrpc_find_call_by_user_ID(rx, p.call.user_call_ID); if (!call) { ret = -EBADSLT; if (p.command != RXRPC_CMD_SEND_DATA) goto error_release_sock; call = rxrpc_new_client_call_for_sendmsg(rx, msg, &p); /* The socket is now unlocked... */ if (IS_ERR(call)) return PTR_ERR(call); /* ... and we have the call lock. */ p.call.nr_timeouts = 0; ret = 0; if (rxrpc_call_is_complete(call)) goto out_put_unlock; } else { switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_AWAIT_CONN: case RXRPC_CALL_SERVER_RECV_REQUEST: if (p.command == RXRPC_CMD_SEND_ABORT) break; fallthrough; case RXRPC_CALL_UNINITIALISED: case RXRPC_CALL_SERVER_PREALLOC: rxrpc_put_call(call, rxrpc_call_put_sendmsg); ret = -EBUSY; goto error_release_sock; default: break; } ret = mutex_lock_interruptible(&call->user_mutex); release_sock(&rx->sk); if (ret < 0) { ret = -ERESTARTSYS; goto error_put; } if (p.call.tx_total_len != -1) { ret = -EINVAL; if (call->tx_total_len != -1 || call->tx_pending || call->tx_top != 0) goto out_put_unlock; call->tx_total_len = p.call.tx_total_len; } } switch (p.call.nr_timeouts) { case 3: WRITE_ONCE(call->next_rx_timo, p.call.timeouts.normal); fallthrough; case 2: WRITE_ONCE(call->next_req_timo, p.call.timeouts.idle); fallthrough; case 1: if (p.call.timeouts.hard > 0) { ktime_t delay = ms_to_ktime(p.call.timeouts.hard * MSEC_PER_SEC); WRITE_ONCE(call->expect_term_by, ktime_add(p.call.timeouts.hard, ktime_get_real())); trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); rxrpc_poke_call(call, rxrpc_call_poke_set_timeout); } break; } if (rxrpc_call_is_complete(call)) { /* it's too late for this call */ ret = -ESHUTDOWN; goto out_put_unlock; } switch (p.command) { case RXRPC_CMD_SEND_ABORT: rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, rxrpc_abort_call_sendmsg); ret = 0; break; case RXRPC_CMD_SEND_DATA: ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock); break; default: ret = -EINVAL; break; } out_put_unlock: if (!dropped_lock) mutex_unlock(&call->user_mutex); error_put: rxrpc_put_call(call, rxrpc_call_put_sendmsg); _leave(" = %d", ret); return ret; error_release_sock: release_sock(&rx->sk); return ret; } /** * rxrpc_kernel_send_data - Allow a kernel service to send data on a call * @sock: The socket the call is on * @call: The call to send data through * @msg: The data to send * @len: The amount of data to send * @notify_end_tx: Notification that the last packet is queued. * * Allow a kernel service to send data on a call. The call must be in an state * appropriate to sending data. No control data should be supplied in @msg, * nor should an address be supplied. MSG_MORE should be flagged if there's * more data to come, otherwise this data will end the transmission phase. * * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, size_t len, rxrpc_notify_end_tx_t notify_end_tx) { bool dropped_lock = false; int ret; _enter("{%d},", call->debug_id); ASSERTCMP(msg->msg_name, ==, NULL); ASSERTCMP(msg->msg_control, ==, NULL); mutex_lock(&call->user_mutex); ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, notify_end_tx, &dropped_lock); if (ret == -ESHUTDOWN) ret = call->error; if (!dropped_lock) mutex_unlock(&call->user_mutex); _leave(" = %d", ret); return ret; } EXPORT_SYMBOL(rxrpc_kernel_send_data); /** * rxrpc_kernel_abort_call - Allow a kernel service to abort a call * @sock: The socket the call is on * @call: The call to be aborted * @abort_code: The abort code to stick into the ABORT packet * @error: Local error value * @why: Indication as to why. * * Allow a kernel service to abort a call if it's still in an abortable state. * * Return: %true if the call was aborted, %false if it was already complete. */ bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, u32 abort_code, int error, enum rxrpc_abort_reason why) { bool aborted; _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); mutex_lock(&call->user_mutex); aborted = rxrpc_propose_abort(call, abort_code, error, why); mutex_unlock(&call->user_mutex); return aborted; } EXPORT_SYMBOL(rxrpc_kernel_abort_call); /** * rxrpc_kernel_set_tx_length - Set the total Tx length on a call * @sock: The socket the call is on * @call: The call to be informed * @tx_total_len: The amount of data to be transmitted for this call * * Allow a kernel service to set the total transmit length on a call. This * allows buffer-to-packet encrypt-and-copy to be performed. * * This function is primarily for use for setting the reply length since the * request length can be set when beginning the call. */ void rxrpc_kernel_set_tx_length(struct socket *sock, struct rxrpc_call *call, s64 tx_total_len) { WARN_ON(call->tx_total_len != -1); call->tx_total_len = tx_total_len; } EXPORT_SYMBOL(rxrpc_kernel_set_tx_length);
209 1 123 120 119 119 67 2 119 133 129 1 2 1 122 127 120 123 123 123 122 119 119 119 119 119 119 119 120 119 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 // SPDX-License-Identifier: GPL-2.0-or-later /* Decoder for ASN.1 BER/DER/CER encoded bytestream * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/asn1_decoder.h> #include <linux/asn1_ber_bytecode.h> static const unsigned char asn1_op_lengths[ASN1_OP__NR] = { /* OPC TAG JMP ACT */ [ASN1_OP_MATCH] = 1 + 1, [ASN1_OP_MATCH_OR_SKIP] = 1 + 1, [ASN1_OP_MATCH_ACT] = 1 + 1 + 1, [ASN1_OP_MATCH_ACT_OR_SKIP] = 1 + 1 + 1, [ASN1_OP_MATCH_JUMP] = 1 + 1 + 1, [ASN1_OP_MATCH_JUMP_OR_SKIP] = 1 + 1 + 1, [ASN1_OP_MATCH_ANY] = 1, [ASN1_OP_MATCH_ANY_OR_SKIP] = 1, [ASN1_OP_MATCH_ANY_ACT] = 1 + 1, [ASN1_OP_MATCH_ANY_ACT_OR_SKIP] = 1 + 1, [ASN1_OP_COND_MATCH_OR_SKIP] = 1 + 1, [ASN1_OP_COND_MATCH_ACT_OR_SKIP] = 1 + 1 + 1, [ASN1_OP_COND_MATCH_JUMP_OR_SKIP] = 1 + 1 + 1, [ASN1_OP_COND_MATCH_ANY] = 1, [ASN1_OP_COND_MATCH_ANY_OR_SKIP] = 1, [ASN1_OP_COND_MATCH_ANY_ACT] = 1 + 1, [ASN1_OP_COND_MATCH_ANY_ACT_OR_SKIP] = 1 + 1, [ASN1_OP_COND_FAIL] = 1, [ASN1_OP_COMPLETE] = 1, [ASN1_OP_ACT] = 1 + 1, [ASN1_OP_MAYBE_ACT] = 1 + 1, [ASN1_OP_RETURN] = 1, [ASN1_OP_END_SEQ] = 1, [ASN1_OP_END_SEQ_OF] = 1 + 1, [ASN1_OP_END_SET] = 1, [ASN1_OP_END_SET_OF] = 1 + 1, [ASN1_OP_END_SEQ_ACT] = 1 + 1, [ASN1_OP_END_SEQ_OF_ACT] = 1 + 1 + 1, [ASN1_OP_END_SET_ACT] = 1 + 1, [ASN1_OP_END_SET_OF_ACT] = 1 + 1 + 1, }; /* * Find the length of an indefinite length object * @data: The data buffer * @datalen: The end of the innermost containing element in the buffer * @_dp: The data parse cursor (updated before returning) * @_len: Where to return the size of the element. * @_errmsg: Where to return a pointer to an error message on error */ static int asn1_find_indefinite_length(const unsigned char *data, size_t datalen, size_t *_dp, size_t *_len, const char **_errmsg) { unsigned char tag, tmp; size_t dp = *_dp, len, n; int indef_level = 1; next_tag: if (unlikely(datalen - dp < 2)) { if (datalen == dp) goto missing_eoc; goto data_overrun_error; } /* Extract a tag from the data */ tag = data[dp++]; if (tag == ASN1_EOC) { /* It appears to be an EOC. */ if (data[dp++] != 0) goto invalid_eoc; if (--indef_level <= 0) { *_len = dp - *_dp; *_dp = dp; return 0; } goto next_tag; } if (unlikely((tag & 0x1f) == ASN1_LONG_TAG)) { do { if (unlikely(datalen - dp < 2)) goto data_overrun_error; tmp = data[dp++]; } while (tmp & 0x80); } /* Extract the length */ len = data[dp++]; if (len <= 0x7f) goto check_length; if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ if (unlikely((tag & ASN1_CONS_BIT) == ASN1_PRIM << 5)) goto indefinite_len_primitive; indef_level++; goto next_tag; } n = len - 0x80; if (unlikely(n > sizeof(len) - 1)) goto length_too_long; if (unlikely(n > datalen - dp)) goto data_overrun_error; len = 0; for (; n > 0; n--) { len <<= 8; len |= data[dp++]; } check_length: if (len > datalen - dp) goto data_overrun_error; dp += len; goto next_tag; length_too_long: *_errmsg = "Unsupported length"; goto error; indefinite_len_primitive: *_errmsg = "Indefinite len primitive not permitted"; goto error; invalid_eoc: *_errmsg = "Invalid length EOC"; goto error; data_overrun_error: *_errmsg = "Data overrun error"; goto error; missing_eoc: *_errmsg = "Missing EOC in indefinite len cons"; error: *_dp = dp; return -1; } /** * asn1_ber_decoder - Decoder BER/DER/CER ASN.1 according to pattern * @decoder: The decoder definition (produced by asn1_compiler) * @context: The caller's context (to be passed to the action functions) * @data: The encoded data * @datalen: The size of the encoded data * * Decode BER/DER/CER encoded ASN.1 data according to a bytecode pattern * produced by asn1_compiler. Action functions are called on marked tags to * allow the caller to retrieve significant data. * * LIMITATIONS: * * To keep down the amount of stack used by this function, the following limits * have been imposed: * * (1) This won't handle datalen > 65535 without increasing the size of the * cons stack elements and length_too_long checking. * * (2) The stack of constructed types is 10 deep. If the depth of non-leaf * constructed types exceeds this, the decode will fail. * * (3) The SET type (not the SET OF type) isn't really supported as tracking * what members of the set have been seen is a pain. */ int asn1_ber_decoder(const struct asn1_decoder *decoder, void *context, const unsigned char *data, size_t datalen) { const unsigned char *machine = decoder->machine; const asn1_action_t *actions = decoder->actions; size_t machlen = decoder->machlen; enum asn1_opcode op; unsigned char tag = 0, csp = 0, jsp = 0, optag = 0, hdr = 0; const char *errmsg; size_t pc = 0, dp = 0, tdp = 0, len = 0; int ret; unsigned char flags = 0; #define FLAG_INDEFINITE_LENGTH 0x01 #define FLAG_MATCHED 0x02 #define FLAG_LAST_MATCHED 0x04 /* Last tag matched */ #define FLAG_CONS 0x20 /* Corresponds to CONS bit in the opcode tag * - ie. whether or not we are going to parse * a compound type. */ #define NR_CONS_STACK 10 unsigned short cons_dp_stack[NR_CONS_STACK]; unsigned short cons_datalen_stack[NR_CONS_STACK]; unsigned char cons_hdrlen_stack[NR_CONS_STACK]; #define NR_JUMP_STACK 10 unsigned char jump_stack[NR_JUMP_STACK]; if (datalen > 65535) return -EMSGSIZE; next_op: pr_debug("next_op: pc=\e[32m%zu\e[m/%zu dp=\e[33m%zu\e[m/%zu C=%d J=%d\n", pc, machlen, dp, datalen, csp, jsp); if (unlikely(pc >= machlen)) goto machine_overrun_error; op = machine[pc]; if (unlikely(pc + asn1_op_lengths[op] > machlen)) goto machine_overrun_error; /* If this command is meant to match a tag, then do that before * evaluating the command. */ if (op <= ASN1_OP__MATCHES_TAG) { unsigned char tmp; /* Skip conditional matches if possible */ if ((op & ASN1_OP_MATCH__COND && flags & FLAG_MATCHED) || (op & ASN1_OP_MATCH__SKIP && dp == datalen)) { flags &= ~FLAG_LAST_MATCHED; pc += asn1_op_lengths[op]; goto next_op; } flags = 0; hdr = 2; /* Extract a tag from the data */ if (unlikely(datalen - dp < 2)) goto data_overrun_error; tag = data[dp++]; if (unlikely((tag & 0x1f) == ASN1_LONG_TAG)) goto long_tag_not_supported; if (op & ASN1_OP_MATCH__ANY) { pr_debug("- any %02x\n", tag); } else { /* Extract the tag from the machine * - Either CONS or PRIM are permitted in the data if * CONS is not set in the op stream, otherwise CONS * is mandatory. */ optag = machine[pc + 1]; flags |= optag & FLAG_CONS; /* Determine whether the tag matched */ tmp = optag ^ tag; tmp &= ~(optag & ASN1_CONS_BIT); pr_debug("- match? %02x %02x %02x\n", tag, optag, tmp); if (tmp != 0) { /* All odd-numbered tags are MATCH_OR_SKIP. */ if (op & ASN1_OP_MATCH__SKIP) { pc += asn1_op_lengths[op]; dp--; goto next_op; } goto tag_mismatch; } } flags |= FLAG_MATCHED; len = data[dp++]; if (len > 0x7f) { if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ if (unlikely(!(tag & ASN1_CONS_BIT))) goto indefinite_len_primitive; flags |= FLAG_INDEFINITE_LENGTH; if (unlikely(2 > datalen - dp)) goto data_overrun_error; } else { int n = len - 0x80; if (unlikely(n > 2)) goto length_too_long; if (unlikely(n > datalen - dp)) goto data_overrun_error; hdr += n; for (len = 0; n > 0; n--) { len <<= 8; len |= data[dp++]; } if (unlikely(len > datalen - dp)) goto data_overrun_error; } } else { if (unlikely(len > datalen - dp)) goto data_overrun_error; } if (flags & FLAG_CONS) { /* For expected compound forms, we stack the positions * of the start and end of the data. */ if (unlikely(csp >= NR_CONS_STACK)) goto cons_stack_overflow; cons_dp_stack[csp] = dp; cons_hdrlen_stack[csp] = hdr; if (!(flags & FLAG_INDEFINITE_LENGTH)) { cons_datalen_stack[csp] = datalen; datalen = dp + len; } else { cons_datalen_stack[csp] = 0; } csp++; } pr_debug("- TAG: %02x %zu%s\n", tag, len, flags & FLAG_CONS ? " CONS" : ""); tdp = dp; } /* Decide how to handle the operation */ switch (op) { case ASN1_OP_MATCH: case ASN1_OP_MATCH_OR_SKIP: case ASN1_OP_MATCH_ACT: case ASN1_OP_MATCH_ACT_OR_SKIP: case ASN1_OP_MATCH_ANY: case ASN1_OP_MATCH_ANY_OR_SKIP: case ASN1_OP_MATCH_ANY_ACT: case ASN1_OP_MATCH_ANY_ACT_OR_SKIP: case ASN1_OP_COND_MATCH_OR_SKIP: case ASN1_OP_COND_MATCH_ACT_OR_SKIP: case ASN1_OP_COND_MATCH_ANY: case ASN1_OP_COND_MATCH_ANY_OR_SKIP: case ASN1_OP_COND_MATCH_ANY_ACT: case ASN1_OP_COND_MATCH_ANY_ACT_OR_SKIP: if (!(flags & FLAG_CONS)) { if (flags & FLAG_INDEFINITE_LENGTH) { size_t tmp = dp; ret = asn1_find_indefinite_length( data, datalen, &tmp, &len, &errmsg); if (ret < 0) goto error; } pr_debug("- LEAF: %zu\n", len); } if (op & ASN1_OP_MATCH__ACT) { unsigned char act; if (op & ASN1_OP_MATCH__ANY) act = machine[pc + 1]; else act = machine[pc + 2]; ret = actions[act](context, hdr, tag, data + dp, len); if (ret < 0) return ret; } if (!(flags & FLAG_CONS)) dp += len; pc += asn1_op_lengths[op]; goto next_op; case ASN1_OP_MATCH_JUMP: case ASN1_OP_MATCH_JUMP_OR_SKIP: case ASN1_OP_COND_MATCH_JUMP_OR_SKIP: pr_debug("- MATCH_JUMP\n"); if (unlikely(jsp == NR_JUMP_STACK)) goto jump_stack_overflow; jump_stack[jsp++] = pc + asn1_op_lengths[op]; pc = machine[pc + 2]; goto next_op; case ASN1_OP_COND_FAIL: if (unlikely(!(flags & FLAG_MATCHED))) goto tag_mismatch; pc += asn1_op_lengths[op]; goto next_op; case ASN1_OP_COMPLETE: if (unlikely(jsp != 0 || csp != 0)) { pr_err("ASN.1 decoder error: Stacks not empty at completion (%u, %u)\n", jsp, csp); return -EBADMSG; } return 0; case ASN1_OP_END_SET: case ASN1_OP_END_SET_ACT: if (unlikely(!(flags & FLAG_MATCHED))) goto tag_mismatch; fallthrough; case ASN1_OP_END_SEQ: case ASN1_OP_END_SET_OF: case ASN1_OP_END_SEQ_OF: case ASN1_OP_END_SEQ_ACT: case ASN1_OP_END_SET_OF_ACT: case ASN1_OP_END_SEQ_OF_ACT: if (unlikely(csp <= 0)) goto cons_stack_underflow; csp--; tdp = cons_dp_stack[csp]; hdr = cons_hdrlen_stack[csp]; len = datalen; datalen = cons_datalen_stack[csp]; pr_debug("- end cons t=%zu dp=%zu l=%zu/%zu\n", tdp, dp, len, datalen); if (datalen == 0) { /* Indefinite length - check for the EOC. */ datalen = len; if (unlikely(datalen - dp < 2)) goto data_overrun_error; if (data[dp++] != 0) { if (op & ASN1_OP_END__OF) { dp--; csp++; pc = machine[pc + 1]; pr_debug("- continue\n"); goto next_op; } goto missing_eoc; } if (data[dp++] != 0) goto invalid_eoc; len = dp - tdp - 2; } else { if (dp < len && (op & ASN1_OP_END__OF)) { datalen = len; csp++; pc = machine[pc + 1]; pr_debug("- continue\n"); goto next_op; } if (dp != len) goto cons_length_error; len -= tdp; pr_debug("- cons len l=%zu d=%zu\n", len, dp - tdp); } if (op & ASN1_OP_END__ACT) { unsigned char act; if (op & ASN1_OP_END__OF) act = machine[pc + 2]; else act = machine[pc + 1]; ret = actions[act](context, hdr, 0, data + tdp, len); if (ret < 0) return ret; } pc += asn1_op_lengths[op]; goto next_op; case ASN1_OP_MAYBE_ACT: if (!(flags & FLAG_LAST_MATCHED)) { pc += asn1_op_lengths[op]; goto next_op; } fallthrough; case ASN1_OP_ACT: ret = actions[machine[pc + 1]](context, hdr, tag, data + tdp, len); if (ret < 0) return ret; pc += asn1_op_lengths[op]; goto next_op; case ASN1_OP_RETURN: if (unlikely(jsp <= 0)) goto jump_stack_underflow; pc = jump_stack[--jsp]; flags |= FLAG_MATCHED | FLAG_LAST_MATCHED; goto next_op; default: break; } /* Shouldn't reach here */ pr_err("ASN.1 decoder error: Found reserved opcode (%u) pc=%zu\n", op, pc); return -EBADMSG; data_overrun_error: errmsg = "Data overrun error"; goto error; machine_overrun_error: errmsg = "Machine overrun error"; goto error; jump_stack_underflow: errmsg = "Jump stack underflow"; goto error; jump_stack_overflow: errmsg = "Jump stack overflow"; goto error; cons_stack_underflow: errmsg = "Cons stack underflow"; goto error; cons_stack_overflow: errmsg = "Cons stack overflow"; goto error; cons_length_error: errmsg = "Cons length error"; goto error; missing_eoc: errmsg = "Missing EOC in indefinite len cons"; goto error; invalid_eoc: errmsg = "Invalid length EOC"; goto error; length_too_long: errmsg = "Unsupported length"; goto error; indefinite_len_primitive: errmsg = "Indefinite len primitive not permitted"; goto error; tag_mismatch: errmsg = "Unexpected tag"; goto error; long_tag_not_supported: errmsg = "Long tag not supported"; error: pr_debug("\nASN1: %s [m=%zu d=%zu ot=%02x t=%02x l=%zu]\n", errmsg, pc, dp, optag, tag, len); return -EBADMSG; } EXPORT_SYMBOL_GPL(asn1_ber_decoder); MODULE_DESCRIPTION("Decoder for ASN.1 BER/DER/CER encoded bytestream"); MODULE_LICENSE("GPL");
29 91 97 5924 2452 3444 171 1237 3553 75 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FILELOCK_H #define _LINUX_FILELOCK_H #include <linux/fs.h> #define FL_POSIX 1 #define FL_FLOCK 2 #define FL_DELEG 4 /* NFSv4 delegation */ #define FL_ACCESS 8 /* not trying to lock, just looking */ #define FL_EXISTS 16 /* when unlocking, test for existence */ #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ #define FL_RECLAIM 4096 /* reclaiming from a reboot server */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking. */ #define FILE_LOCK_DEFERRED 1 struct file_lock; struct file_lease; struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); }; struct lock_manager_operations { void *lm_mod_owner; fl_owner_t (*lm_get_owner)(fl_owner_t); void (*lm_put_owner)(fl_owner_t); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, int); bool (*lm_lock_expirable)(struct file_lock *cfl); void (*lm_expire_lock)(void); }; struct lease_manager_operations { bool (*lm_break)(struct file_lease *); int (*lm_change)(struct file_lease *, int, struct list_head *); void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); }; struct lock_manager { struct list_head list; /* * NFSv4 and up also want opens blocked during the grace period; * NLM doesn't care: */ bool block_opens; }; struct net; void locks_start_grace(struct net *, struct lock_manager *); void locks_end_grace(struct lock_manager *); bool locks_in_grace(struct net *); bool opens_in_grace(struct net *); /* * struct file_lock has a union that some filesystems use to track * their own private info. The NFS side of things is defined here: */ #include <linux/nfs_fs_i.h> /* * struct file_lock represents a generic "file lock". It's used to represent * POSIX byte range locks, BSD (flock) locks, and leases. It's important to * note that the same struct is used to represent both a request for a lock and * the lock itself, but the same object is never used for both. * * FIXME: should we create a separate "struct lock_request" to help distinguish * these two uses? * * The varous i_flctx lists are ordered by: * * 1) lock owner * 2) lock range start * 3) lock range end * * Obviously, the last two criteria only matter for POSIX locks. */ struct file_lock_core { struct file_lock_core *flc_blocker; /* The lock that is blocking us */ struct list_head flc_list; /* link into file_lock_context */ struct hlist_node flc_link; /* node in global lists */ struct list_head flc_blocked_requests; /* list of requests with * ->fl_blocker pointing here */ struct list_head flc_blocked_member; /* node in * ->fl_blocker->fl_blocked_requests */ fl_owner_t flc_owner; unsigned int flc_flags; unsigned char flc_type; pid_t flc_pid; int flc_link_cpu; /* what cpu's list is this on? */ wait_queue_head_t flc_wait; struct file *flc_file; }; struct file_lock { struct file_lock_core c; loff_t fl_start; loff_t fl_end; const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ union { struct nfs_lock_info nfs_fl; struct nfs4_lock_info nfs4_fl; struct { struct list_head link; /* link in AFS vnode's pending_locks list */ int state; /* state of grant or error if -ve */ unsigned int debug_id; } afs; struct { struct inode *inode; } ceph; } fl_u; } __randomize_layout; struct file_lease { struct file_lock_core c; struct fasync_struct * fl_fasync; /* for lease break notifications */ /* for lease breaks: */ unsigned long fl_break_time; unsigned long fl_downgrade_time; const struct lease_manager_operations *fl_lmops; /* Callbacks for lease managers */ } __randomize_layout; struct file_lock_context { spinlock_t flc_lock; struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; }; #ifdef CONFIG_FILE_LOCKING int fcntl_getlk(struct file *, unsigned int, struct flock *); int fcntl_setlk(unsigned int, struct file *, unsigned int, struct flock *); #if BITS_PER_LONG == 32 int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); int fcntl_setlk64(unsigned int, struct file *, unsigned int, struct flock64 *); #endif int fcntl_setlease(unsigned int fd, struct file *filp, int arg); int fcntl_getlease(struct file *filp); int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg); int fcntl_getdeleg(struct file *filp, struct delegation *deleg); static inline bool lock_is_unlock(struct file_lock *fl) { return fl->c.flc_type == F_UNLCK; } static inline bool lock_is_read(struct file_lock *fl) { return fl->c.flc_type == F_RDLCK; } static inline bool lock_is_write(struct file_lock *fl) { return fl->c.flc_type == F_WRLCK; } static inline void locks_wake_up_waiter(struct file_lock_core *flc) { wake_up(&flc->flc_wait); } static inline void locks_wake_up(struct file_lock *fl) { locks_wake_up_waiter(&fl->c); } static inline bool locks_can_async_lock(const struct file_operations *fops) { return !fops->lock || fops->fop_flags & FOP_ASYNC_LOCK; } /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); void locks_init_lock(struct file_lock *); struct file_lock *locks_alloc_lock(void); void locks_copy_lock(struct file_lock *, struct file_lock *); void locks_copy_conflock(struct file_lock *, struct file_lock *); void locks_remove_posix(struct file *, fl_owner_t); void locks_remove_file(struct file *); void locks_release_private(struct file_lock *); void posix_test_lock(struct file *, struct file_lock *); int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); int locks_delete_block(struct file_lock *); int vfs_test_lock(struct file *, struct file_lock *); int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); int vfs_cancel_lock(struct file *filp, struct file_lock *fl); bool vfs_inode_has_locks(struct inode *inode); int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); void locks_init_lease(struct file_lease *); void locks_free_lease(struct file_lease *fl); struct file_lease *locks_alloc_lease(void); #define LEASE_BREAK_LEASE BIT(0) // break leases and delegations #define LEASE_BREAK_DELEG BIT(1) // break delegations only #define LEASE_BREAK_LAYOUT BIT(2) // break layouts only #define LEASE_BREAK_NONBLOCK BIT(3) // non-blocking break #define LEASE_BREAK_OPEN_RDONLY BIT(4) // readonly open event int __break_lease(struct inode *inode, unsigned int flags); void lease_get_mtime(struct inode *, struct timespec64 *time); int generic_setlease(struct file *, int, struct file_lease **, void **priv); int kernel_setlease(struct file *, int, struct file_lease **, void **); int vfs_setlease(struct file *, int, struct file_lease **, void **); int lease_modify(struct file_lease *, int, struct list_head *); struct notifier_block; int lease_register_notifier(struct notifier_block *); void lease_unregister_notifier(struct notifier_block *); struct files_struct; void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner); static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { return smp_load_acquire(&inode->i_flctx); } #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) { return -EINVAL; } static inline int fcntl_setlk(unsigned int fd, struct file *file, unsigned int cmd, struct flock __user *user) { return -EACCES; } #if BITS_PER_LONG == 32 static inline int fcntl_getlk64(struct file *file, unsigned int cmd, struct flock64 *user) { return -EINVAL; } static inline int fcntl_setlk64(unsigned int fd, struct file *file, unsigned int cmd, struct flock64 *user) { return -EACCES; } #endif static inline int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { return -EINVAL; } static inline int fcntl_getlease(struct file *filp) { return F_UNLCK; } static inline int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg) { return -EINVAL; } static inline int fcntl_getdeleg(struct file *filp, struct delegation *deleg) { return -EINVAL; } static inline bool lock_is_unlock(struct file_lock *fl) { return false; } static inline bool lock_is_read(struct file_lock *fl) { return false; } static inline bool lock_is_write(struct file_lock *fl) { return false; } static inline void locks_wake_up(struct file_lock *fl) { } static inline void locks_free_lock_context(struct inode *inode) { } static inline void locks_init_lock(struct file_lock *fl) { return; } static inline void locks_init_lease(struct file_lease *fl) { return; } static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) { return; } static inline void locks_remove_file(struct file *filp) { return; } static inline void posix_test_lock(struct file *filp, struct file_lock *fl) { return; } static inline int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { return -ENOLCK; } static inline int locks_delete_block(struct file_lock *waiter) { return -ENOENT; } static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { return -ENOLCK; } static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline bool vfs_inode_has_locks(struct inode *inode) { return false; } static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { return -ENOLCK; } static inline int __break_lease(struct inode *inode, unsigned int flags) { return 0; } static inline void lease_get_mtime(struct inode *inode, struct timespec64 *time) { return; } static inline int generic_setlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { return -EINVAL; } static inline int kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { return -EINVAL; } static inline int vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { return -EINVAL; } static inline int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) { return -EINVAL; } struct files_struct; static inline void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) {} static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner) { return false; } static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { return NULL; } #endif /* !CONFIG_FILE_LOCKING */ /* for walking lists of file_locks linked by fl_list */ #define for_each_file_lock(_fl, _head) list_for_each_entry(_fl, _head, c.flc_list) static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) { return locks_lock_inode_wait(file_inode(filp), fl); } #ifdef CONFIG_FILE_LOCKING static inline unsigned int openmode_to_lease_flags(unsigned int mode) { unsigned int flags = 0; if ((mode & O_ACCMODE) == O_RDONLY) flags |= LEASE_BREAK_OPEN_RDONLY; if (mode & O_NONBLOCK) flags |= LEASE_BREAK_NONBLOCK; return flags; } static inline int break_lease(struct inode *inode, unsigned int mode) { struct file_lock_context *flctx; /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ flctx = READ_ONCE(inode->i_flctx); if (!flctx) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) return __break_lease(inode, LEASE_BREAK_LEASE | openmode_to_lease_flags(mode)); return 0; } static inline int break_deleg(struct inode *inode, unsigned int flags) { struct file_lock_context *flctx; /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ flctx = READ_ONCE(inode->i_flctx); if (!flctx) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) { flags |= LEASE_BREAK_DELEG; return __break_lease(inode, flags); } return 0; } struct delegated_inode { struct inode *di_inode; }; static inline bool is_delegated(struct delegated_inode *di) { return di->di_inode; } static inline int try_break_deleg(struct inode *inode, struct delegated_inode *di) { int ret; ret = break_deleg(inode, LEASE_BREAK_NONBLOCK); if (ret == -EWOULDBLOCK && di) { di->di_inode = inode; ihold(inode); } return ret; } static inline int break_deleg_wait(struct delegated_inode *di) { int ret; ret = break_deleg(di->di_inode, 0); iput(di->di_inode); di->di_inode = NULL; return ret; } static inline int break_layout(struct inode *inode, bool wait) { smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) { unsigned int flags = LEASE_BREAK_LAYOUT; if (!wait) flags |= LEASE_BREAK_NONBLOCK; return __break_lease(inode, flags); } return 0; } #else /* !CONFIG_FILE_LOCKING */ struct delegated_inode { }; static inline bool is_delegated(struct delegated_inode *di) { return false; } static inline int break_lease(struct inode *inode, bool wait) { return 0; } static inline int break_deleg(struct inode *inode, unsigned int flags) { return 0; } static inline int try_break_deleg(struct inode *inode, struct delegated_inode *delegated_inode) { return 0; } static inline int break_deleg_wait(struct delegated_inode *delegated_inode) { BUG(); return 0; } static inline int break_layout(struct inode *inode, bool wait) { return 0; } #endif /* CONFIG_FILE_LOCKING */ #endif /* _LINUX_FILELOCK_H */
26 7 47 25 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_GSO_H #define _NET_GSO_H #include <linux/skbuff.h> /* Keeps track of mac header offset relative to skb->head. * It is useful for TSO of Tunneling protocol. e.g. GRE. * For non-tunnel skb it points to skb_mac_header() and for * tunnel skb it points to outer mac header. * Keeps track of level of encapsulation of network headers. */ struct skb_gso_cb { union { int mac_offset; int data_offset; }; int encap_level; __wsum csum; __u16 csum_start; }; #define SKB_GSO_CB_OFFSET 32 #define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET)) static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) { return (skb_mac_header(inner_skb) - inner_skb->head) - SKB_GSO_CB(inner_skb)->mac_offset; } static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) { int new_headroom, headroom; int ret; headroom = skb_headroom(skb); ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC); if (ret) return ret; new_headroom = skb_headroom(skb); SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom); return 0; } static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res) { /* Do not update partial checksums if remote checksum is enabled. */ if (skb->remcsum_offload) return; SKB_GSO_CB(skb)->csum = res; SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head; } /* Compute the checksum for a gso segment. First compute the checksum value * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and * then add in skb->csum (checksum from csum_start to end of packet). * skb->csum and csum_start are then updated to reflect the checksum of the * resultant packet starting from the transport header-- the resultant checksum * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo * header. */ static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) { unsigned char *csum_start = skb_transport_header(skb); int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start; __wsum partial = SKB_GSO_CB(skb)->csum; SKB_GSO_CB(skb)->csum = res; SKB_GSO_CB(skb)->csum_start = csum_start - skb->head; return csum_fold(csum_partial(csum_start, plen, partial)); } struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { return __skb_gso_segment(skb, features, true); } struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, netdev_features_t features, __be16 type); struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) { skb->protocol = protocol; skb->encapsulation = 1; skb_push(skb, pulled_hlen); skb_reset_transport_header(skb); skb->mac_header = mac_offset; skb->network_header = skb->mac_header + mac_len; skb->mac_len = mac_len; } #endif /* _NET_GSO_H */
15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ #include <linux/types.h> #include <linux/bpf.h> #include <linux/bpf_local_storage.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> DEFINE_BPF_STORAGE_CACHE(cgroup_cache); static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); static void bpf_cgrp_storage_lock(void) { cant_migrate(); this_cpu_inc(bpf_cgrp_storage_busy); } static void bpf_cgrp_storage_unlock(void) { this_cpu_dec(bpf_cgrp_storage_busy); } static bool bpf_cgrp_storage_trylock(void) { cant_migrate(); if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { this_cpu_dec(bpf_cgrp_storage_busy); return false; } return true; } static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) { struct cgroup *cg = owner; return &cg->bpf_cgrp_storage; } void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; rcu_read_lock_dont_migrate(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); if (!local_storage) goto out; bpf_cgrp_storage_lock(); bpf_local_storage_destroy(local_storage); bpf_cgrp_storage_unlock(); out: rcu_read_unlock_migrate(); } static struct bpf_local_storage_data * cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *cgroup_storage; struct bpf_local_storage_map *smap; cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage, bpf_rcu_lock_held()); if (!cgroup_storage) return NULL; smap = (struct bpf_local_storage_map *)map; return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit); } static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; int fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return ERR_CAST(cgroup); bpf_cgrp_storage_lock(); sdata = cgroup_storage_lookup(cgroup, map, true); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return sdata ? sdata->data : NULL; } static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; int fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, map_flags, false, GFP_ATOMIC); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); } static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) { struct bpf_local_storage_data *sdata; sdata = cgroup_storage_lookup(cgroup, map, false); if (!sdata) return -ENOENT; bpf_selem_unlink(SELEM(sdata), false); return 0; } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) { struct cgroup *cgroup; int err, fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); bpf_cgrp_storage_lock(); err = cgroup_storage_delete(cgroup, map); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return err; } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { return bpf_local_storage_map_alloc(attr, &cgroup_cache, true); } static void cgroup_storage_map_free(struct bpf_map *map) { bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy); } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; bool nobusy; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; if (!cgroup) return (unsigned long)NULL; nobusy = bpf_cgrp_storage_trylock(); sdata = cgroup_storage_lookup(cgroup, map, nobusy); if (sdata) goto unlock; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); unlock: if (nobusy) bpf_cgrp_storage_unlock(); return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) { int ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!cgroup) return -EINVAL; if (!bpf_cgrp_storage_trylock()) return -EBUSY; ret = cgroup_storage_delete(cgroup, map); bpf_cgrp_storage_unlock(); return ret; } const struct bpf_map_ops cgrp_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_cgrp_storage_lookup_elem, .map_update_elem = bpf_cgrp_storage_update_elem, .map_delete_elem = bpf_cgrp_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, .map_mem_usage = bpf_local_storage_map_mem_usage, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = cgroup_storage_ptr, }; const struct bpf_func_proto bpf_cgrp_storage_get_proto = { .func = bpf_cgrp_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_cgrp_storage_delete_proto = { .func = bpf_cgrp_storage_delete, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], };
17 15 3 1 2 20 2 14 1 3 14 2 1 6 6 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 // SPDX-License-Identifier: GPL-2.0-only #include <net/ip.h> #include <net/tcp.h> #include <net/netfilter/nf_tables.h> #include <linux/netfilter/nfnetlink_osf.h> struct nft_osf { u8 dreg; u8 ttl; u32 flags; }; static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = { [NFTA_OSF_DREG] = { .type = NLA_U32 }, [NFTA_OSF_TTL] = { .type = NLA_U8 }, [NFTA_OSF_FLAGS] = { .type = NLA_U32 }, }; static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_osf *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; struct sk_buff *skb = pkt->skb; char os_match[NFT_OSF_MAXGENRELEN]; const struct tcphdr *tcp; struct nf_osf_data data; struct tcphdr _tcph; if (pkt->tprot != IPPROTO_TCP) { regs->verdict.code = NFT_BREAK; return; } tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); if (!tcp) { regs->verdict.code = NFT_BREAK; return; } if (!tcp->syn) { regs->verdict.code = NFT_BREAK; return; } if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) { strscpy_pad((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); } else { if (priv->flags & NFT_OSF_F_VERSION) snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s", data.genre, data.version); else strscpy(os_match, data.genre, NFT_OSF_MAXGENRELEN); strscpy_pad((char *)dest, os_match, NFT_OSF_MAXGENRELEN); } } static int nft_osf_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_osf *priv = nft_expr_priv(expr); u32 flags; u8 ttl; if (!tb[NFTA_OSF_DREG]) return -EINVAL; if (tb[NFTA_OSF_TTL]) { ttl = nla_get_u8(tb[NFTA_OSF_TTL]); if (ttl > 2) return -EINVAL; priv->ttl = ttl; } if (tb[NFTA_OSF_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_OSF_FLAGS])); if (flags != NFT_OSF_F_VERSION) return -EINVAL; priv->flags = flags; } return nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); } static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_osf *priv = nft_expr_priv(expr); if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl)) goto nla_put_failure; if (nla_put_u32(skb, NFTA_OSF_FLAGS, ntohl((__force __be32)priv->flags))) goto nla_put_failure; if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_osf_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { unsigned int hooks; switch (ctx->family) { case NFPROTO_IPV4: case NFPROTO_IPV6: case NFPROTO_INET: hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_FORWARD); break; default: return -EOPNOTSUPP; } return nft_chain_validate_hooks(ctx->chain, hooks); } static bool nft_osf_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { struct nft_osf *priv = nft_expr_priv(expr); struct nft_osf *osf; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN); return false; } osf = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->flags != osf->flags || priv->ttl != osf->ttl) { nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN); return false; } if (!track->regs[priv->dreg].bitwise) return true; return false; } static struct nft_expr_type nft_osf_type; static const struct nft_expr_ops nft_osf_op = { .eval = nft_osf_eval, .size = NFT_EXPR_SIZE(sizeof(struct nft_osf)), .init = nft_osf_init, .dump = nft_osf_dump, .type = &nft_osf_type, .validate = nft_osf_validate, .reduce = nft_osf_reduce, }; static struct nft_expr_type nft_osf_type __read_mostly = { .ops = &nft_osf_op, .name = "osf", .owner = THIS_MODULE, .policy = nft_osf_policy, .maxattr = NFTA_OSF_MAX, }; static int __init nft_osf_module_init(void) { return nft_register_expr(&nft_osf_type); } static void __exit nft_osf_module_exit(void) { return nft_unregister_expr(&nft_osf_type); } module_init(nft_osf_module_init); module_exit(nft_osf_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>"); MODULE_ALIAS_NFT_EXPR("osf"); MODULE_DESCRIPTION("nftables passive OS fingerprint support");
1519 219 219 590 1098 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 #undef TRACE_SYSTEM #define TRACE_SYSTEM neigh #if !defined(_TRACE_NEIGH_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NEIGH_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #include <net/neighbour.h> #define neigh_state_str(state) \ __print_symbolic(state, \ { NUD_INCOMPLETE, "incomplete" }, \ { NUD_REACHABLE, "reachable" }, \ { NUD_STALE, "stale" }, \ { NUD_DELAY, "delay" }, \ { NUD_PROBE, "probe" }, \ { NUD_FAILED, "failed" }, \ { NUD_NOARP, "noarp" }, \ { NUD_PERMANENT, "permanent"}) TRACE_EVENT(neigh_create, TP_PROTO(struct neigh_table *tbl, struct net_device *dev, const void *pkey, const struct neighbour *n, bool exempt_from_gc), TP_ARGS(tbl, dev, pkey, n, exempt_from_gc), TP_STRUCT__entry( __field(u32, family) __string(dev, dev ? dev->name : "NULL") __field(int, entries) __field(u8, created) __field(u8, gc_exempt) __array(u8, primary_key4, 4) __array(u8, primary_key6, 16) ), TP_fast_assign( __be32 *p32; __entry->family = tbl->family; __assign_str(dev); __entry->entries = atomic_read(&tbl->gc_entries); __entry->created = n != NULL; __entry->gc_exempt = exempt_from_gc; p32 = (__be32 *)__entry->primary_key4; if (tbl->family == AF_INET) *p32 = *(__be32 *)pkey; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (tbl->family == AF_INET6) { struct in6_addr *pin6; pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)pkey; } #endif ), TP_printk("family %d dev %s entries %d primary_key4 %pI4 primary_key6 %pI6c created %d gc_exempt %d", __entry->family, __get_str(dev), __entry->entries, __entry->primary_key4, __entry->primary_key6, __entry->created, __entry->gc_exempt) ); TRACE_EVENT(neigh_update, TP_PROTO(struct neighbour *n, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid), TP_ARGS(n, lladdr, new, flags, nlmsg_pid), TP_STRUCT__entry( __field(u32, family) __string(dev, (n->dev ? n->dev->name : "NULL")) __array(u8, lladdr, MAX_ADDR_LEN) __field(u8, lladdr_len) __field(u8, flags) __field(u8, nud_state) __field(u8, type) __field(u8, dead) __field(int, refcnt) __array(__u8, primary_key4, 4) __array(__u8, primary_key6, 16) __field(unsigned long, confirmed) __field(unsigned long, updated) __field(unsigned long, used) __array(u8, new_lladdr, MAX_ADDR_LEN) __field(u8, new_state) __field(u32, update_flags) __field(u32, pid) ), TP_fast_assign( int lladdr_len = (n->dev ? n->dev->addr_len : MAX_ADDR_LEN); struct in6_addr *pin6; __be32 *p32; __entry->family = n->tbl->family; __assign_str(dev); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; __entry->nud_state = n->nud_state; __entry->type = n->type; __entry->dead = n->dead; __entry->refcnt = refcount_read(&n->refcnt); pin6 = (struct in6_addr *)__entry->primary_key6; p32 = (__be32 *)__entry->primary_key4; if (n->tbl->family == AF_INET) *p32 = *(__be32 *)n->primary_key; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (n->tbl->family == AF_INET6) { pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)n->primary_key; } else #endif { ipv6_addr_set_v4mapped(*p32, pin6); } __entry->confirmed = n->confirmed; __entry->updated = n->updated; __entry->used = n->used; if (lladdr) memcpy(__entry->new_lladdr, lladdr, lladdr_len); __entry->new_state = new; __entry->update_flags = flags; __entry->pid = nlmsg_pid; ), TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x " "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c " "confirmed %lu updated %lu used %lu new_lladdr %s " "new_state %s update_flags %02x pid %d", __entry->family, __get_str(dev), __print_hex_str(__entry->lladdr, __entry->lladdr_len), __entry->flags, neigh_state_str(__entry->nud_state), __entry->type, __entry->dead, __entry->refcnt, __entry->primary_key4, __entry->primary_key6, __entry->confirmed, __entry->updated, __entry->used, __print_hex_str(__entry->new_lladdr, __entry->lladdr_len), neigh_state_str(__entry->new_state), __entry->update_flags, __entry->pid) ); DECLARE_EVENT_CLASS(neigh__update, TP_PROTO(struct neighbour *n, int err), TP_ARGS(n, err), TP_STRUCT__entry( __field(u32, family) __string(dev, (n->dev ? n->dev->name : "NULL")) __array(u8, lladdr, MAX_ADDR_LEN) __field(u8, lladdr_len) __field(u8, flags) __field(u8, nud_state) __field(u8, type) __field(u8, dead) __field(int, refcnt) __array(__u8, primary_key4, 4) __array(__u8, primary_key6, 16) __field(unsigned long, confirmed) __field(unsigned long, updated) __field(unsigned long, used) __field(u32, err) ), TP_fast_assign( int lladdr_len = (n->dev ? n->dev->addr_len : MAX_ADDR_LEN); struct in6_addr *pin6; __be32 *p32; __entry->family = n->tbl->family; __assign_str(dev); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; __entry->nud_state = n->nud_state; __entry->type = n->type; __entry->dead = n->dead; __entry->refcnt = refcount_read(&n->refcnt); pin6 = (struct in6_addr *)__entry->primary_key6; p32 = (__be32 *)__entry->primary_key4; if (n->tbl->family == AF_INET) *p32 = *(__be32 *)n->primary_key; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (n->tbl->family == AF_INET6) { pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)n->primary_key; } else #endif { ipv6_addr_set_v4mapped(*p32, pin6); } __entry->confirmed = n->confirmed; __entry->updated = n->updated; __entry->used = n->used; __entry->err = err; ), TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x " "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c " "confirmed %lu updated %lu used %lu err %d", __entry->family, __get_str(dev), __print_hex_str(__entry->lladdr, __entry->lladdr_len), __entry->flags, neigh_state_str(__entry->nud_state), __entry->type, __entry->dead, __entry->refcnt, __entry->primary_key4, __entry->primary_key6, __entry->confirmed, __entry->updated, __entry->used, __entry->err) ); DEFINE_EVENT(neigh__update, neigh_update_done, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_timer_handler, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_event_send_done, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_event_send_dead, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_cleanup_and_release, TP_PROTO(struct neighbour *neigh, int rc), TP_ARGS(neigh, rc) ); #endif /* _TRACE_NEIGH_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
4 4 58 4 26 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/slab.h> #include <linux/regset.h> static int __regset_get(struct task_struct *target, const struct user_regset *regset, unsigned int size, void **data) { void *p = *data, *to_free = NULL; int res; if (!regset->regset_get) return -EOPNOTSUPP; if (size > regset->n * regset->size) size = regset->n * regset->size; if (!p) { to_free = p = kvzalloc(size, GFP_KERNEL); if (!p) return -ENOMEM; } res = regset->regset_get(target, regset, (struct membuf){.p = p, .left = size}); if (res < 0) { kvfree(to_free); return res; } *data = p; return size - res; } int regset_get(struct task_struct *target, const struct user_regset *regset, unsigned int size, void *data) { return __regset_get(target, regset, size, &data); } EXPORT_SYMBOL(regset_get); int regset_get_alloc(struct task_struct *target, const struct user_regset *regset, unsigned int size, void **data) { *data = NULL; return __regset_get(target, regset, size, data); } EXPORT_SYMBOL(regset_get_alloc); /** * copy_regset_to_user - fetch a thread's user_regset data into user memory * @target: thread to be examined * @view: &struct user_regset_view describing user thread machine state * @setno: index in @view->regsets * @offset: offset into the regset data, in bytes * @size: amount of data to copy, in bytes * @data: user-mode pointer to copy into */ int copy_regset_to_user(struct task_struct *target, const struct user_regset_view *view, unsigned int setno, unsigned int offset, unsigned int size, void __user *data) { const struct user_regset *regset = &view->regsets[setno]; void *buf; int ret; ret = regset_get_alloc(target, regset, size, &buf); if (ret > 0) ret = copy_to_user(data, buf, ret) ? -EFAULT : 0; kvfree(buf); return ret; }
3775 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 // SPDX-License-Identifier: GPL-2.0-only /* * drm_sysfs.c - Modifications to drm_sysfs_class.c to support * extra sysfs attribute from DRM. Normal drm_sysfs_class * does not allow adding attributes. * * Copyright (c) 2004 Jon Smirl <jonsmirl@gmail.com> * Copyright (c) 2003-2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (c) 2003-2004 IBM Corp. */ #include <linux/acpi.h> #include <linux/component.h> #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/i2c.h> #include <linux/kdev_t.h> #include <linux/pci.h> #include <linux/property.h> #include <linux/slab.h> #include <drm/drm_accel.h> #include <drm/drm_connector.h> #include <drm/drm_device.h> #include <drm/drm_file.h> #include <drm/drm_modes.h> #include <drm/drm_print.h> #include <drm/drm_property.h> #include <drm/drm_sysfs.h> #include <asm/video.h> #include "drm_internal.h" #include "drm_crtc_internal.h" #define to_drm_minor(d) dev_get_drvdata(d) #define to_drm_connector(d) dev_get_drvdata(d) /** * DOC: overview * * DRM provides very little additional support to drivers for sysfs * interactions, beyond just all the standard stuff. Drivers who want to expose * additional sysfs properties and property groups can attach them at either * &drm_device.dev or &drm_connector.kdev. * * Registration is automatically handled when calling drm_dev_register(), or * drm_connector_register() in case of hot-plugged connectors. Unregistration is * also automatically handled by drm_dev_unregister() and * drm_connector_unregister(). */ static struct device_type drm_sysfs_device_minor = { .name = "drm_minor" }; static struct device_type drm_sysfs_device_connector = { .name = "drm_connector", }; struct class *drm_class; #ifdef CONFIG_ACPI static bool drm_connector_acpi_bus_match(struct device *dev) { return dev->type == &drm_sysfs_device_connector; } static struct acpi_device *drm_connector_acpi_find_companion(struct device *dev) { struct drm_connector *connector = to_drm_connector(dev); return to_acpi_device_node(connector->fwnode); } static struct acpi_bus_type drm_connector_acpi_bus = { .name = "drm_connector", .match = drm_connector_acpi_bus_match, .find_companion = drm_connector_acpi_find_companion, }; static void drm_sysfs_acpi_register(void) { register_acpi_bus_type(&drm_connector_acpi_bus); } static void drm_sysfs_acpi_unregister(void) { unregister_acpi_bus_type(&drm_connector_acpi_bus); } #else static void drm_sysfs_acpi_register(void) { } static void drm_sysfs_acpi_unregister(void) { } #endif static char *drm_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "dri/%s", dev_name(dev)); } static int typec_connector_bind(struct device *dev, struct device *typec_connector, void *data) { int ret; ret = sysfs_create_link(&dev->kobj, &typec_connector->kobj, "typec_connector"); if (ret) return ret; ret = sysfs_create_link(&typec_connector->kobj, &dev->kobj, "drm_connector"); if (ret) sysfs_remove_link(&dev->kobj, "typec_connector"); return ret; } static void typec_connector_unbind(struct device *dev, struct device *typec_connector, void *data) { sysfs_remove_link(&typec_connector->kobj, "drm_connector"); sysfs_remove_link(&dev->kobj, "typec_connector"); } static const struct component_ops typec_connector_ops = { .bind = typec_connector_bind, .unbind = typec_connector_unbind, }; static CLASS_ATTR_STRING(version, S_IRUGO, "drm 1.1.0 20060810"); /** * drm_sysfs_init - initialize sysfs helpers * * This is used to create the DRM class, which is the implicit parent of any * other top-level DRM sysfs objects. * * You must call drm_sysfs_destroy() to release the allocated resources. * * Return: 0 on success, negative error code on failure. */ int drm_sysfs_init(void) { int err; drm_class = class_create("drm"); if (IS_ERR(drm_class)) return PTR_ERR(drm_class); err = class_create_file(drm_class, &class_attr_version.attr); if (err) { class_destroy(drm_class); drm_class = NULL; return err; } drm_class->devnode = drm_devnode; drm_sysfs_acpi_register(); return 0; } /** * drm_sysfs_destroy - destroys DRM class * * Destroy the DRM device class. */ void drm_sysfs_destroy(void) { if (IS_ERR_OR_NULL(drm_class)) return; drm_sysfs_acpi_unregister(); class_remove_file(drm_class, &class_attr_version.attr); class_destroy(drm_class); drm_class = NULL; } static void drm_sysfs_release(struct device *dev) { kfree(dev); } /* * Connector properties */ static ssize_t status_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { struct drm_connector *connector = to_drm_connector(device); struct drm_device *dev = connector->dev; enum drm_connector_force old_force; int ret; ret = mutex_lock_interruptible(&dev->mode_config.mutex); if (ret) return ret; old_force = connector->force; if (sysfs_streq(buf, "detect")) connector->force = 0; else if (sysfs_streq(buf, "on")) connector->force = DRM_FORCE_ON; else if (sysfs_streq(buf, "on-digital")) connector->force = DRM_FORCE_ON_DIGITAL; else if (sysfs_streq(buf, "off")) connector->force = DRM_FORCE_OFF; else ret = -EINVAL; if (old_force != connector->force || !connector->force) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] force updated from %d to %d or reprobing\n", connector->base.id, connector->name, old_force, connector->force); connector->funcs->fill_modes(connector, dev->mode_config.max_width, dev->mode_config.max_height); } mutex_unlock(&dev->mode_config.mutex); return ret ? ret : count; } static ssize_t status_show(struct device *device, struct device_attribute *attr, char *buf) { struct drm_connector *connector = to_drm_connector(device); enum drm_connector_status status; status = READ_ONCE(connector->status); return sysfs_emit(buf, "%s\n", drm_get_connector_status_name(status)); } static ssize_t dpms_show(struct device *device, struct device_attribute *attr, char *buf) { struct drm_connector *connector = to_drm_connector(device); int dpms; dpms = READ_ONCE(connector->dpms); return sysfs_emit(buf, "%s\n", drm_get_dpms_name(dpms)); } static ssize_t enabled_show(struct device *device, struct device_attribute *attr, char *buf) { struct drm_connector *connector = to_drm_connector(device); bool enabled; enabled = READ_ONCE(connector->encoder); return sysfs_emit(buf, enabled ? "enabled\n" : "disabled\n"); } static ssize_t edid_show(struct file *filp, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *connector_dev = kobj_to_dev(kobj); struct drm_connector *connector = to_drm_connector(connector_dev); ssize_t ret; ret = drm_edid_connector_property_show(connector, buf, off, count); return ret; } static ssize_t modes_show(struct device *device, struct device_attribute *attr, char *buf) { struct drm_connector *connector = to_drm_connector(device); struct drm_display_mode *mode; int written = 0; mutex_lock(&connector->dev->mode_config.mutex); list_for_each_entry(mode, &connector->modes, head) { written += scnprintf(buf + written, PAGE_SIZE - written, "%s\n", mode->name); } mutex_unlock(&connector->dev->mode_config.mutex); return written; } static ssize_t connector_id_show(struct device *device, struct device_attribute *attr, char *buf) { struct drm_connector *connector = to_drm_connector(device); return sysfs_emit(buf, "%d\n", connector->base.id); } static DEVICE_ATTR_RW(status); static DEVICE_ATTR_RO(enabled); static DEVICE_ATTR_RO(dpms); static DEVICE_ATTR_RO(modes); static DEVICE_ATTR_RO(connector_id); static struct attribute *connector_dev_attrs[] = { &dev_attr_status.attr, &dev_attr_enabled.attr, &dev_attr_dpms.attr, &dev_attr_modes.attr, &dev_attr_connector_id.attr, NULL }; static const struct bin_attribute edid_attr = { .attr.name = "edid", .attr.mode = 0444, .size = 0, .read = edid_show, }; static const struct bin_attribute *const connector_bin_attrs[] = { &edid_attr, NULL }; static const struct attribute_group connector_dev_group = { .attrs = connector_dev_attrs, .bin_attrs = connector_bin_attrs, }; static const struct attribute_group *connector_dev_groups[] = { &connector_dev_group, NULL }; int drm_sysfs_connector_add(struct drm_connector *connector) { struct drm_device *dev = connector->dev; struct device *kdev; int r; if (connector->kdev) return 0; kdev = kzalloc(sizeof(*kdev), GFP_KERNEL); if (!kdev) return -ENOMEM; device_initialize(kdev); kdev->class = drm_class; kdev->type = &drm_sysfs_device_connector; kdev->parent = dev->primary->kdev; kdev->groups = connector_dev_groups; kdev->release = drm_sysfs_release; dev_set_drvdata(kdev, connector); r = dev_set_name(kdev, "card%d-%s", dev->primary->index, connector->name); if (r) goto err_free; drm_dbg_kms(dev, "[CONNECTOR:%d:%s] adding connector to sysfs\n", connector->base.id, connector->name); r = device_add(kdev); if (r) { drm_err(dev, "failed to register connector device: %d\n", r); goto err_free; } connector->kdev = kdev; if (dev_fwnode(kdev)) { r = component_add(kdev, &typec_connector_ops); if (r) drm_err(dev, "failed to add component to create link to typec connector\n"); } return 0; err_free: put_device(kdev); return r; } int drm_sysfs_connector_add_late(struct drm_connector *connector) { if (connector->ddc) return sysfs_create_link(&connector->kdev->kobj, &connector->ddc->dev.kobj, "ddc"); return 0; } void drm_sysfs_connector_remove_early(struct drm_connector *connector) { if (connector->ddc) sysfs_remove_link(&connector->kdev->kobj, "ddc"); } void drm_sysfs_connector_remove(struct drm_connector *connector) { if (!connector->kdev) return; if (dev_fwnode(connector->kdev)) component_del(connector->kdev, &typec_connector_ops); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] removing connector from sysfs\n", connector->base.id, connector->name); device_unregister(connector->kdev); connector->kdev = NULL; } void drm_sysfs_lease_event(struct drm_device *dev) { char *event_string = "LEASE=1"; char *envp[] = { event_string, NULL }; drm_dbg_lease(dev, "generating lease event\n"); kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp); } /** * drm_sysfs_hotplug_event - generate a DRM uevent * @dev: DRM device * * Send a uevent for the DRM device specified by @dev. Currently we only * set HOTPLUG=1 in the uevent environment, but this could be expanded to * deal with other types of events. * * Any new uapi should be using the drm_sysfs_connector_status_event() * for uevents on connector status change. */ void drm_sysfs_hotplug_event(struct drm_device *dev) { char *event_string = "HOTPLUG=1"; char *envp[] = { event_string, NULL }; drm_dbg_kms(dev, "generating hotplug event\n"); kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp); } EXPORT_SYMBOL(drm_sysfs_hotplug_event); /** * drm_sysfs_connector_hotplug_event - generate a DRM uevent for any connector * change * @connector: connector which has changed * * Send a uevent for the DRM connector specified by @connector. This will send * a uevent with the properties HOTPLUG=1 and CONNECTOR. */ void drm_sysfs_connector_hotplug_event(struct drm_connector *connector) { struct drm_device *dev = connector->dev; char hotplug_str[] = "HOTPLUG=1", conn_id[21]; char *envp[] = { hotplug_str, conn_id, NULL }; snprintf(conn_id, sizeof(conn_id), "CONNECTOR=%u", connector->base.id); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] generating connector hotplug event\n", connector->base.id, connector->name); kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp); } EXPORT_SYMBOL(drm_sysfs_connector_hotplug_event); /** * drm_sysfs_connector_property_event - generate a DRM uevent for connector * property change * @connector: connector on which property changed * @property: connector property which has changed. * * Send a uevent for the specified DRM connector and property. Currently we * set HOTPLUG=1 and connector id along with the attached property id * related to the change. */ void drm_sysfs_connector_property_event(struct drm_connector *connector, struct drm_property *property) { struct drm_device *dev = connector->dev; char hotplug_str[] = "HOTPLUG=1", conn_id[21], prop_id[21]; char *envp[4] = { hotplug_str, conn_id, prop_id, NULL }; WARN_ON(!drm_mode_obj_find_prop_id(&connector->base, property->base.id)); snprintf(conn_id, ARRAY_SIZE(conn_id), "CONNECTOR=%u", connector->base.id); snprintf(prop_id, ARRAY_SIZE(prop_id), "PROPERTY=%u", property->base.id); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] generating connector property event for [PROP:%d:%s]\n", connector->base.id, connector->name, property->base.id, property->name); kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp); } EXPORT_SYMBOL(drm_sysfs_connector_property_event); static ssize_t boot_display_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "1\n"); } static DEVICE_ATTR_RO(boot_display); static struct attribute *display_attrs[] = { &dev_attr_boot_display.attr, NULL }; static umode_t boot_display_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj)->parent; if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); if (video_is_primary_device(&pdev->dev)) return a->mode; } return 0; } static const struct attribute_group display_attr_group = { .attrs = display_attrs, .is_visible = boot_display_visible, }; static const struct attribute_group *card_dev_groups[] = { &display_attr_group, NULL }; struct device *drm_sysfs_minor_alloc(struct drm_minor *minor) { const char *minor_str; struct device *kdev; int r; kdev = kzalloc(sizeof(*kdev), GFP_KERNEL); if (!kdev) return ERR_PTR(-ENOMEM); device_initialize(kdev); if (minor->type == DRM_MINOR_ACCEL) { minor_str = "accel%d"; accel_set_device_instance_params(kdev, minor->index); } else { if (minor->type == DRM_MINOR_RENDER) minor_str = "renderD%d"; else minor_str = "card%d"; kdev->devt = MKDEV(DRM_MAJOR, minor->index); kdev->class = drm_class; kdev->groups = card_dev_groups; kdev->type = &drm_sysfs_device_minor; } kdev->parent = minor->dev->dev; kdev->release = drm_sysfs_release; dev_set_drvdata(kdev, minor); r = dev_set_name(kdev, minor_str, minor->index); if (r < 0) goto err_free; return kdev; err_free: put_device(kdev); return ERR_PTR(r); } /** * drm_class_device_register - register new device with the DRM sysfs class * @dev: device to register * * Registers a new &struct device within the DRM sysfs class. Essentially only * used by ttm to have a place for its global settings. Drivers should never use * this. */ int drm_class_device_register(struct device *dev) { if (!drm_class || IS_ERR(drm_class)) return -ENOENT; dev->class = drm_class; return device_register(dev); } EXPORT_SYMBOL_GPL(drm_class_device_register); /** * drm_class_device_unregister - unregister device with the DRM sysfs class * @dev: device to unregister * * Unregisters a &struct device from the DRM sysfs class. Essentially only used * by ttm to have a place for its global settings. Drivers should never use * this. */ void drm_class_device_unregister(struct device *dev) { return device_unregister(dev); } EXPORT_SYMBOL_GPL(drm_class_device_unregister);
1883 56150 48219 38389 99 50313 23100 27572 23259 23261 14 11084 11080 138 1665 1733 173 125 223 1642 1753 3 72 3609 1550 610 94 2 847 5 22680 8711 4117 320 1447 10796 25666 22073 17 7 5089 2709 369 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H #include <linux/container_of.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/poison.h> #include <linux/const.h> #include <asm/barrier.h> /* * Circular doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as * sometimes we already know the next/prev entries and we can * generate better code by using them directly rather than * using the generic single-entry routines. */ /** * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself * @name: name of the list_head */ #define LIST_HEAD_INIT(name) { &(name), &(name) } /** * LIST_HEAD - definition of a &struct list_head with initialization values * @name: name of the list_head */ #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) /** * INIT_LIST_HEAD - Initialize a list_head structure * @list: list_head structure to be initialized. * * Initializes the list_head to point to itself. If it is a list header, * the result is an empty list. */ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } #ifdef CONFIG_LIST_HARDENED #ifdef CONFIG_DEBUG_LIST # define __list_valid_slowpath #else # define __list_valid_slowpath __cold __preserve_most #endif /* * Performs the full set of list corruption checks before __list_add(). * On list corruption reports a warning, and returns false. */ bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new, struct list_head *prev, struct list_head *next); /* * Performs list corruption checks before __list_add(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_add_valid_or_report(). */ static __always_inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { /* * With the hardening version, elide checking if next and prev * are NULL, since the immediate dereference of them below would * result in a fault if NULL. * * With the reduced set of checks, we can afford to inline the * checks, which also gives the compiler a chance to elide some * of them completely if they can be proven at compile-time. If * one of the pre-conditions does not hold, the slow-path will * show a report which pre-condition failed. */ if (likely(next->prev == prev && prev->next == next && new != prev && new != next)) return true; ret = false; } ret &= __list_add_valid_or_report(new, prev, next); return ret; } /* * Performs the full set of list corruption checks before __list_del_entry(). * On list corruption reports a warning, and returns false. */ bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); /* * Performs list corruption checks before __list_del_entry(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_del_entry_valid_or_report(). */ static __always_inline bool __list_del_entry_valid(struct list_head *entry) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { struct list_head *prev = entry->prev; struct list_head *next = entry->next; /* * With the hardening version, elide checking if next and prev * are NULL, LIST_POISON1 or LIST_POISON2, since the immediate * dereference of them below would result in a fault. */ if (likely(prev->next == entry && next->prev == entry)) return true; ret = false; } ret &= __list_del_entry_valid_or_report(entry); return ret; } #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { return true; } static inline bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; WRITE_ONCE(prev->next, next); } /* * Delete a list entry and clear the 'prev' pointer. * * This is a special-purpose list clearing method used in the networking code * for lists allocated as per-cpu, where we don't want to incur the extra * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this * needs to check the node 'prev' pointer instead of calling list_empty(). */ static inline void __list_del_clearprev(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->prev = NULL; } static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) return; __list_del(entry->prev, entry->next); } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } /** * list_replace_init - replace old entry by new one and initialize the old one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position * @entry1: the location to place entry2 * @entry2: the location to place entry1 */ static inline void list_swap(struct list_head *entry1, struct list_head *entry2) { struct list_head *pos = entry2->prev; list_del(entry2); list_replace(entry1, entry2); if (pos == entry1) pos = entry2; list_add(entry1, pos); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } /** * list_bulk_move_tail - move a subsection of a list to its tail * @head: the head that will follow our entry * @first: first entry to move * @last: last entry to move, can be the same as first * * Move all entries between @first and including @last before @head. * All three entries must belong to the same linked list. */ static inline void list_bulk_move_tail(struct list_head *head, struct list_head *first, struct list_head *last) { first->prev->next = last->next; last->next->prev = first->prev; head->prev->next = first; first->prev = head->prev; last->next = head; head->prev = last; } /** * list_is_first -- tests whether @list is the first entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_is_head - tests whether @list is the list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_head(const struct list_head *list, const struct list_head *head) { return list == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. */ static inline int list_empty(const struct list_head *head) { return READ_ONCE(head->next) == head; } /** * list_del_init_careful - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. * * This is the same as list_del_init(), except designed to be used * together with list_empty_careful() in a way to guarantee ordering * of other memory operations. * * Any memory operations done before a list_del_init_careful() are * guaranteed to be visible after a list_empty_careful() test. */ static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_rotate_to_front() - Rotate list to specific item. * @list: The desired new front of the list. * @head: The head of the list. * * Rotates list so that @list becomes the new front of the list. */ static inline void list_rotate_to_front(struct list_head *list, struct list_head *head) { /* * Deletes the list head from the list denoted by @head and * places it as the tail of @list, this effectively rotates the * list so that @list is at the front. */ list_move_tail(head, list); } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. */ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. * */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next)) return; if (list_is_head(entry, head)) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } /** * list_cut_before - cut a list into two, before given entry * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * * This helper moves the initial part of @head, up to but * excluding @entry, from @head to @list. You should pass * in @entry an element you know is on @head. @list should * be an empty list or a list you do not care about losing * its data. * If @entry == @head, all entries on @head are moved to * @list. */ static inline void list_cut_before(struct list_head *list, struct list_head *head, struct list_head *entry) { if (head->next == entry) { INIT_LIST_HEAD(list); return; } list->next = head->next; list->next->prev = list; list->prev = entry->prev; list->prev->next = list; head->next = entry; entry->prev = head; } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->next); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_last_entry_or_null - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_last_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->prev); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_next_entry - get the next element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_next_entry_circular - get the next element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the last element (return the first element). * Note, that list is expected to be not empty. */ #define list_next_entry_circular(pos, head, member) \ (list_is_last(&(pos)->member, head) ? \ list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member)) /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) /** * list_prev_entry_circular - get the prev element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the first element (return the last element). * Note, that list is expected to be not empty. */ #define list_prev_entry_circular(pos, head, member) \ (list_is_first(&(pos)->member, head) ? \ list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member)) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; \ !list_is_head(pos, (head)); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ !list_is_head(pos, (head)); \ pos = n, n = pos->prev) /** * list_count_nodes - count nodes in the list * @head: the head for your list. */ static inline size_t list_count_nodes(struct list_head *head) { struct list_head *pos; size_t count = 0; list_for_each(pos, head) count++; return count; } /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ list_is_head(&pos->member, (head)) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_from_reverse - iterate backwards over list of given type * from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). */ #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } /** * hlist_unhashed - Has node been removed from list and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed * state. For example, hlist_nulls_del_init_rcu() does leave the * node in unhashed state, but hlist_nulls_del() does not. */ static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } /** * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use * @h: Node to be checked * * This variant of hlist_unhashed() must be used in lockless contexts * to avoid potential load-tearing. The READ_ONCE() is paired with the * various WRITE_ONCE() in hlist helpers that are defined below. */ static inline int hlist_unhashed_lockless(const struct hlist_node *h) { return !READ_ONCE(h->pprev); } /** * hlist_empty - Is the specified hlist_head structure an empty hlist? * @h: Structure to check. */ static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (next) WRITE_ONCE(next->pprev, pprev); } /** * hlist_del - Delete the specified hlist_node from its list * @n: Node to delete. * * Note that this function leaves the node in hashed state. Use * hlist_del_init() or similar instead to unhash @n. */ static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } /** * hlist_del_init - Delete the specified hlist_node from its list and initialize * @n: Node to delete. * * Note that this function leaves the node in unhashed state. */ static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } /** * hlist_add_head - add a new entry at the beginning of the hlist * @n: new entry to be added * @h: hlist head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; WRITE_ONCE(n->next, first); if (first) WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); WRITE_ONCE(n->pprev, &h->first); } /** * hlist_add_before - add a new entry before the one specified * @n: new entry to be added * @next: hlist node to add it before, which must be non-NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); WRITE_ONCE(n->next, next); WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); } /** * hlist_add_behind - add a new entry after the one specified * @n: new entry to be added * @prev: hlist node to add it after, which must be non-NULL */ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { WRITE_ONCE(n->next, prev->next); WRITE_ONCE(prev->next, n); WRITE_ONCE(n->pprev, &prev->next); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } /** * hlist_add_fake - create a fake hlist consisting of a single headless node * @n: Node to make a fake list out of * * This makes @n appear to be its own predecessor on a headless hlist. * The point of this is to allow things like hlist_del() to work correctly * in cases where there is no list. */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } /** * hlist_fake: Is this node a fake hlist? * @h: Node to check for being a self-referential fake hlist. */ static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } /** * hlist_is_singular_node - is node the only element of the specified hlist? * @n: Node to check for singularity. * @h: Header for potentially singular list. * * Check whether the node is the only node of the head without * accessing head, thus avoiding unnecessary cache misses. */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) { return !n->next && n->pprev == &h->first; } /** * hlist_move_list - Move an hlist * @old: hlist_head for old list. * @new: hlist_head for new list. * * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } /** * hlist_splice_init() - move all entries from one list to another * @from: hlist_head from which entries will be moved * @last: last entry on the @from list * @to: hlist_head to which entries will be moved * * @to can be empty, @from must contain at least @last. */ static inline void hlist_splice_init(struct hlist_head *from, struct hlist_node *last, struct hlist_head *to) { if (to->first) to->first->pprev = &last->next; last->next = to->first; to->first = from->first; from->first->pprev = &to->first; from->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ }) /** * hlist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: a &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) /** * hlist_count_nodes - count nodes in the hlist * @head: the head for your hlist. */ static inline size_t hlist_count_nodes(struct hlist_head *head) { struct hlist_node *pos; size_t count = 0; hlist_for_each(pos, head) count++; return count; } #endif
23639 47 47 2 2 26 31 11780 26 1 1291 30 52 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SEQLOCK_H #define __LINUX_SEQLOCK_H /* * seqcount_t / seqlock_t - a reader-writer consistency mechanism with * lockless readers (read-only retry loops), and no writer starvation. * * See Documentation/locking/seqlock.rst * * Copyrights: * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH */ #include <linux/compiler.h> #include <linux/kcsan-checks.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/preempt.h> #include <linux/seqlock_types.h> #include <linux/spinlock.h> #include <asm/processor.h> /* * The seqlock seqcount_t interface does not prescribe a precise sequence of * read begin/retry/end. For readers, typically there is a call to * read_seqcount_begin() and read_seqcount_retry(), however, there are more * esoteric cases which do not follow this pattern. * * As a consequence, we take the following best-effort approach for raw usage * via seqcount_t under KCSAN: upon beginning a seq-reader critical section, * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as * atomics; if there is a matching read_seqcount_retry() call, no following * memory operations are considered atomic. Usage of the seqlock_t interface * is not affected. */ #define KCSAN_SEQLOCK_REGION_MAX 1000 static inline void __seqcount_init(seqcount_t *s, const char *name, struct lock_class_key *key) { /* * Make sure we are not reinitializing a held lock: */ lockdep_init_map(&s->dep_map, name, key, 0); s->sequence = 0; } #ifdef CONFIG_DEBUG_LOCK_ALLOC # define SEQCOUNT_DEP_MAP_INIT(lockname) \ .dep_map = { .name = #lockname } /** * seqcount_init() - runtime initializer for seqcount_t * @s: Pointer to the seqcount_t instance */ # define seqcount_init(s) \ do { \ static struct lock_class_key __key; \ __seqcount_init((s), #s, &__key); \ } while (0) static inline void seqcount_lockdep_reader_access(const seqcount_t *s) { seqcount_t *l = (seqcount_t *)s; unsigned long flags; local_irq_save(flags); seqcount_acquire_read(&l->dep_map, 0, 0, _RET_IP_); seqcount_release(&l->dep_map, _RET_IP_); local_irq_restore(flags); } #else # define SEQCOUNT_DEP_MAP_INIT(lockname) # define seqcount_init(s) __seqcount_init(s, NULL, NULL) # define seqcount_lockdep_reader_access(x) #endif /** * SEQCNT_ZERO() - static initializer for seqcount_t * @name: Name of the seqcount_t instance */ #define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) } /* * Sequence counters with associated locks (seqcount_LOCKNAME_t) * * A sequence counter which associates the lock used for writer * serialization at initialization time. This enables lockdep to validate * that the write side critical section is properly serialized. * * For associated locks which do not implicitly disable preemption, * preemption protection is enforced in the write side function. * * Lockdep is never used in any for the raw write variants. * * See Documentation/locking/seqlock.rst */ /* * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated * @seqcount: The real sequence counter * @lock: Pointer to the associated lock * * A plain sequence counter with external writer synchronization by * LOCKNAME @lock. The lock is associated to the sequence counter in the * static initializer or init function. This enables lockdep to validate * that the write side critical section is properly serialized. * * LOCKNAME: raw_spinlock, spinlock, rwlock or mutex */ /* * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t * @s: Pointer to the seqcount_LOCKNAME_t instance * @lock: Pointer to the associated lock */ #define seqcount_LOCKNAME_init(s, _lock, lockname) \ do { \ seqcount_##lockname##_t *____s = (s); \ seqcount_init(&____s->seqcount); \ __SEQ_LOCK(____s->lock = (_lock)); \ } while (0) #define seqcount_raw_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, raw_spinlock) #define seqcount_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, spinlock) #define seqcount_rwlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, rwlock) #define seqcount_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, mutex) /* * SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers * seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t * * @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t * @locktype: LOCKNAME canonical C data type * @preemptible: preemptibility of above locktype * @lockbase: prefix for associated lock/unlock */ #define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase) \ static __always_inline seqcount_t * \ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ { \ return &s->seqcount; \ } \ \ static __always_inline const seqcount_t * \ __seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s) \ { \ return &s->seqcount; \ } \ \ static __always_inline unsigned \ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ { \ unsigned seq = smp_load_acquire(&s->seqcount.sequence); \ \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ return seq; \ \ if (preemptible && unlikely(seq & 1)) { \ __SEQ_LOCK(lockbase##_lock(s->lock)); \ __SEQ_LOCK(lockbase##_unlock(s->lock)); \ \ /* \ * Re-read the sequence counter since the (possibly \ * preempted) writer made progress. \ */ \ seq = smp_load_acquire(&s->seqcount.sequence); \ } \ \ return seq; \ } \ \ static __always_inline bool \ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ { \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ return preemptible; \ \ /* PREEMPT_RT relies on the above LOCK+UNLOCK */ \ return false; \ } \ \ static __always_inline void \ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \ { \ __SEQ_LOCK(lockdep_assert_held(s->lock)); \ } /* * __seqprop() for seqcount_t */ static inline seqcount_t *__seqprop_ptr(seqcount_t *s) { return s; } static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s) { return s; } static inline unsigned __seqprop_sequence(const seqcount_t *s) { return smp_load_acquire(&s->sequence); } static inline bool __seqprop_preemptible(const seqcount_t *s) { return false; } static inline void __seqprop_assert(const seqcount_t *s) { lockdep_assert_preemption_disabled(); } #define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT) SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, raw_spin) SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, spin) SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, read) SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) #undef SEQCOUNT_LOCKNAME /* * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t * @name: Name of the seqcount_LOCKNAME_t instance * @lock: Pointer to the associated LOCKNAME */ #define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) { \ .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ __SEQ_LOCK(.lock = (assoc_lock)) \ } #define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define __seqprop_case(s, lockname, prop) \ seqcount_##lockname##_t: __seqprop_##lockname##_##prop #define __seqprop(s, prop) _Generic(*(s), \ seqcount_t: __seqprop_##prop, \ __seqprop_case((s), raw_spinlock, prop), \ __seqprop_case((s), spinlock, prop), \ __seqprop_case((s), rwlock, prop), \ __seqprop_case((s), mutex, prop)) #define seqprop_ptr(s) __seqprop(s, ptr)(s) #define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s) #define seqprop_sequence(s) __seqprop(s, sequence)(s) #define seqprop_preemptible(s) __seqprop(s, preemptible)(s) #define seqprop_assert(s) __seqprop(s, assert)(s) /** * __read_seqcount_begin() - begin a seqcount_t read section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ #define __read_seqcount_begin(s) \ ({ \ unsigned __seq; \ \ while (unlikely((__seq = seqprop_sequence(s)) & 1)) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ __seq; \ }) /** * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ #define raw_read_seqcount_begin(s) __read_seqcount_begin(s) /** * read_seqcount_begin() - begin a seqcount_t read critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ #define read_seqcount_begin(s) \ ({ \ seqcount_lockdep_reader_access(seqprop_const_ptr(s)); \ raw_read_seqcount_begin(s); \ }) /** * raw_read_seqcount() - read the raw seqcount_t counter value * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * raw_read_seqcount opens a read critical section of the given * seqcount_t, without any lockdep checking, and without checking or * masking the sequence counter LSB. Calling code is responsible for * handling that. * * Return: count to be passed to read_seqcount_retry() */ #define raw_read_seqcount(s) \ ({ \ unsigned __seq = seqprop_sequence(s); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ __seq; \ }) /** * raw_seqcount_try_begin() - begin a seqcount_t read critical section * w/o lockdep and w/o counter stabilization * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count to be passed to read_seqcount_retry() * * Similar to raw_seqcount_begin(), except it enables eliding the critical * section entirely if odd, instead of doing the speculation knowing it will * fail. * * Useful when counter stabilization is more or less equivalent to taking * the lock and there is a slowpath that does that. * * If true, start will be set to the (even) sequence count read. * * Return: true when a read critical section is started. */ #define raw_seqcount_try_begin(s, start) \ ({ \ start = raw_read_seqcount(s); \ !(start & 1); \ }) /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * raw_seqcount_begin opens a read critical section of the given * seqcount_t. Unlike read_seqcount_begin(), this function will not wait * for the count to stabilize. If a writer is active when it begins, it * will fail the read_seqcount_retry() at the end of the read critical * section instead of stabilizing at the beginning of it. * * Use this only in special kernel hot paths where the read section is * small and has a high probability of success through other external * means. It will save a single branching instruction. * * Return: count to be passed to read_seqcount_retry() */ #define raw_seqcount_begin(s) \ ({ \ /* \ * If the counter is odd, let read_seqcount_retry() fail \ * by decrementing the counter. \ */ \ raw_read_seqcount(s) & ~1; \ }) /** * __read_seqcount_retry() - end a seqcount_t read section w/o barrier * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count, from read_seqcount_begin() * * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb() * barrier. Callers should ensure that smp_rmb() or equivalent ordering is * provided before actually loading any of the variables that are to be * protected in this critical section. * * Use carefully, only in critical code, and comment how the barrier is * provided. * * Return: true if a read section retry is required, else false */ #define __read_seqcount_retry(s, start) \ do___read_seqcount_retry(seqprop_const_ptr(s), start) static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) { kcsan_atomic_next(0); return unlikely(READ_ONCE(s->sequence) != start); } /** * read_seqcount_retry() - end a seqcount_t read critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count, from read_seqcount_begin() * * read_seqcount_retry closes the read critical section of given * seqcount_t. If the critical section was invalid, it must be ignored * (and typically retried). * * Return: true if a read section retry is required, else false */ #define read_seqcount_retry(s, start) \ do_read_seqcount_retry(seqprop_const_ptr(s), start) static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start) { smp_rmb(); return do___read_seqcount_retry(s, start); } /** * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: check write_seqcount_begin() */ #define raw_write_seqcount_begin(s) \ do { \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_raw_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) static inline void do_raw_write_seqcount_begin(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); } /** * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: check write_seqcount_end() */ #define raw_write_seqcount_end(s) \ do { \ do_raw_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) static inline void do_raw_write_seqcount_end(seqcount_t *s) { smp_wmb(); s->sequence++; kcsan_nestable_atomic_end(); } /** * write_seqcount_begin_nested() - start a seqcount_t write section with * custom lockdep nesting level * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @subclass: lockdep nesting level * * See Documentation/locking/lockdep-design.rst * Context: check write_seqcount_begin() */ #define write_seqcount_begin_nested(s, subclass) \ do { \ seqprop_assert(s); \ \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_write_seqcount_begin_nested(seqprop_ptr(s), subclass); \ } while (0) static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass) { seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); do_raw_write_seqcount_begin(s); } /** * write_seqcount_begin() - start a seqcount_t write side critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: sequence counter write side sections must be serialized and * non-preemptible. Preemption will be automatically disabled if and * only if the seqcount write serialization lock is associated, and * preemptible. If readers can be invoked from hardirq or softirq * context, interrupts or bottom halves must be respectively disabled. */ #define write_seqcount_begin(s) \ do { \ seqprop_assert(s); \ \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) static inline void do_write_seqcount_begin(seqcount_t *s) { do_write_seqcount_begin_nested(s, 0); } /** * write_seqcount_end() - end a seqcount_t write side critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: Preemption will be automatically re-enabled if and only if * the seqcount write serialization lock is associated, and preemptible. */ #define write_seqcount_end(s) \ do { \ do_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) static inline void do_write_seqcount_end(seqcount_t *s) { seqcount_release(&s->dep_map, _RET_IP_); do_raw_write_seqcount_end(s); } /** * raw_write_seqcount_barrier() - do a seqcount_t write barrier * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * This can be used to provide an ordering guarantee instead of the usual * consistency guarantee. It is one wmb cheaper, because it can collapse * the two back-to-back wmb()s. * * Note that writes surrounding the barrier should be declared atomic (e.g. * via WRITE_ONCE): a) to ensure the writes become visible to other threads * atomically, avoiding compiler optimizations; b) to document which writes are * meant to propagate to the reader critical section. This is necessary because * neither writes before nor after the barrier are enclosed in a seq-writer * critical section that would ensure readers are aware of ongoing writes:: * * seqcount_t seq; * bool X = true, Y = false; * * void read(void) * { * bool x, y; * * do { * int s = read_seqcount_begin(&seq); * * x = X; y = Y; * * } while (read_seqcount_retry(&seq, s)); * * BUG_ON(!x && !y); * } * * void write(void) * { * WRITE_ONCE(Y, true); * * raw_write_seqcount_barrier(seq); * * WRITE_ONCE(X, false); * } */ #define raw_write_seqcount_barrier(s) \ do_raw_write_seqcount_barrier(seqprop_ptr(s)) static inline void do_raw_write_seqcount_barrier(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); s->sequence++; kcsan_nestable_atomic_end(); } /** * write_seqcount_invalidate() - invalidate in-progress seqcount_t read * side operations * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * After write_seqcount_invalidate, no seqcount_t read side operations * will complete successfully and see data older than this. */ #define write_seqcount_invalidate(s) \ do_write_seqcount_invalidate(seqprop_ptr(s)) static inline void do_write_seqcount_invalidate(seqcount_t *s) { smp_wmb(); kcsan_nestable_atomic_begin(); s->sequence+=2; kcsan_nestable_atomic_end(); } /* * Latch sequence counters (seqcount_latch_t) * * A sequence counter variant where the counter even/odd value is used to * switch between two copies of protected data. This allows the read path, * typically NMIs, to safely interrupt the write side critical section. * * As the write sections are fully preemptible, no special handling for * PREEMPT_RT is needed. */ typedef struct { seqcount_t seqcount; } seqcount_latch_t; /** * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t * @seq_name: Name of the seqcount_latch_t instance */ #define SEQCNT_LATCH_ZERO(seq_name) { \ .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ } /** * seqcount_latch_init() - runtime initializer for seqcount_latch_t * @s: Pointer to the seqcount_latch_t instance */ #define seqcount_latch_init(s) seqcount_init(&(s)->seqcount) /** * raw_read_seqcount_latch() - pick even/odd latch data copy * @s: Pointer to seqcount_latch_t * * See raw_write_seqcount_latch() for details and a full reader/writer * usage example. * * Return: sequence counter raw value. Use the lowest bit as an index for * picking which data copy to read. The full counter must then be checked * with raw_read_seqcount_latch_retry(). */ static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s) { /* * Pairs with the first smp_wmb() in raw_write_seqcount_latch(). * Due to the dependent load, a full smp_rmb() is not needed. */ return READ_ONCE(s->seqcount.sequence); } /** * read_seqcount_latch() - pick even/odd latch data copy * @s: Pointer to seqcount_latch_t * * See write_seqcount_latch() for details and a full reader/writer usage * example. * * Return: sequence counter raw value. Use the lowest bit as an index for * picking which data copy to read. The full counter must then be checked * with read_seqcount_latch_retry(). */ static __always_inline unsigned read_seqcount_latch(const seqcount_latch_t *s) { kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); return raw_read_seqcount_latch(s); } /** * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section * @s: Pointer to seqcount_latch_t * @start: count, from raw_read_seqcount_latch() * * Return: true if a read section retry is required, else false */ static __always_inline int raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) { smp_rmb(); return unlikely(READ_ONCE(s->seqcount.sequence) != start); } /** * read_seqcount_latch_retry() - end a seqcount_latch_t read section * @s: Pointer to seqcount_latch_t * @start: count, from read_seqcount_latch() * * Return: true if a read section retry is required, else false */ static __always_inline int read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) { kcsan_atomic_next(0); return raw_read_seqcount_latch_retry(s, start); } /** * raw_write_seqcount_latch() - redirect latch readers to even/odd copy * @s: Pointer to seqcount_latch_t */ static __always_inline void raw_write_seqcount_latch(seqcount_latch_t *s) { smp_wmb(); /* prior stores before incrementing "sequence" */ s->seqcount.sequence++; smp_wmb(); /* increment "sequence" before following stores */ } /** * write_seqcount_latch_begin() - redirect latch readers to odd copy * @s: Pointer to seqcount_latch_t * * The latch technique is a multiversion concurrency control method that allows * queries during non-atomic modifications. If you can guarantee queries never * interrupt the modification -- e.g. the concurrency is strictly between CPUs * -- you most likely do not need this. * * Where the traditional RCU/lockless data structures rely on atomic * modifications to ensure queries observe either the old or the new state the * latch allows the same for non-atomic updates. The trade-off is doubling the * cost of storage; we have to maintain two copies of the entire data * structure. * * Very simply put: we first modify one copy and then the other. This ensures * there is always one copy in a stable state, ready to give us an answer. * * The basic form is a data structure like:: * * struct latch_struct { * seqcount_latch_t seq; * struct data_struct data[2]; * }; * * Where a modification, which is assumed to be externally serialized, does the * following:: * * void latch_modify(struct latch_struct *latch, ...) * { * write_seqcount_latch_begin(&latch->seq); * modify(latch->data[0], ...); * write_seqcount_latch(&latch->seq); * modify(latch->data[1], ...); * write_seqcount_latch_end(&latch->seq); * } * * The query will have a form like:: * * struct entry *latch_query(struct latch_struct *latch, ...) * { * struct entry *entry; * unsigned seq, idx; * * do { * seq = read_seqcount_latch(&latch->seq); * * idx = seq & 0x01; * entry = data_query(latch->data[idx], ...); * * // This includes needed smp_rmb() * } while (read_seqcount_latch_retry(&latch->seq, seq)); * * return entry; * } * * So during the modification, queries are first redirected to data[1]. Then we * modify data[0]. When that is complete, we redirect queries back to data[0] * and we can modify data[1]. * * NOTE: * * The non-requirement for atomic modifications does _NOT_ include * the publishing of new entries in the case where data is a dynamic * data structure. * * An iteration might start in data[0] and get suspended long enough * to miss an entire modification sequence, once it resumes it might * observe the new entry. * * NOTE2: * * When data is a dynamic data structure; one should use regular RCU * patterns to manage the lifetimes of the objects within. */ static __always_inline void write_seqcount_latch_begin(seqcount_latch_t *s) { kcsan_nestable_atomic_begin(); raw_write_seqcount_latch(s); } /** * write_seqcount_latch() - redirect latch readers to even copy * @s: Pointer to seqcount_latch_t */ static __always_inline void write_seqcount_latch(seqcount_latch_t *s) { raw_write_seqcount_latch(s); } /** * write_seqcount_latch_end() - end a seqcount_latch_t write section * @s: Pointer to seqcount_latch_t * * Marks the end of a seqcount_latch_t writer section, after all copies of the * latch-protected data have been updated. */ static __always_inline void write_seqcount_latch_end(seqcount_latch_t *s) { kcsan_nestable_atomic_end(); } #define __SEQLOCK_UNLOCKED(lockname) \ { \ .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \ .lock = __SPIN_LOCK_UNLOCKED(lockname) \ } /** * seqlock_init() - dynamic initializer for seqlock_t * @sl: Pointer to the seqlock_t instance */ #define seqlock_init(sl) \ do { \ spin_lock_init(&(sl)->lock); \ seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock); \ } while (0) /** * DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t * @sl: Name of the seqlock_t instance */ #define DEFINE_SEQLOCK(sl) \ seqlock_t sl = __SEQLOCK_UNLOCKED(sl) /** * read_seqbegin() - start a seqlock_t read side critical section * @sl: Pointer to seqlock_t * * Return: count, to be passed to read_seqretry() */ static inline unsigned read_seqbegin(const seqlock_t *sl) { return read_seqcount_begin(&sl->seqcount); } /** * read_seqretry() - end a seqlock_t read side section * @sl: Pointer to seqlock_t * @start: count, from read_seqbegin() * * read_seqretry closes the read side critical section of given seqlock_t. * If the critical section was invalid, it must be ignored (and typically * retried). * * Return: true if a read section retry is required, else false */ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) { return read_seqcount_retry(&sl->seqcount, start); } /* * For all seqlock_t write side functions, use the internal * do_write_seqcount_begin() instead of generic write_seqcount_begin(). * This way, no redundant lockdep_assert_held() checks are added. */ /** * write_seqlock() - start a seqlock_t write side critical section * @sl: Pointer to seqlock_t * * write_seqlock opens a write side critical section for the given * seqlock_t. It also implicitly acquires the spinlock_t embedded inside * that sequential lock. All seqlock_t write side sections are thus * automatically serialized and non-preemptible. * * Context: if the seqlock_t read section, or other write side critical * sections, can be invoked from hardirq or softirq contexts, use the * _irqsave or _bh variants of this function instead. */ static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock() - end a seqlock_t write side critical section * @sl: Pointer to seqlock_t * * write_sequnlock closes the (serialized and non-preemptible) write side * critical section of given seqlock_t. */ static inline void write_sequnlock(seqlock_t *sl) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock(&sl->lock); } /** * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section * @sl: Pointer to seqlock_t * * _bh variant of write_seqlock(). Use only if the read side section, or * other write side sections, can be invoked from softirq contexts. */ static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section * @sl: Pointer to seqlock_t * * write_sequnlock_bh closes the serialized, non-preemptible, and * softirqs-disabled, seqlock_t write side critical section opened with * write_seqlock_bh(). */ static inline void write_sequnlock_bh(seqlock_t *sl) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_bh(&sl->lock); } /** * write_seqlock_irq() - start a non-interruptible seqlock_t write section * @sl: Pointer to seqlock_t * * _irq variant of write_seqlock(). Use only if the read side section, or * other write sections, can be invoked from hardirq contexts. */ static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock_irq() - end a non-interruptible seqlock_t write section * @sl: Pointer to seqlock_t * * write_sequnlock_irq closes the serialized and non-interruptible * seqlock_t write side section opened with write_seqlock_irq(). */ static inline void write_sequnlock_irq(seqlock_t *sl) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irq(&sl->lock); } static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) { unsigned long flags; spin_lock_irqsave(&sl->lock, flags); do_write_seqcount_begin(&sl->seqcount.seqcount); return flags; } /** * write_seqlock_irqsave() - start a non-interruptible seqlock_t write * section * @lock: Pointer to seqlock_t * @flags: Stack-allocated storage for saving caller's local interrupt * state, to be passed to write_sequnlock_irqrestore(). * * _irqsave variant of write_seqlock(). Use it only if the read side * section, or other write sections, can be invoked from hardirq context. */ #define write_seqlock_irqsave(lock, flags) \ do { flags = __write_seqlock_irqsave(lock); } while (0) /** * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write * section * @sl: Pointer to seqlock_t * @flags: Caller's saved interrupt state, from write_seqlock_irqsave() * * write_sequnlock_irqrestore closes the serialized and non-interruptible * seqlock_t write section previously opened with write_seqlock_irqsave(). */ static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irqrestore(&sl->lock, flags); } /** * read_seqlock_excl() - begin a seqlock_t locking reader section * @sl: Pointer to seqlock_t * * read_seqlock_excl opens a seqlock_t locking reader critical section. A * locking reader exclusively locks out *both* other writers *and* other * locking readers, but it does not update the embedded sequence number. * * Locking readers act like a normal spin_lock()/spin_unlock(). * * Context: if the seqlock_t write section, *or other read sections*, can * be invoked from hardirq or softirq contexts, use the _irqsave or _bh * variant of this function instead. * * The opened read section must be closed with read_sequnlock_excl(). */ static inline void read_seqlock_excl(seqlock_t *sl) { spin_lock(&sl->lock); } /** * read_sequnlock_excl() - end a seqlock_t locking reader critical section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl(seqlock_t *sl) { spin_unlock(&sl->lock); } /** * read_seqlock_excl_bh() - start a seqlock_t locking reader section with * softirqs disabled * @sl: Pointer to seqlock_t * * _bh variant of read_seqlock_excl(). Use this variant only if the * seqlock_t write side section, *or other read sections*, can be invoked * from softirq contexts. */ static inline void read_seqlock_excl_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); } /** * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking * reader section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl_bh(seqlock_t *sl) { spin_unlock_bh(&sl->lock); } /** * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking * reader section * @sl: Pointer to seqlock_t * * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t * write side section, *or other read sections*, can be invoked from a * hardirq context. */ static inline void read_seqlock_excl_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); } /** * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t * locking reader section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl_irq(seqlock_t *sl) { spin_unlock_irq(&sl->lock); } static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl) { unsigned long flags; spin_lock_irqsave(&sl->lock, flags); return flags; } /** * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t * locking reader section * @lock: Pointer to seqlock_t * @flags: Stack-allocated storage for saving caller's local interrupt * state, to be passed to read_sequnlock_excl_irqrestore(). * * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t * write side section, *or other read sections*, can be invoked from a * hardirq context. */ #define read_seqlock_excl_irqsave(lock, flags) \ do { flags = __read_seqlock_excl_irqsave(lock); } while (0) /** * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t * locking reader section * @sl: Pointer to seqlock_t * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave() */ static inline void read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) { spin_unlock_irqrestore(&sl->lock, flags); } /** * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader * @lock: Pointer to seqlock_t * @seq : Marker and return parameter. If the passed value is even, the * reader will become a *lockless* seqlock_t reader as in read_seqbegin(). * If the passed value is odd, the reader will become a *locking* reader * as in read_seqlock_excl(). In the first call to this function, the * caller *must* initialize and pass an even value to @seq; this way, a * lockless read can be optimistically tried first. * * read_seqbegin_or_lock is an API designed to optimistically try a normal * lockless seqlock_t read section first. If an odd counter is found, the * lockless read trial has failed, and the next read iteration transforms * itself into a full seqlock_t locking reader. * * This is typically used to avoid seqlock_t lockless readers starvation * (too much retry loops) in the case of a sharp spike in write side * activity. * * Context: if the seqlock_t write section, *or other read sections*, can * be invoked from hardirq or softirq contexts, use the _irqsave or _bh * variant of this function instead. * * Check Documentation/locking/seqlock.rst for template example code. * * Return: the encountered sequence counter value, through the @seq * parameter, which is overloaded as a return parameter. This returned * value must be checked with need_seqretry(). If the read section need to * be retried, this returned value must also be passed as the @seq * parameter of the next read_seqbegin_or_lock() iteration. */ static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) { if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); else /* Odd */ read_seqlock_excl(lock); } /** * need_seqretry() - validate seqlock_t "locking or lockless" read section * @lock: Pointer to seqlock_t * @seq: sequence count, from read_seqbegin_or_lock() * * Return: true if a read section retry is required, false otherwise */ static inline int need_seqretry(seqlock_t *lock, int seq) { return !(seq & 1) && read_seqretry(lock, seq); } /** * done_seqretry() - end seqlock_t "locking or lockless" reader section * @lock: Pointer to seqlock_t * @seq: count, from read_seqbegin_or_lock() * * done_seqretry finishes the seqlock_t read side critical section started * with read_seqbegin_or_lock() and validated by need_seqretry(). */ static inline void done_seqretry(seqlock_t *lock, int seq) { if (seq & 1) read_sequnlock_excl(lock); } /** * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or * a non-interruptible locking reader * @lock: Pointer to seqlock_t * @seq: Marker and return parameter. Check read_seqbegin_or_lock(). * * This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if * the seqlock_t write section, *or other read sections*, can be invoked * from hardirq context. * * Note: Interrupts will be disabled only for "locking reader" mode. * * Return: * * 1. The saved local interrupts state in case of a locking reader, to * be passed to done_seqretry_irqrestore(). * * 2. The encountered sequence counter value, returned through @seq * overloaded as a return parameter. Check read_seqbegin_or_lock(). */ static inline unsigned long read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) { unsigned long flags = 0; if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); else /* Odd */ read_seqlock_excl_irqsave(lock, flags); return flags; } /** * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a * non-interruptible locking reader section * @lock: Pointer to seqlock_t * @seq: Count, from read_seqbegin_or_lock_irqsave() * @flags: Caller's saved local interrupt state in case of a locking * reader, also from read_seqbegin_or_lock_irqsave() * * This is the _irqrestore variant of done_seqretry(). The read section * must've been opened with read_seqbegin_or_lock_irqsave(), and validated * by need_seqretry(). */ static inline void done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) { if (seq & 1) read_sequnlock_excl_irqrestore(lock, flags); } enum ss_state { ss_done = 0, ss_lock, ss_lock_irqsave, ss_lockless, }; struct ss_tmp { enum ss_state state; unsigned long data; spinlock_t *lock; spinlock_t *lock_irqsave; }; static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst) { if (sst->lock) spin_unlock(sst->lock); if (sst->lock_irqsave) spin_unlock_irqrestore(sst->lock_irqsave, sst->data); } extern void __scoped_seqlock_invalid_target(void); #if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000) || defined(CONFIG_KASAN) /* * For some reason some GCC-8 architectures (nios2, alpha) have trouble * determining that the ss_done state is impossible in __scoped_seqlock_next() * below. * * Similarly KASAN is known to confuse compilers enough to break this. But we * don't care about code quality for KASAN builds anyway. */ static inline void __scoped_seqlock_bug(void) { } #else /* * Canary for compiler optimization -- if the compiler doesn't realize this is * an impossible state, it very likely generates sub-optimal code here. */ extern void __scoped_seqlock_bug(void); #endif static inline void __scoped_seqlock_next(struct ss_tmp *sst, seqlock_t *lock, enum ss_state target) { switch (sst->state) { case ss_done: __scoped_seqlock_bug(); return; case ss_lock: case ss_lock_irqsave: sst->state = ss_done; return; case ss_lockless: if (!read_seqretry(lock, sst->data)) { sst->state = ss_done; return; } break; } switch (target) { case ss_done: __scoped_seqlock_invalid_target(); return; case ss_lock: sst->lock = &lock->lock; spin_lock(sst->lock); sst->state = ss_lock; return; case ss_lock_irqsave: sst->lock_irqsave = &lock->lock; spin_lock_irqsave(sst->lock_irqsave, sst->data); sst->state = ss_lock_irqsave; return; case ss_lockless: sst->data = read_seqbegin(lock); return; } } #define __scoped_seqlock_read(_seqlock, _target, _s) \ for (struct ss_tmp _s __cleanup(__scoped_seqlock_cleanup) = \ { .state = ss_lockless, .data = read_seqbegin(_seqlock) }; \ _s.state != ss_done; \ __scoped_seqlock_next(&_s, _seqlock, _target)) /** * scoped_seqlock_read (lock, ss_state) - execute the read side critical * section without manual sequence * counter handling or calls to other * helpers * @lock: pointer to seqlock_t protecting the data * @ss_state: one of {ss_lock, ss_lock_irqsave, ss_lockless} indicating * the type of critical read section * * Example: * * scoped_seqlock_read (&lock, ss_lock) { * // read-side critical section * } * * Starts with a lockess pass first. If it fails, restarts the critical * section with the lock held. */ #define scoped_seqlock_read(_seqlock, _target) \ __scoped_seqlock_read(_seqlock, _target, __UNIQUE_ID(seqlock)) #endif /* __LINUX_SEQLOCK_H */
18 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 /* inflate.c -- zlib decompression * Copyright (C) 1995-2005 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Based on zlib 1.2.3 but modified for the Linux Kernel by * Richard Purdie <richard@openedhand.com> * * Changes mainly for static instead of dynamic memory allocation * */ #include <linux/zutil.h> #include "inftrees.h" #include "inflate.h" #include "inffast.h" #include "infutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_inflate.h" #else #define INFLATE_RESET_HOOK(strm) do {} while (0) #define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0) #define INFLATE_NEED_UPDATEWINDOW(strm) 1 #define INFLATE_NEED_CHECKSUM(strm) 1 #endif int zlib_inflate_workspacesize(void) { return sizeof(struct inflate_workspace); } int zlib_inflateReset(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; strm->total_in = strm->total_out = state->total = 0; strm->msg = NULL; strm->adler = 1; /* to support ill-conceived Java test suite */ state->mode = HEAD; state->last = 0; state->havedict = 0; state->dmax = 32768U; state->hold = 0; state->bits = 0; state->lencode = state->distcode = state->next = state->codes; /* Initialise Window */ state->wsize = 1U << state->wbits; state->write = 0; state->whave = 0; INFLATE_RESET_HOOK(strm); return Z_OK; } int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; /* in case we return an error */ state = &WS(strm)->inflate_state; strm->state = (struct internal_state *)state; if (windowBits < 0) { state->wrap = 0; windowBits = -windowBits; } else { state->wrap = (windowBits >> 4) + 1; } if (windowBits < 8 || windowBits > 15) { return Z_STREAM_ERROR; } state->wbits = (unsigned)windowBits; #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. */ state->window = PTR_ALIGN(&WS(strm)->working_window[0], PAGE_SIZE); #else state->window = &WS(strm)->working_window[0]; #endif return zlib_inflateReset(strm); } /* Return state with length and distance decoding tables and index sizes set to fixed code decoding. This returns fixed tables from inffixed.h. */ static void zlib_fixedtables(struct inflate_state *state) { # include "inffixed.h" state->lencode = lenfix; state->lenbits = 9; state->distcode = distfix; state->distbits = 5; } /* Update the window with the last wsize (normally 32K) bytes written before returning. This is only called when a window is already in use, or when output has been written during this inflate call, but the end of the deflate stream has not been reached yet. It is also called to window dictionary data when a dictionary is loaded. Providing output buffers larger than 32K to inflate() should provide a speed advantage, since only the last 32K of output is copied to the sliding window upon return from inflate(), and since all distances after the first 32K of output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. */ static void zlib_updatewindow(z_streamp strm, unsigned out) { struct inflate_state *state; unsigned copy, dist; state = (struct inflate_state *)strm->state; /* copy state->wsize or less output bytes into the circular window */ copy = out - strm->avail_out; if (copy >= state->wsize) { memcpy(state->window, strm->next_out - state->wsize, state->wsize); state->write = 0; state->whave = state->wsize; } else { dist = state->wsize - state->write; if (dist > copy) dist = copy; memcpy(state->window + state->write, strm->next_out - copy, dist); copy -= dist; if (copy) { memcpy(state->window, strm->next_out - copy, copy); state->write = copy; state->whave = state->wsize; } else { state->write += dist; if (state->write == state->wsize) state->write = 0; if (state->whave < state->wsize) state->whave += dist; } } } /* * At the end of a Deflate-compressed PPP packet, we expect to have seen * a `stored' block type value but not the (zero) length bytes. */ /* Returns true if inflate is currently at the end of a block generated by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored block. When decompressing, PPP checks that at the end of input packet, inflate is waiting for these length bytes. */ static int zlib_inflateSyncPacket(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == STORED && state->bits == 0) { state->mode = TYPE; return Z_OK; } return Z_DATA_ERROR; } /* Macros for inflate(): */ /* check function to use adler32() for zlib or crc32() for gzip */ #define UPDATE(check, buf, len) zlib_adler32(check, buf, len) /* Load registers with state in inflate() for speed */ #define LOAD() \ do { \ put = strm->next_out; \ left = strm->avail_out; \ next = strm->next_in; \ have = strm->avail_in; \ hold = state->hold; \ bits = state->bits; \ } while (0) /* Restore state from registers in inflate() */ #define RESTORE() \ do { \ strm->next_out = put; \ strm->avail_out = left; \ strm->next_in = next; \ strm->avail_in = have; \ state->hold = hold; \ state->bits = bits; \ } while (0) /* Clear the input bit accumulator */ #define INITBITS() \ do { \ hold = 0; \ bits = 0; \ } while (0) /* Get a byte of input into the bit accumulator, or return from inflate() if there is no input available. */ #define PULLBYTE() \ do { \ if (have == 0) goto inf_leave; \ have--; \ hold += (unsigned long)(*next++) << bits; \ bits += 8; \ } while (0) /* Assure that there are at least n bits in the bit accumulator. If there is not enough available input to do that, then return from inflate(). */ #define NEEDBITS(n) \ do { \ while (bits < (unsigned)(n)) \ PULLBYTE(); \ } while (0) /* Return the low n bits of the bit accumulator (n < 16) */ #define BITS(n) \ ((unsigned)hold & ((1U << (n)) - 1)) /* Remove n bits from the bit accumulator */ #define DROPBITS(n) \ do { \ hold >>= (n); \ bits -= (unsigned)(n); \ } while (0) /* Remove zero to seven bits as needed to go to a byte boundary */ #define BYTEBITS() \ do { \ hold >>= bits & 7; \ bits -= bits & 7; \ } while (0) /* inflate() uses a state machine to process as much input data and generate as much output data as possible before returning. The state machine is structured roughly as follows: for (;;) switch (state) { ... case STATEn: if (not enough input data or output space to make progress) return; ... make progress ... state = STATEm; break; ... } so when inflate() is called again, the same case is attempted again, and if the appropriate resources are provided, the machine proceeds to the next state. The NEEDBITS() macro is usually the way the state evaluates whether it can proceed or should return. NEEDBITS() does the return if the requested bits are not available. The typical use of the BITS macros is: NEEDBITS(n); ... do something with BITS(n) ... DROPBITS(n); where NEEDBITS(n) either returns from inflate() if there isn't enough input left to load n bits into the accumulator, or it continues. BITS(n) gives the low n bits in the accumulator. When done, DROPBITS(n) drops the low n bits off the accumulator. INITBITS() clears the accumulator and sets the number of available bits to zero. BYTEBITS() discards just enough bits to put the accumulator on a byte boundary. After BYTEBITS() and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return if there is no input available. The decoding of variable length codes uses PULLBYTE() directly in order to pull just enough bytes to decode the next code, and no more. Some states loop until they get enough input, making sure that enough state information is maintained to continue the loop where it left off if NEEDBITS() returns in the loop. For example, want, need, and keep would all have to actually be part of the saved state in case NEEDBITS() returns: case STATEw: while (want < need) { NEEDBITS(n); keep[want++] = BITS(n); DROPBITS(n); } state = STATEx; case STATEx: As shown above, if the next state is also the next case, then the break is omitted. A state may also return if there is not enough output space available to complete that state. Those states are copying stored data, writing a literal byte, and copying a matching string. When returning, a "goto inf_leave" is used to update the total counters, update the check value, and determine whether any progress has been made during that inflate() call in order to return the proper return code. Progress is defined as a change in either strm->avail_in or strm->avail_out. When there is a window, goto inf_leave will update the window with the last output written. If a goto inf_leave occurs in the middle of decompression and there is no window currently, goto inf_leave will create one and copy output to the window for the next call of inflate(). In this implementation, the flush parameter of inflate() only affects the return code (per zlib.h). inflate() always writes as much as possible to strm->next_out, given the space available and the provided input--the effect documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers the allocation of and copying into a sliding window until necessary, which provides the effect documented in zlib.h for Z_FINISH when the entire input stream available. So the only thing the flush parameter actually does is: when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it will return Z_BUF_ERROR if it has not reached the end of the stream. */ int zlib_inflate(z_streamp strm, int flush) { struct inflate_state *state; const unsigned char *next; /* next input */ unsigned char *put; /* next output */ unsigned have, left; /* available input and output */ unsigned long hold; /* bit buffer */ unsigned bits; /* bits in bit buffer */ unsigned in, out; /* save starting available input and output */ unsigned copy; /* number of stored or match bytes to copy */ unsigned char *from; /* where to copy match bytes from */ code this; /* current decoding table entry */ code last; /* parent table entry */ unsigned len; /* length to copy for repeats, bits to drop */ int ret; /* return code */ static const unsigned short order[19] = /* permutation of code lengths */ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; /* Do not check for strm->next_out == NULL here as ppc zImage inflates to strm->next_out = 0 */ if (strm == NULL || strm->state == NULL || (strm->next_in == NULL && strm->avail_in != 0)) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ LOAD(); in = have; out = left; ret = Z_OK; for (;;) switch (state->mode) { case HEAD: if (state->wrap == 0) { state->mode = TYPEDO; break; } NEEDBITS(16); if ( ((BITS(8) << 8) + (hold >> 8)) % 31) { strm->msg = (char *)"incorrect header check"; state->mode = BAD; break; } if (BITS(4) != Z_DEFLATED) { strm->msg = (char *)"unknown compression method"; state->mode = BAD; break; } DROPBITS(4); len = BITS(4) + 8; if (len > state->wbits) { strm->msg = (char *)"invalid window size"; state->mode = BAD; break; } state->dmax = 1U << len; strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = hold & 0x200 ? DICTID : TYPE; INITBITS(); break; case DICTID: NEEDBITS(32); strm->adler = state->check = REVERSE(hold); INITBITS(); state->mode = DICT; fallthrough; case DICT: if (state->havedict == 0) { RESTORE(); return Z_NEED_DICT; } strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = TYPE; fallthrough; case TYPE: if (flush == Z_BLOCK) goto inf_leave; fallthrough; case TYPEDO: INFLATE_TYPEDO_HOOK(strm, flush); if (state->last) { BYTEBITS(); state->mode = CHECK; break; } NEEDBITS(3); state->last = BITS(1); DROPBITS(1); switch (BITS(2)) { case 0: /* stored block */ state->mode = STORED; break; case 1: /* fixed block */ zlib_fixedtables(state); state->mode = LEN; /* decode codes */ break; case 2: /* dynamic block */ state->mode = TABLE; break; case 3: strm->msg = (char *)"invalid block type"; state->mode = BAD; } DROPBITS(2); break; case STORED: BYTEBITS(); /* go to byte boundary */ NEEDBITS(32); if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { strm->msg = (char *)"invalid stored block lengths"; state->mode = BAD; break; } state->length = (unsigned)hold & 0xffff; INITBITS(); state->mode = COPY; fallthrough; case COPY: copy = state->length; if (copy) { if (copy > have) copy = have; if (copy > left) copy = left; if (copy == 0) goto inf_leave; memcpy(put, next, copy); have -= copy; next += copy; left -= copy; put += copy; state->length -= copy; break; } state->mode = TYPE; break; case TABLE: NEEDBITS(14); state->nlen = BITS(5) + 257; DROPBITS(5); state->ndist = BITS(5) + 1; DROPBITS(5); state->ncode = BITS(4) + 4; DROPBITS(4); #ifndef PKZIP_BUG_WORKAROUND if (state->nlen > 286 || state->ndist > 30) { strm->msg = (char *)"too many length or distance symbols"; state->mode = BAD; break; } #endif state->have = 0; state->mode = LENLENS; fallthrough; case LENLENS: while (state->have < state->ncode) { NEEDBITS(3); state->lens[order[state->have++]] = (unsigned short)BITS(3); DROPBITS(3); } while (state->have < 19) state->lens[order[state->have++]] = 0; state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 7; ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid code lengths set"; state->mode = BAD; break; } state->have = 0; state->mode = CODELENS; fallthrough; case CODELENS: while (state->have < state->nlen + state->ndist) { for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.val < 16) { NEEDBITS(this.bits); DROPBITS(this.bits); state->lens[state->have++] = this.val; } else { if (this.val == 16) { NEEDBITS(this.bits + 2); DROPBITS(this.bits); if (state->have == 0) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } len = state->lens[state->have - 1]; copy = 3 + BITS(2); DROPBITS(2); } else if (this.val == 17) { NEEDBITS(this.bits + 3); DROPBITS(this.bits); len = 0; copy = 3 + BITS(3); DROPBITS(3); } else { NEEDBITS(this.bits + 7); DROPBITS(this.bits); len = 0; copy = 11 + BITS(7); DROPBITS(7); } if (state->have + copy > state->nlen + state->ndist) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } while (copy--) state->lens[state->have++] = (unsigned short)len; } } /* handle error breaks in while */ if (state->mode == BAD) break; /* build code tables */ state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 9; ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid literal/lengths set"; state->mode = BAD; break; } state->distcode = (code const *)(state->next); state->distbits = 6; ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist, &(state->next), &(state->distbits), state->work); if (ret) { strm->msg = (char *)"invalid distances set"; state->mode = BAD; break; } state->mode = LEN; fallthrough; case LEN: if (have >= 6 && left >= 258) { RESTORE(); inflate_fast(strm, out); LOAD(); break; } for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.op && (this.op & 0xf0) == 0) { last = this; for (;;) { this = state->lencode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); state->length = (unsigned)this.val; if ((int)(this.op) == 0) { state->mode = LIT; break; } if (this.op & 32) { state->mode = TYPE; break; } if (this.op & 64) { strm->msg = (char *)"invalid literal/length code"; state->mode = BAD; break; } state->extra = (unsigned)(this.op) & 15; state->mode = LENEXT; fallthrough; case LENEXT: if (state->extra) { NEEDBITS(state->extra); state->length += BITS(state->extra); DROPBITS(state->extra); } state->mode = DIST; fallthrough; case DIST: for (;;) { this = state->distcode[BITS(state->distbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if ((this.op & 0xf0) == 0) { last = this; for (;;) { this = state->distcode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); if (this.op & 64) { strm->msg = (char *)"invalid distance code"; state->mode = BAD; break; } state->offset = (unsigned)this.val; state->extra = (unsigned)(this.op) & 15; state->mode = DISTEXT; fallthrough; case DISTEXT: if (state->extra) { NEEDBITS(state->extra); state->offset += BITS(state->extra); DROPBITS(state->extra); } #ifdef INFLATE_STRICT if (state->offset > state->dmax) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif if (state->offset > state->whave + out - left) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } state->mode = MATCH; fallthrough; case MATCH: if (left == 0) goto inf_leave; copy = out - left; if (state->offset > copy) { /* copy from window */ copy = state->offset - copy; if (copy > state->write) { copy -= state->write; from = state->window + (state->wsize - copy); } else from = state->window + (state->write - copy); if (copy > state->length) copy = state->length; } else { /* copy from output */ from = put - state->offset; copy = state->length; } if (copy > left) copy = left; left -= copy; state->length -= copy; do { *put++ = *from++; } while (--copy); if (state->length == 0) state->mode = LEN; break; case LIT: if (left == 0) goto inf_leave; *put++ = (unsigned char)(state->length); left--; state->mode = LEN; break; case CHECK: if (state->wrap) { NEEDBITS(32); out -= left; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && out) strm->adler = state->check = UPDATE(state->check, put - out, out); out = left; if (( REVERSE(hold)) != state->check) { strm->msg = (char *)"incorrect data check"; state->mode = BAD; break; } INITBITS(); } state->mode = DONE; fallthrough; case DONE: ret = Z_STREAM_END; goto inf_leave; case BAD: ret = Z_DATA_ERROR; goto inf_leave; case MEM: return Z_MEM_ERROR; case SYNC: default: return Z_STREAM_ERROR; } /* Return from inflate(), updating the total counts and the check value. If there was no progress during the inflate() call, return a buffer error. Call zlib_updatewindow() to create and/or update the window state. */ inf_leave: RESTORE(); if (INFLATE_NEED_UPDATEWINDOW(strm) && (state->wsize || (state->mode < CHECK && out != strm->avail_out))) zlib_updatewindow(strm, out); in -= strm->avail_in; out -= strm->avail_out; strm->total_in += in; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && state->wrap && out) strm->adler = state->check = UPDATE(state->check, strm->next_out - out, out); strm->data_type = state->bits + (state->last ? 64 : 0) + (state->mode == TYPE ? 128 : 0); if (flush == Z_PACKET_FLUSH && ret == Z_OK && strm->avail_out != 0 && strm->avail_in == 0) return zlib_inflateSyncPacket(strm); if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) ret = Z_BUF_ERROR; return ret; } int zlib_inflateEnd(z_streamp strm) { if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; return Z_OK; } /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; * i.e. no pending output but this should always be the case. The state must * be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit, * the output will also be caught up, and the checksum will have been updated * if need be. */ int zlib_inflateIncomp(z_stream *z) { struct inflate_state *state = (struct inflate_state *)z->state; Byte *saved_no = z->next_out; uInt saved_ao = z->avail_out; if (state->mode != TYPE && state->mode != HEAD) return Z_DATA_ERROR; /* Setup some variables to allow misuse of updateWindow */ z->avail_out = 0; z->next_out = (unsigned char*)z->next_in + z->avail_in; zlib_updatewindow(z, z->avail_in); /* Restore saved variables */ z->avail_out = saved_ao; z->next_out = saved_no; z->adler = state->check = UPDATE(state->check, z->next_in, z->avail_in); z->total_out += z->avail_in; z->total_in += z->avail_in; z->next_in += z->avail_in; state->total += z->avail_in; z->avail_in = 0; return Z_OK; }
62 1146 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SHRINKER_H #define _LINUX_SHRINKER_H #include <linux/atomic.h> #include <linux/types.h> #include <linux/refcount.h> #include <linux/completion.h> #define SHRINKER_UNIT_BITS BITS_PER_LONG /* * Bitmap and deferred work of shrinker::id corresponding to memcg-aware * shrinkers, which have elements charged to the memcg. */ struct shrinker_info_unit { atomic_long_t nr_deferred[SHRINKER_UNIT_BITS]; DECLARE_BITMAP(map, SHRINKER_UNIT_BITS); }; struct shrinker_info { struct rcu_head rcu; int map_nr_max; struct shrinker_info_unit *unit[]; }; /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. * * The 'gfpmask' refers to the allocation we are currently trying to * fulfil. */ struct shrink_control { gfp_t gfp_mask; /* current node being shrunk (for NUMA aware shrinkers) */ int nid; /* * How many objects scan_objects should scan and try to reclaim. * This is reset before every call, so it is safe for callees * to modify. */ unsigned long nr_to_scan; /* * How many objects did scan_objects process? * This defaults to nr_to_scan before every call, but the callee * should track its actual progress. */ unsigned long nr_scanned; /* current memcg being shrunk (for memcg aware shrinkers) */ struct mem_cgroup *memcg; }; #define SHRINK_STOP (~0UL) #define SHRINK_EMPTY (~0UL - 1) /* * A callback you can register to apply pressure to ageable caches. * * @count_objects should return the number of freeable items in the cache. If * there are no objects to free, it should return SHRINK_EMPTY, while 0 is * returned in cases of the number of freeable items cannot be determined * or shrinker should skip this cache for this time (e.g., their number * is below shrinkable limit). No deadlock checks should be done during the * count callback - the shrinker relies on aggregating scan counts that couldn't * be executed due to potential deadlocks to be run at a later call when the * deadlock condition is no longer pending. * * @scan_objects will only be called if @count_objects returned a non-zero * value for the number of freeable objects. The callout should scan the cache * and attempt to free items from the cache. It should then return the number * of objects freed during the scan, or SHRINK_STOP if progress cannot be made * due to potential deadlocks. If SHRINK_STOP is returned, then no further * attempts to call the @scan_objects will be made from the current reclaim * context. * * @flags determine the shrinker abilities, like numa awareness */ struct shrinker { unsigned long (*count_objects)(struct shrinker *, struct shrink_control *sc); unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); long batch; /* reclaim batch size, 0 = default */ int seeks; /* seeks to recreate an obj */ unsigned flags; /* * The reference count of this shrinker. Registered shrinker have an * initial refcount of 1, then the lookup operations are now allowed * to use it via shrinker_try_get(). Later in the unregistration step, * the initial refcount will be discarded, and will free the shrinker * asynchronously via RCU after its refcount reaches 0. */ refcount_t refcount; struct completion done; /* use to wait for refcount to reach 0 */ struct rcu_head rcu; void *private_data; /* These are for internal use */ struct list_head list; #ifdef CONFIG_MEMCG /* ID in shrinker_idr */ int id; #endif #ifdef CONFIG_SHRINKER_DEBUG int debugfs_id; const char *name; struct dentry *debugfs_entry; #endif /* objs pending delete, per node */ atomic_long_t *nr_deferred; }; #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ /* Internal flags */ #define SHRINKER_REGISTERED BIT(0) #define SHRINKER_ALLOCATED BIT(1) /* Flags for users to use */ #define SHRINKER_NUMA_AWARE BIT(2) #define SHRINKER_MEMCG_AWARE BIT(3) /* * It just makes sense when the shrinker is also MEMCG_AWARE for now, * non-MEMCG_AWARE shrinker should not have this flag set. */ #define SHRINKER_NONSLAB BIT(4) __printf(2, 3) struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); void shrinker_register(struct shrinker *shrinker); void shrinker_free(struct shrinker *shrinker); static inline bool shrinker_try_get(struct shrinker *shrinker) { return refcount_inc_not_zero(&shrinker->refcount); } static inline void shrinker_put(struct shrinker *shrinker) { if (refcount_dec_and_test(&shrinker->refcount)) complete(&shrinker->done); } #ifdef CONFIG_SHRINKER_DEBUG extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); #else /* CONFIG_SHRINKER_DEBUG */ static inline __printf(2, 3) int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) { return 0; } #endif /* CONFIG_SHRINKER_DEBUG */ #endif /* _LINUX_SHRINKER_H */
105 179 228 455 212 252 6438 6436 5864 5859 6434 6087 5859 4332 4334 4335 3892 21 3892 3892 4081 233 21 3888 3 1 3 3 3 14228 22 60 60 14 60 14175 14228 9360 5427 60 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 // SPDX-License-Identifier: GPL-2.0 #define CREATE_TRACE_POINTS #include <trace/events/mmap_lock.h> #include <linux/mm.h> #include <linux/cgroup.h> #include <linux/memcontrol.h> #include <linux/mmap_lock.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/trace_events.h> #include <linux/local_lock.h> EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); #ifdef CONFIG_TRACING /* * Trace calls must be in a separate file, as otherwise there's a circular * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. */ void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) { trace_mmap_lock_start_locking(mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { trace_mmap_lock_acquire_returned(mm, write, success); } EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) { trace_mmap_lock_released(mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_released); #endif /* CONFIG_TRACING */ #ifdef CONFIG_MMU #ifdef CONFIG_PER_VMA_LOCK static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) { unsigned int tgt_refcnt = VMA_LOCK_OFFSET; /* Additional refcnt if the vma is attached. */ if (!detaching) tgt_refcnt++; /* * If vma is detached then only vma_mark_attached() can raise the * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). */ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) return false; rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, TASK_UNINTERRUPTIBLE); lock_acquired(&vma->vmlock_dep_map, _RET_IP_); return true; } static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) { *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); rwsem_release(&vma->vmlock_dep_map, _RET_IP_); } void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) { bool locked; /* * __vma_enter_locked() returns false immediately if the vma is not * attached, otherwise it waits until refcnt is indicating that vma * is attached with no readers. */ locked = __vma_enter_locked(vma, false); /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). * We don't really care about the correctness of that early check, but * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. */ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); if (locked) { bool detached; __vma_exit_locked(vma, &detached); WARN_ON_ONCE(detached); /* vma should remain attached */ } } EXPORT_SYMBOL_GPL(__vma_start_write); void vma_mark_detached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_attached(vma); /* * We are the only writer, so no need to use vma_refcount_put(). * The condition below is unlikely because the vma has been already * write-locked and readers can increment vm_refcnt only temporarily * before they check vm_lock_seq, realize the vma is locked and drop * back the vm_refcnt. That is a narrow window for observing a raised * vm_refcnt. */ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { /* Wait until vma is detached with no readers. */ if (__vma_enter_locked(vma, true)) { bool detached; __vma_exit_locked(vma, &detached); WARN_ON_ONCE(!detached); } } } /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. * False locked result is possible if mm_lock_seq overflows or if vma gets * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. * * IMPORTANT: RCU lock must be held upon entering the function, but upon error * IT IS RELEASED. The caller must handle this correctly. */ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) { struct mm_struct *other_mm; int oldcnt; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need * ACQUIRE semantics, because this is just a lockless check whose result * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) { vma = NULL; goto err; } /* * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. * Acquire fence is required here to avoid reordering against later * vm_lock_seq check and checks inside lock_vma_under_rcu(). */ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, VMA_REF_LIMIT))) { /* return EAGAIN if vma got detached from under us */ vma = oldcnt ? NULL : ERR_PTR(-EAGAIN); goto err; } rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); if (unlikely(vma->vm_mm != mm)) goto err_unstable; /* * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are * racing with vma_end_write_all(), we only start reading from the VMA * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { vma_refcount_put(vma); vma = NULL; goto err; } return vma; err: rcu_read_unlock(); return vma; err_unstable: /* * If vma got attached to another mm from under us, that mm is not * stable and can be freed in the narrow window after vma->vm_refcnt * is dropped and before rcuwait_wake_up(mm) is called. Grab it before * releasing vma->vm_refcnt. */ other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */ /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */ rcu_read_unlock(); mmgrab(other_mm); vma_refcount_put(vma); mmdrop(other_mm); return NULL; } /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the * function returns NULL. */ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { MA_STATE(mas, &mm->mm_mt, address, address); struct vm_area_struct *vma; retry: rcu_read_lock(); vma = mas_walk(&mas); if (!vma) { rcu_read_unlock(); goto inval; } vma = vma_start_read(mm, vma); if (IS_ERR_OR_NULL(vma)) { /* Check if the VMA got isolated after we found it */ if (PTR_ERR(vma) == -EAGAIN) { count_vm_vma_lock_event(VMA_LOCK_MISS); /* The area was replaced with another one */ mas_set(&mas, address); goto retry; } /* Failed to lock the VMA */ goto inval; } /* * At this point, we have a stable reference to a VMA: The VMA is * locked and we know it hasn't already been isolated. * From here on, we can access the VMA without worrying about which * fields are accessible for RCU readers. */ rcu_read_unlock(); /* Check if the vma we locked is the right one. */ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { vma_end_read(vma); goto inval; } return vma; inval: count_vm_vma_lock_event(VMA_LOCK_ABORT); return NULL; } static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm, struct vma_iterator *vmi, unsigned long from_addr) { struct vm_area_struct *vma; int ret; ret = mmap_read_lock_killable(mm); if (ret) return ERR_PTR(ret); /* Lookup the vma at the last position again under mmap_read_lock */ vma_iter_set(vmi, from_addr); vma = vma_next(vmi); if (vma) { /* Very unlikely vma->vm_refcnt overflow case */ if (unlikely(!vma_start_read_locked(vma))) vma = ERR_PTR(-EAGAIN); } mmap_read_unlock(mm); return vma; } struct vm_area_struct *lock_next_vma(struct mm_struct *mm, struct vma_iterator *vmi, unsigned long from_addr) { struct vm_area_struct *vma; unsigned int mm_wr_seq; bool mmap_unlocked; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held"); retry: /* Start mmap_lock speculation in case we need to verify the vma later */ mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq); vma = vma_next(vmi); if (!vma) return NULL; vma = vma_start_read(mm, vma); if (IS_ERR_OR_NULL(vma)) { /* * Retry immediately if the vma gets detached from under us. * Infinite loop should not happen because the vma we find will * have to be constantly knocked out from under us. */ if (PTR_ERR(vma) == -EAGAIN) { /* reset to search from the last address */ rcu_read_lock(); vma_iter_set(vmi, from_addr); goto retry; } goto fallback; } /* Verify the vma is not behind the last search position. */ if (unlikely(from_addr >= vma->vm_end)) goto fallback_unlock; /* * vma can be ahead of the last search position but we need to verify * it was not shrunk after we found it and another vma has not been * installed ahead of it. Otherwise we might observe a gap that should * not be there. */ if (from_addr < vma->vm_start) { /* Verify only if the address space might have changed since vma lookup. */ if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) { vma_iter_set(vmi, from_addr); if (vma != vma_next(vmi)) goto fallback_unlock; } } return vma; fallback_unlock: rcu_read_unlock(); vma_end_read(vma); fallback: vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr); rcu_read_lock(); /* Reinitialize the iterator after re-entering rcu read section */ vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end); return vma; } #endif /* CONFIG_PER_VMA_LOCK */ #ifdef CONFIG_LOCK_MM_AND_FIND_VMA #include <linux/extable.h> static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { if (likely(mmap_read_trylock(mm))) return true; if (regs && !user_mode(regs)) { unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } return !mmap_read_lock_killable(mm); } static inline bool mmap_upgrade_trylock(struct mm_struct *mm) { /* * We don't have this operation yet. * * It should be easy enough to do: it's basically a * atomic_long_try_cmpxchg_acquire() * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but * it also needs the proper lockdep magic etc. */ return false; } static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { mmap_read_unlock(mm); if (regs && !user_mode(regs)) { unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } return !mmap_write_lock_killable(mm); } /* * Helper for page fault handling. * * This is kind of equivalent to "mmap_read_lock()" followed * by "find_extend_vma()", except it's a lot more careful about * the locking (and will drop the lock on failure). * * For example, if we have a kernel bug that causes a page * fault, we don't want to just use mmap_read_lock() to get * the mm lock, because that would deadlock if the bug were * to happen while we're holding the mm lock for writing. * * So this checks the exception tables on kernel faults in * order to only do this all for instructions that are actually * expected to fault. * * We can also actually take the mm lock for writing if we * need to extend the vma, which helps the VM layer a lot. */ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long addr, struct pt_regs *regs) { struct vm_area_struct *vma; if (!get_mmap_lock_carefully(mm, regs)) return NULL; vma = find_vma(mm, addr); if (likely(vma && (vma->vm_start <= addr))) return vma; /* * Well, dang. We might still be successful, but only * if we can extend a vma to do so. */ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { mmap_read_unlock(mm); return NULL; } /* * We can try to upgrade the mmap lock atomically, * in which case we can continue to use the vma * we already looked up. * * Otherwise we'll have to drop the mmap lock and * re-take it, and also look up the vma again, * re-checking it. */ if (!mmap_upgrade_trylock(mm)) { if (!upgrade_mmap_lock_carefully(mm, regs)) return NULL; vma = find_vma(mm, addr); if (!vma) goto fail; if (vma->vm_start <= addr) goto success; if (!(vma->vm_flags & VM_GROWSDOWN)) goto fail; } if (expand_stack_locked(vma, addr)) goto fail; success: mmap_write_downgrade(mm); return vma; fail: mmap_write_unlock(mm); return NULL; } #endif /* CONFIG_LOCK_MM_AND_FIND_VMA */ #else /* CONFIG_MMU */ /* * At least xtensa ends up having protection faults even with no * MMU.. No stack expansion, at least. */ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long addr, struct pt_regs *regs) { struct vm_area_struct *vma; mmap_read_lock(mm); vma = vma_lookup(mm, addr); if (!vma) mmap_read_unlock(mm); return vma; } #endif /* CONFIG_MMU */
300 127 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FS_NOTIFY_FSNOTIFY_H_ #define __FS_NOTIFY_FSNOTIFY_H_ #include <linux/list.h> #include <linux/fsnotify.h> #include <linux/srcu.h> #include <linux/types.h> #include "../mount.h" /* * fsnotify_connp_t is what we embed in objects which connector can be attached * to. */ typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; static inline struct inode *fsnotify_conn_inode( struct fsnotify_mark_connector *conn) { return conn->obj; } static inline struct mount *fsnotify_conn_mount( struct fsnotify_mark_connector *conn) { return real_mount(conn->obj); } static inline struct super_block *fsnotify_conn_sb( struct fsnotify_mark_connector *conn) { return conn->obj; } static inline struct mnt_namespace *fsnotify_conn_mntns( struct fsnotify_mark_connector *conn) { return conn->obj; } static inline struct super_block *fsnotify_object_sb(void *obj, enum fsnotify_obj_type obj_type) { switch (obj_type) { case FSNOTIFY_OBJ_TYPE_INODE: return ((struct inode *)obj)->i_sb; case FSNOTIFY_OBJ_TYPE_VFSMOUNT: return ((struct vfsmount *)obj)->mnt_sb; case FSNOTIFY_OBJ_TYPE_SB: return (struct super_block *)obj; default: return NULL; } } static inline struct super_block *fsnotify_connector_sb( struct fsnotify_mark_connector *conn) { return fsnotify_object_sb(conn->obj, conn->type); } static inline fsnotify_connp_t *fsnotify_sb_marks(struct super_block *sb) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); return sbinfo ? &sbinfo->sb_marks : NULL; } /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); /* Destroy all marks attached to an object via connector */ extern void fsnotify_destroy_marks(fsnotify_connp_t *connp); /* run the list of all marks associated with inode and destroy them */ static inline void fsnotify_clear_marks_by_inode(struct inode *inode) { fsnotify_destroy_marks(&inode->i_fsnotify_marks); } /* run the list of all marks associated with vfsmount and destroy them */ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } /* run the list of all marks associated with sb and destroy them */ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { fsnotify_destroy_marks(fsnotify_sb_marks(sb)); } static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns) { fsnotify_destroy_marks(&mntns->n_fsnotify_marks); } /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. */ extern void fsnotify_set_children_dentry_flags(struct inode *inode); extern struct kmem_cache *fsnotify_mark_connector_cachep; #endif /* __FS_NOTIFY_FSNOTIFY_H_ */
46 46 81 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Antonio Quartulli */ #include "bat_v_ogm.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> #include "hard-interface.h" #include "hash.h" #include "log.h" #include "originator.h" #include "routing.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" /** * batadv_v_ogm_orig_get() - retrieve and possibly create an originator node * @bat_priv: the bat priv with all the mesh interface information * @addr: the address of the originator * * Return: the orig_node corresponding to the specified address. If such an * object does not exist, it is allocated here. In case of allocation failure * returns NULL. */ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; int hash_added; orig_node = batadv_orig_hash_find(bat_priv, addr); if (orig_node) return orig_node; orig_node = batadv_orig_node_new(bat_priv, addr); if (!orig_node) return NULL; kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) { /* remove refcnt for newly created orig_node and hash entry */ batadv_orig_node_put(orig_node); batadv_orig_node_put(orig_node); orig_node = NULL; } return orig_node; } /** * batadv_v_ogm_start_queue_timer() - restart the OGM aggregation timer * @hard_iface: the interface to use to send the OGM */ static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface) { unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000; /* msecs * [0.9, 1.1] */ msecs += get_random_u32_below(msecs / 5) - (msecs / 10); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq, msecs_to_jiffies(msecs / 1000)); } /** * batadv_v_ogm_start_timer() - restart the OGM sending timer * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) { unsigned long msecs; /* this function may be invoked in different contexts (ogm rescheduling * or hard_iface activation), but the work timer should not be reset */ if (delayed_work_pending(&bat_priv->bat_v.ogm_wq)) return; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; msecs += get_random_u32_below(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq, msecs_to_jiffies(msecs)); } /** * batadv_v_ogm_send_to_if() - send a batman ogm using a given interface * @skb: the OGM to send * @hard_iface: the interface to use to send the OGM */ static void batadv_v_ogm_send_to_if(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); if (hard_iface->if_status != BATADV_IF_ACTIVE) { kfree_skb(skb); return; } batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, skb->len + ETH_HLEN); batadv_send_broadcast_skb(skb, hard_iface); } /** * batadv_v_ogm_len() - OGMv2 packet length * @skb: the OGM to check * * Return: Length of the given OGMv2 packet, including tvlv length, excluding * ethernet header length. */ static unsigned int batadv_v_ogm_len(struct sk_buff *skb) { struct batadv_ogm2_packet *ogm_packet; ogm_packet = (struct batadv_ogm2_packet *)skb->data; return BATADV_OGM2_HLEN + ntohs(ogm_packet->tvlv_len); } /** * batadv_v_ogm_queue_left() - check if given OGM still fits aggregation queue * @skb: the OGM to check * @hard_iface: the interface to use to send the OGM * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. * * Return: True, if the given OGMv2 packet still fits, false otherwise. */ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { unsigned int max = min_t(unsigned int, hard_iface->net_dev->mtu, BATADV_MAX_AGGREGATION_BYTES); unsigned int ogm_len = batadv_v_ogm_len(skb); lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); return hard_iface->bat_v.aggr_len + ogm_len <= max; } /** * batadv_v_ogm_aggr_list_free - free all elements in an aggregation queue * @hard_iface: the interface holding the aggregation queue * * Empties the OGMv2 aggregation queue and frees all the skbs it contains. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) { lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); __skb_queue_purge(&hard_iface->bat_v.aggr_list); hard_iface->bat_v.aggr_len = 0; } /** * batadv_v_ogm_aggr_send() - flush & send aggregation queue * @hard_iface: the interface with the aggregation queue to flush * * Aggregates all OGMv2 packets currently in the aggregation queue into a * single OGMv2 packet and transmits this aggregate. * * The aggregation queue is empty after this call. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) { unsigned int aggr_len = hard_iface->bat_v.aggr_len; struct sk_buff *skb_aggr; unsigned int ogm_len; struct sk_buff *skb; lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); if (!aggr_len) return; skb_aggr = dev_alloc_skb(aggr_len + ETH_HLEN + NET_IP_ALIGN); if (!skb_aggr) { batadv_v_ogm_aggr_list_free(hard_iface); return; } skb_reserve(skb_aggr, ETH_HLEN + NET_IP_ALIGN); skb_reset_network_header(skb_aggr); while ((skb = __skb_dequeue(&hard_iface->bat_v.aggr_list))) { hard_iface->bat_v.aggr_len -= batadv_v_ogm_len(skb); ogm_len = batadv_v_ogm_len(skb); skb_put_data(skb_aggr, skb->data, ogm_len); consume_skb(skb); } batadv_v_ogm_send_to_if(skb_aggr, hard_iface); } /** * batadv_v_ogm_queue_on_if() - queue a batman ogm on a given interface * @skb: the OGM to queue * @hard_iface: the interface to queue the OGM on */ static void batadv_v_ogm_queue_on_if(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); if (!atomic_read(&bat_priv->aggregated_ogms)) { batadv_v_ogm_send_to_if(skb, hard_iface); return; } spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); if (!batadv_v_ogm_queue_left(skb, hard_iface)) batadv_v_ogm_aggr_send(hard_iface); hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb); __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** * batadv_v_ogm_send_meshif() - periodic worker broadcasting the own OGM * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv) { struct batadv_hard_iface *hard_iface; struct batadv_ogm2_packet *ogm_packet; struct sk_buff *skb, *skb_tmp; unsigned char *ogm_buff; struct list_head *iter; int ogm_buff_len; u16 tvlv_len = 0; int ret; lockdep_assert_held(&bat_priv->bat_v.ogm_buff_mutex); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) goto out; ogm_buff = bat_priv->bat_v.ogm_buff; ogm_buff_len = bat_priv->bat_v.ogm_buff_len; /* tt changes have to be committed before the tvlv data is * appended as it may alter the tt tvlv container */ batadv_tt_local_commit_changes(bat_priv); tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, &ogm_buff, &ogm_buff_len, BATADV_OGM2_HLEN); bat_priv->bat_v.ogm_buff = ogm_buff; bat_priv->bat_v.ogm_buff_len = ogm_buff_len; skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + ogm_buff_len); if (!skb) goto reschedule; skb_reserve(skb, ETH_HLEN); skb_put_data(skb, ogm_buff, ogm_buff_len); ogm_packet = (struct batadv_ogm2_packet *)skb->data; ogm_packet->seqno = htonl(atomic_read(&bat_priv->bat_v.ogm_seqno)); atomic_inc(&bat_priv->bat_v.ogm_seqno); ogm_packet->tvlv_len = htons(tvlv_len); /* broadcast on every interface */ rcu_read_lock(); netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (!kref_get_unless_zero(&hard_iface->refcount)) continue; ret = batadv_hardif_no_broadcast(hard_iface, NULL, NULL); if (ret) { char *type; switch (ret) { case BATADV_HARDIF_BCAST_NORECIPIENT: type = "no neighbor"; break; case BATADV_HARDIF_BCAST_DUPFWD: type = "single neighbor is source"; break; case BATADV_HARDIF_BCAST_DUPORIG: type = "single neighbor is originator"; break; default: type = "unknown"; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 from ourselves on %s suppressed: %s\n", hard_iface->net_dev->name, type); batadv_hardif_put(hard_iface); continue; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n", ogm_packet->orig, ntohl(ogm_packet->seqno), ntohl(ogm_packet->throughput), ogm_packet->ttl, hard_iface->net_dev->name, hard_iface->net_dev->dev_addr); /* this skb gets consumed by batadv_v_ogm_send_to_if() */ skb_tmp = skb_clone(skb, GFP_ATOMIC); if (!skb_tmp) { batadv_hardif_put(hard_iface); break; } batadv_v_ogm_queue_on_if(skb_tmp, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); consume_skb(skb); reschedule: batadv_v_ogm_start_timer(bat_priv); out: return; } /** * batadv_v_ogm_send() - periodic worker broadcasting the own OGM * @work: work queue item */ static void batadv_v_ogm_send(struct work_struct *work) { struct batadv_priv_bat_v *bat_v; struct batadv_priv *bat_priv; bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work); bat_priv = container_of(bat_v, struct batadv_priv, bat_v); mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); batadv_v_ogm_send_meshif(bat_priv); mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } /** * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface * @work: work queue item * * Emits aggregated OGM messages in regular intervals. */ void batadv_v_ogm_aggr_work(struct work_struct *work) { struct batadv_hard_iface_bat_v *batv; struct batadv_hard_iface *hard_iface; batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work); hard_iface = container_of(batv, struct batadv_hard_iface, bat_v); spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_send(hard_iface); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_start_queue_timer(hard_iface); } /** * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V * @hard_iface: the interface to prepare * * Takes care of scheduling its own OGM sending routine for this interface. * * Return: 0 on success or a negative error code otherwise */ int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); batadv_v_ogm_start_queue_timer(hard_iface); batadv_v_ogm_start_timer(bat_priv); return 0; } /** * batadv_v_ogm_iface_disable() - release OGM interface private resources * @hard_iface: interface for which the resources have to be released */ void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { cancel_delayed_work_sync(&hard_iface->bat_v.aggr_wq); spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_list_free(hard_iface); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** * batadv_v_ogm_primary_iface_set() - set a new primary interface * @primary_iface: the new primary interface */ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) { struct batadv_priv *bat_priv = netdev_priv(primary_iface->mesh_iface); struct batadv_ogm2_packet *ogm_packet; mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); if (!bat_priv->bat_v.ogm_buff) goto unlock; ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff; ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr); unlock: mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } /** * batadv_v_forward_penalty() - apply a penalty to the throughput metric * forwarded with B.A.T.M.A.N. V OGMs * @bat_priv: the bat priv with all the mesh interface information * @if_incoming: the interface where the OGM has been received * @if_outgoing: the interface where the OGM has to be forwarded to * @throughput: the current throughput * * Apply a penalty on the current throughput metric value based on the * characteristic of the interface where the OGM has been received. * * Initially the per hardif hop penalty is applied to the throughput. After * that the return value is then computed as follows: * - throughput * 50% if the incoming and outgoing interface are the * same WiFi interface and the throughput is above * 1MBit/s * - throughput if the outgoing interface is the default * interface (i.e. this OGM is processed for the * internal table and not forwarded) * - throughput * node hop penalty otherwise * * Return: the penalised throughput metric. */ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, u32 throughput) { int if_hop_penalty = atomic_read(&if_incoming->hop_penalty); int hop_penalty = atomic_read(&bat_priv->hop_penalty); int hop_penalty_max = BATADV_TQ_MAX_VALUE; /* Apply per hardif hop penalty */ throughput = throughput * (hop_penalty_max - if_hop_penalty) / hop_penalty_max; /* Don't apply hop penalty in default originator table. */ if (if_outgoing == BATADV_IF_DEFAULT) return throughput; /* Forwarding on the same WiFi interface cuts the throughput in half * due to the store & forward characteristics of WIFI. * Very low throughput values are the exception. */ if (throughput > 10 && if_incoming == if_outgoing && !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX)) return throughput / 2; /* hop penalty of 255 equals 100% */ return throughput * (hop_penalty_max - hop_penalty) / hop_penalty_max; } /** * batadv_v_ogm_forward() - check conditions and forward an OGM to the given * outgoing interface * @bat_priv: the bat priv with all the mesh interface information * @ogm_received: previously received OGM to be forwarded * @orig_node: the originator which has been updated * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface on which this OGM was received on * @if_outgoing: the interface to which the OGM has to be forwarded to * * Forward an OGM to an interface after having altered the throughput metric and * the TTL value contained in it. The original OGM isn't modified. */ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, const struct batadv_ogm2_packet *ogm_received, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *router = NULL; struct batadv_ogm2_packet *ogm_forward; unsigned char *skb_buff; struct sk_buff *skb; size_t packet_len; u16 tvlv_len; /* only forward for specific interfaces, not for the default one. */ if (if_outgoing == BATADV_IF_DEFAULT) goto out; orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (!orig_ifinfo) goto out; /* acquire possibly updated router */ router = batadv_orig_router_get(orig_node, if_outgoing); /* strict rule: forward packets coming from the best next hop only */ if (neigh_node != router) goto out; /* don't forward the same seqno twice on one interface */ if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm_received->seqno)) goto out; orig_ifinfo->last_seqno_forwarded = ntohl(ogm_received->seqno); if (ogm_received->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); goto out; } neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); if (!neigh_ifinfo) goto out; tvlv_len = ntohs(ogm_received->tvlv_len); packet_len = BATADV_OGM2_HLEN + tvlv_len; skb = netdev_alloc_skb_ip_align(if_outgoing->net_dev, ETH_HLEN + packet_len); if (!skb) goto out; skb_reserve(skb, ETH_HLEN); skb_buff = skb_put_data(skb, ogm_received, packet_len); /* apply forward penalty */ ogm_forward = (struct batadv_ogm2_packet *)skb_buff; ogm_forward->throughput = htonl(neigh_ifinfo->bat_v.throughput); ogm_forward->ttl--; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding OGM2 packet on %s: throughput %u, ttl %u, received via %s\n", if_outgoing->net_dev->name, ntohl(ogm_forward->throughput), ogm_forward->ttl, if_incoming->net_dev->name); batadv_v_ogm_queue_on_if(skb, if_outgoing); out: batadv_orig_ifinfo_put(orig_ifinfo); batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(neigh_ifinfo); } /** * batadv_v_ogm_metric_update() - update route metric based on OGM * @bat_priv: the bat priv with all the mesh interface information * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered * * Return: * 1 if the OGM is new, * 0 if it is not new but valid, * <0 on error (e.g. old OGM) */ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; bool protection_started = false; int ret = -EINVAL; u32 path_throughput; s32 seq_diff; orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (!orig_ifinfo) goto out; seq_diff = ntohl(ogm2->seqno) - orig_ifinfo->last_real_seqno; if (!hlist_empty(&orig_node->neigh_list) && batadv_window_protected(bat_priv, seq_diff, BATADV_OGM_MAX_AGE, &orig_ifinfo->batman_seqno_reset, &protection_started)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: packet within window protection time from %pM\n", ogm2->orig); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Last reset: %ld, %ld\n", orig_ifinfo->batman_seqno_reset, jiffies); goto out; } /* drop packets with old seqnos, however accept the first packet after * a host has been rebooted. */ if (seq_diff < 0 && !protection_started) goto out; neigh_node->last_seen = jiffies; orig_node->last_seen = jiffies; orig_ifinfo->last_real_seqno = ntohl(ogm2->seqno); orig_ifinfo->last_ttl = ogm2->ttl; neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (!neigh_ifinfo) goto out; path_throughput = batadv_v_forward_penalty(bat_priv, if_incoming, if_outgoing, ntohl(ogm2->throughput)); neigh_ifinfo->bat_v.throughput = path_throughput; neigh_ifinfo->bat_v.last_seqno = ntohl(ogm2->seqno); neigh_ifinfo->last_ttl = ogm2->ttl; if (seq_diff > 0 || protection_started) ret = 1; else ret = 0; out: batadv_orig_ifinfo_put(orig_ifinfo); batadv_neigh_ifinfo_put(neigh_ifinfo); return ret; } /** * batadv_v_ogm_route_update() - update routes based on OGM * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: the Ethernet header of the OGM2 * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered * * Return: true if the packet should be forwarded, false otherwise */ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv, const struct ethhdr *ethhdr, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_node *router = NULL; struct batadv_orig_node *orig_neigh_node; struct batadv_neigh_node *orig_neigh_router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL; u32 router_throughput, neigh_throughput; u32 router_last_seqno; u32 neigh_last_seqno; s32 neigh_seq_diff; bool forward = false; orig_neigh_node = batadv_v_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) goto out; orig_neigh_router = batadv_orig_router_get(orig_neigh_node, if_outgoing); /* drop packet if sender is not a direct neighbor and if we * don't route towards it */ router = batadv_orig_router_get(orig_node, if_outgoing); if (router && router->orig_node != orig_node && !orig_neigh_router) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out; } /* Mark the OGM to be considered for forwarding, and update routes * if needed. */ forward = true; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Searching and updating originator entry of received packet\n"); /* if this neighbor already is our next hop there is nothing * to change */ if (router == neigh_node) goto out; /* don't consider neighbours with worse throughput. * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than * the last received seqno from our best next hop. */ if (router) { router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); /* if these are not allocated, something is wrong. */ if (!router_ifinfo || !neigh_ifinfo) goto out; neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno; router_last_seqno = router_ifinfo->bat_v.last_seqno; neigh_seq_diff = neigh_last_seqno - router_last_seqno; router_throughput = router_ifinfo->bat_v.throughput; neigh_throughput = neigh_ifinfo->bat_v.throughput; if (neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF && router_throughput >= neigh_throughput) goto out; } batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); out: batadv_neigh_node_put(router); batadv_neigh_node_put(orig_neigh_router); batadv_orig_node_put(orig_neigh_node); batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_ifinfo_put(neigh_ifinfo); return forward; } /** * batadv_v_ogm_process_per_outif() - process a batman v OGM for an outgoing if * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: the Ethernet header of the OGM2 * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered */ static void batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, const struct ethhdr *ethhdr, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { int seqno_age; bool forward; /* first, update the metric with according sanity checks */ seqno_age = batadv_v_ogm_metric_update(bat_priv, ogm2, orig_node, neigh_node, if_incoming, if_outgoing); /* outdated sequence numbers are to be discarded */ if (seqno_age < 0) return; /* only unknown & newer OGMs contain TVLVs we are interested in */ if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT) batadv_tvlv_containers_process(bat_priv, BATADV_OGM2, orig_node, NULL, (unsigned char *)(ogm2 + 1), ntohs(ogm2->tvlv_len)); /* if the metric update went through, update routes if needed */ forward = batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node, neigh_node, if_incoming, if_outgoing); /* if the routes have been processed correctly, check and forward */ if (forward) batadv_v_ogm_forward(bat_priv, ogm2, orig_node, neigh_node, if_incoming, if_outgoing); } /** * batadv_v_ogm_aggr_packet() - checks if there is another OGM aggregated * @buff_pos: current position in the skb * @packet_len: total length of the skb * @ogm2_packet: potential OGM2 in buffer * * Return: true if there is enough space for another OGM, false otherwise. */ static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, const struct batadv_ogm2_packet *ogm2_packet) { int next_buff_pos = 0; /* check if there is enough space for the header */ next_buff_pos += buff_pos + sizeof(*ogm2_packet); if (next_buff_pos > packet_len) return false; /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm2_packet->tvlv_len); return next_buff_pos <= packet_len; } /** * batadv_v_ogm_process() - process an incoming batman v OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) * @if_incoming: the interface where this packet was received */ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct ethhdr *ethhdr; struct batadv_orig_node *orig_node = NULL; struct batadv_hardif_neigh_node *hardif_neigh = NULL; struct batadv_neigh_node *neigh_node = NULL; struct batadv_hard_iface *hard_iface; struct batadv_ogm2_packet *ogm_packet; u32 ogm_throughput, link_throughput, path_throughput; struct list_head *iter; int ret; ethhdr = eth_hdr(skb); ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset); ogm_throughput = ntohl(ogm_packet->throughput); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Received OGM2 packet via NB: %pM, IF: %s [%pM] (from OG: %pM, seqno %u, throughput %u, TTL %u, V %u, tvlv_len %u)\n", ethhdr->h_source, if_incoming->net_dev->name, if_incoming->net_dev->dev_addr, ogm_packet->orig, ntohl(ogm_packet->seqno), ogm_throughput, ogm_packet->ttl, ogm_packet->version, ntohs(ogm_packet->tvlv_len)); if (batadv_is_my_mac(bat_priv, ogm_packet->orig)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from ourself\n"); return; } /* If the throughput metric is 0, immediately drop the packet. No need * to create orig_node / neigh_node for an unusable route. */ if (ogm_throughput == 0) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet with throughput metric of 0\n"); return; } /* require ELP packets be to received from this neighbor first */ hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source); if (!hardif_neigh) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out; } orig_node = batadv_v_ogm_orig_get(bat_priv, ogm_packet->orig); if (!orig_node) goto out; neigh_node = batadv_neigh_node_get_or_create(orig_node, if_incoming, ethhdr->h_source); if (!neigh_node) goto out; /* Update the received throughput metric to match the link * characteristic: * - If this OGM traveled one hop so far (emitted by single hop * neighbor) the path throughput metric equals the link throughput. * - For OGMs traversing more than hop the path throughput metric is * the smaller of the path throughput and the link throughput. */ link_throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); path_throughput = min_t(u32, link_throughput, ogm_throughput); ogm_packet->throughput = htonl(path_throughput); batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node, neigh_node, if_incoming, BATADV_IF_DEFAULT); rcu_read_lock(); netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; ret = batadv_hardif_no_broadcast(hard_iface, ogm_packet->orig, hardif_neigh->orig); if (ret) { char *type; switch (ret) { case BATADV_HARDIF_BCAST_NORECIPIENT: type = "no neighbor"; break; case BATADV_HARDIF_BCAST_DUPFWD: type = "single neighbor is source"; break; case BATADV_HARDIF_BCAST_DUPORIG: type = "single neighbor is originator"; break; default: type = "unknown"; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 packet from %pM on %s suppressed: %s\n", ogm_packet->orig, hard_iface->net_dev->name, type); batadv_hardif_put(hard_iface); continue; } batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node, neigh_node, if_incoming, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); out: batadv_orig_node_put(orig_node); batadv_neigh_node_put(neigh_node); batadv_hardif_neigh_put(hardif_neigh); } /** * batadv_v_ogm_packet_recv() - OGM2 receiving handler * @skb: the received OGM * @if_incoming: the interface where this OGM has been received * * Return: NET_RX_SUCCESS and consume the skb on success or returns NET_RX_DROP * (without freeing the skb) on failure */ int batadv_v_ogm_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_ogm2_packet *ogm_packet; struct ethhdr *ethhdr; int ogm_offset; u8 *packet_pos; int ret = NET_RX_DROP; /* did we receive a OGM2 packet on an interface that does not have * B.A.T.M.A.N. V enabled ? */ if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0) goto free_skb; if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN)) goto free_skb; ethhdr = eth_hdr(skb); if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) goto free_skb; batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, skb->len + ETH_HLEN); ogm_offset = 0; ogm_packet = (struct batadv_ogm2_packet *)skb->data; while (batadv_v_ogm_aggr_packet(ogm_offset, skb_headlen(skb), ogm_packet)) { batadv_v_ogm_process(skb, ogm_offset, if_incoming); ogm_offset += BATADV_OGM2_HLEN; ogm_offset += ntohs(ogm_packet->tvlv_len); packet_pos = skb->data + ogm_offset; ogm_packet = (struct batadv_ogm2_packet *)packet_pos; } ret = NET_RX_SUCCESS; free_skb: if (ret == NET_RX_SUCCESS) consume_skb(skb); else kfree_skb(skb); return ret; } /** * batadv_v_ogm_init() - initialise the OGM2 engine * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or a negative error code in case of failure */ int batadv_v_ogm_init(struct batadv_priv *bat_priv) { struct batadv_ogm2_packet *ogm_packet; unsigned char *ogm_buff; u32 random_seqno; bat_priv->bat_v.ogm_buff_len = BATADV_OGM2_HLEN; ogm_buff = kzalloc(bat_priv->bat_v.ogm_buff_len, GFP_ATOMIC); if (!ogm_buff) return -ENOMEM; bat_priv->bat_v.ogm_buff = ogm_buff; ogm_packet = (struct batadv_ogm2_packet *)ogm_buff; ogm_packet->packet_type = BATADV_OGM2; ogm_packet->version = BATADV_COMPAT_VERSION; ogm_packet->ttl = BATADV_TTL; ogm_packet->flags = BATADV_NO_FLAGS; ogm_packet->throughput = htonl(BATADV_THROUGHPUT_MAX_VALUE); /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno); INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send); mutex_init(&bat_priv->bat_v.ogm_buff_mutex); return 0; } /** * batadv_v_ogm_free() - free OGM private resources * @bat_priv: the bat priv with all the mesh interface information */ void batadv_v_ogm_free(struct batadv_priv *bat_priv) { cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq); mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); kfree(bat_priv->bat_v.ogm_buff); bat_priv->bat_v.ogm_buff = NULL; bat_priv->bat_v.ogm_buff_len = 0; mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); }
533 198 525 245 198 236 127 228 230 3 228 232 45 193 51 155 39 40 20 40 53 52 1 1 38 38 15 53 522 573 405 249 1 211 21 54 209 53 85 225 225 29 198 276 30 181 2 178 275 276 215 249 276 80 198 229 40 1 584 430 1 274 451 29 41 245 5 245 523 1 523 275 13 457 15 457 250 523 304 245 108 97 59 124 20 20 1 19 12 2 11 9 9 20 9 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 // SPDX-License-Identifier: GPL-2.0-only /* * vfsv0 quota IO operations on file */ #include <linux/errno.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/dqblk_v2.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/quotaops.h> #include <asm/byteorder.h> #include "quota_tree.h" MODULE_AUTHOR("Jan Kara"); MODULE_DESCRIPTION("Quota trie support"); MODULE_LICENSE("GPL"); /* * Maximum quota tree depth we support. Only to limit recursion when working * with the tree. */ #define MAX_QTREE_DEPTH 6 #define __QUOTA_QT_PARANOIA static int __get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth) { unsigned int epb = info->dqi_usable_bs >> 2; depth = info->dqi_qtree_depth - depth - 1; while (depth--) id /= epb; return id % epb; } static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth) { qid_t id = from_kqid(&init_user_ns, qid); return __get_index(info, id, depth); } /* Number of entries in one blocks */ static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) { return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader)) / info->dqi_entry_size; } static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) { struct super_block *sb = info->dqi_sb; memset(buf, 0, info->dqi_usable_bs); return sb->s_op->quota_read(sb, info->dqi_type, buf, info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits); } static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) { struct super_block *sb = info->dqi_sb; ssize_t ret; ret = sb->s_op->quota_write(sb, info->dqi_type, buf, info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits); if (ret != info->dqi_usable_bs) { quota_error(sb, "dquota write failed"); if (ret >= 0) ret = -EIO; } return ret; } static inline int do_check_range(struct super_block *sb, const char *val_name, uint val, uint min_val, uint max_val) { if (val < min_val || val > max_val) { quota_error(sb, "Getting %s %u out of range %u-%u", val_name, val, min_val, max_val); return -EUCLEAN; } return 0; } static int check_dquot_block_header(struct qtree_mem_dqinfo *info, struct qt_disk_dqdbheader *dh) { int err = 0; err = do_check_range(info->dqi_sb, "dqdh_next_free", le32_to_cpu(dh->dqdh_next_free), 0, info->dqi_blocks - 1); if (err) return err; err = do_check_range(info->dqi_sb, "dqdh_prev_free", le32_to_cpu(dh->dqdh_prev_free), 0, info->dqi_blocks - 1); if (err) return err; err = do_check_range(info->dqi_sb, "dqdh_entries", le16_to_cpu(dh->dqdh_entries), 0, qtree_dqstr_in_blk(info)); return err; } /* Remove empty block from list and return it */ static int get_free_dqblk(struct qtree_mem_dqinfo *info) { char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int ret, blk; if (!buf) return -ENOMEM; if (info->dqi_free_blk) { blk = info->dqi_free_blk; ret = read_blk(info, blk, buf); if (ret < 0) goto out_buf; ret = check_dquot_block_header(info, dh); if (ret) goto out_buf; info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); } else { memset(buf, 0, info->dqi_usable_bs); /* Assure block allocation... */ ret = write_blk(info, info->dqi_blocks, buf); if (ret < 0) goto out_buf; blk = info->dqi_blocks++; } mark_info_dirty(info->dqi_sb, info->dqi_type); ret = blk; out_buf: kfree(buf); return ret; } /* Insert empty block to the list */ static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk) { struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int err; dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk); dh->dqdh_prev_free = cpu_to_le32(0); dh->dqdh_entries = cpu_to_le16(0); err = write_blk(info, blk, buf); if (err < 0) return err; info->dqi_free_blk = blk; mark_info_dirty(info->dqi_sb, info->dqi_type); return 0; } /* Remove given block from the list of blocks with free entries */ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, uint blk) { char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; uint nextblk = le32_to_cpu(dh->dqdh_next_free); uint prevblk = le32_to_cpu(dh->dqdh_prev_free); int err; if (!tmpbuf) return -ENOMEM; if (nextblk) { err = read_blk(info, nextblk, tmpbuf); if (err < 0) goto out_buf; ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free; err = write_blk(info, nextblk, tmpbuf); if (err < 0) goto out_buf; } if (prevblk) { err = read_blk(info, prevblk, tmpbuf); if (err < 0) goto out_buf; ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free; err = write_blk(info, prevblk, tmpbuf); if (err < 0) goto out_buf; } else { info->dqi_free_entry = nextblk; mark_info_dirty(info->dqi_sb, info->dqi_type); } kfree(tmpbuf); dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); /* No matter whether write succeeds block is out of list */ if (write_blk(info, blk, buf) < 0) quota_error(info->dqi_sb, "Can't write block (%u) " "with free entries", blk); return 0; out_buf: kfree(tmpbuf); return err; } /* Insert given block to the beginning of list with free entries */ static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, uint blk) { char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int err; if (!tmpbuf) return -ENOMEM; dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry); dh->dqdh_prev_free = cpu_to_le32(0); err = write_blk(info, blk, buf); if (err < 0) goto out_buf; if (info->dqi_free_entry) { err = read_blk(info, info->dqi_free_entry, tmpbuf); if (err < 0) goto out_buf; ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk); err = write_blk(info, info->dqi_free_entry, tmpbuf); if (err < 0) goto out_buf; } kfree(tmpbuf); info->dqi_free_entry = blk; mark_info_dirty(info->dqi_sb, info->dqi_type); return 0; out_buf: kfree(tmpbuf); return err; } /* Is the entry in the block free? */ int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk) { int i; for (i = 0; i < info->dqi_entry_size; i++) if (disk[i]) return 0; return 1; } EXPORT_SYMBOL(qtree_entry_unused); /* Find space for dquot */ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, int *err) { uint blk, i; struct qt_disk_dqdbheader *dh; char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); char *ddquot; *err = 0; if (!buf) { *err = -ENOMEM; return 0; } dh = (struct qt_disk_dqdbheader *)buf; if (info->dqi_free_entry) { blk = info->dqi_free_entry; *err = read_blk(info, blk, buf); if (*err < 0) goto out_buf; *err = check_dquot_block_header(info, dh); if (*err) goto out_buf; } else { blk = get_free_dqblk(info); if ((int)blk < 0) { *err = blk; kfree(buf); return 0; } memset(buf, 0, info->dqi_usable_bs); /* This is enough as the block is already zeroed and the entry * list is empty... */ info->dqi_free_entry = blk; mark_info_dirty(dquot->dq_sb, dquot->dq_id.type); } /* Block will be full? */ if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { *err = remove_free_dqentry(info, buf, blk); if (*err < 0) { quota_error(dquot->dq_sb, "Can't remove block (%u) " "from entry free list", blk); goto out_buf; } } le16_add_cpu(&dh->dqdh_entries, 1); /* Find free structure in block */ ddquot = buf + sizeof(struct qt_disk_dqdbheader); for (i = 0; i < qtree_dqstr_in_blk(info); i++) { if (qtree_entry_unused(info, ddquot)) break; ddquot += info->dqi_entry_size; } #ifdef __QUOTA_QT_PARANOIA if (i == qtree_dqstr_in_blk(info)) { quota_error(dquot->dq_sb, "Data block full but it shouldn't"); *err = -EIO; goto out_buf; } #endif *err = write_blk(info, blk, buf); if (*err < 0) { quota_error(dquot->dq_sb, "Can't write quota data block %u", blk); goto out_buf; } dquot->dq_off = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size; kfree(buf); return blk; out_buf: kfree(buf); return 0; } /* Insert reference to structure into the trie */ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint *blks, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); int ret = 0, newson = 0, newact = 0; __le32 *ref; uint newblk; int i; if (!buf) return -ENOMEM; if (!blks[depth]) { ret = get_free_dqblk(info); if (ret < 0) goto out_buf; for (i = 0; i < depth; i++) if (ret == blks[i]) { quota_error(dquot->dq_sb, "Free block already used in tree: block %u", ret); ret = -EIO; goto out_buf; } blks[depth] = ret; memset(buf, 0, info->dqi_usable_bs); newact = 1; } else { ret = read_blk(info, blks[depth], buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read tree quota " "block %u", blks[depth]); goto out_buf; } } ref = (__le32 *)buf; newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); ret = do_check_range(dquot->dq_sb, "block", newblk, 0, info->dqi_blocks - 1); if (ret) goto out_buf; if (!newblk) { newson = 1; } else { for (i = 0; i <= depth; i++) if (newblk == blks[i]) { quota_error(dquot->dq_sb, "Cycle in quota tree detected: block %u index %u", blks[depth], get_index(info, dquot->dq_id, depth)); ret = -EIO; goto out_buf; } } blks[depth + 1] = newblk; if (depth == info->dqi_qtree_depth - 1) { #ifdef __QUOTA_QT_PARANOIA if (newblk) { quota_error(dquot->dq_sb, "Inserting already present " "quota entry (block %u)", le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)])); ret = -EIO; goto out_buf; } #endif blks[depth + 1] = find_free_dqentry(info, dquot, &ret); } else { ret = do_insert_tree(info, dquot, blks, depth + 1); } if (newson && ret >= 0) { ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(blks[depth + 1]); ret = write_blk(info, blks[depth], buf); } else if (newact && ret < 0) { put_free_dqblk(info, buf, blks[depth]); } out_buf: kfree(buf); return ret; } /* Wrapper for inserting quota structure into tree */ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot) { uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF }; #ifdef __QUOTA_QT_PARANOIA if (info->dqi_blocks <= QT_TREEOFF) { quota_error(dquot->dq_sb, "Quota tree root isn't allocated!"); return -EIO; } #endif if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) { quota_error(dquot->dq_sb, "Quota tree depth too big!"); return -EIO; } return do_insert_tree(info, dquot, blks, 0); } /* * We don't have to be afraid of deadlocks as we never have quotas on quota * files... */ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { int type = dquot->dq_id.type; struct super_block *sb = dquot->dq_sb; ssize_t ret; char *ddquot = kmalloc(info->dqi_entry_size, GFP_KERNEL); if (!ddquot) return -ENOMEM; /* dq_off is guarded by dqio_sem */ if (!dquot->dq_off) { ret = dq_insert_tree(info, dquot); if (ret < 0) { quota_error(sb, "Error %zd occurred while creating " "quota", ret); kfree(ddquot); return ret; } } spin_lock(&dquot->dq_dqb_lock); info->dqi_ops->mem2disk_dqblk(ddquot, dquot); spin_unlock(&dquot->dq_dqb_lock); ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, dquot->dq_off); if (ret != info->dqi_entry_size) { quota_error(sb, "dquota write failed"); if (ret >= 0) ret = -ENOSPC; } else { ret = 0; } dqstats_inc(DQST_WRITES); kfree(ddquot); return ret; } EXPORT_SYMBOL(qtree_write_dquot); /* Free dquot entry in data block */ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk) { struct qt_disk_dqdbheader *dh; char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); int ret = 0; if (!buf) return -ENOMEM; if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { quota_error(dquot->dq_sb, "Quota structure has offset to " "other block (%u) than it should (%u)", blk, (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); ret = -EIO; goto out_buf; } ret = read_blk(info, blk, buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read quota data block %u", blk); goto out_buf; } dh = (struct qt_disk_dqdbheader *)buf; ret = check_dquot_block_header(info, dh); if (ret) goto out_buf; le16_add_cpu(&dh->dqdh_entries, -1); if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ ret = remove_free_dqentry(info, buf, blk); if (ret >= 0) ret = put_free_dqblk(info, buf, blk); if (ret < 0) { quota_error(dquot->dq_sb, "Can't move quota data block " "(%u) to free list", blk); goto out_buf; } } else { memset(buf + (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)), 0, info->dqi_entry_size); if (le16_to_cpu(dh->dqdh_entries) == qtree_dqstr_in_blk(info) - 1) { /* Insert will write block itself */ ret = insert_free_dqentry(info, buf, blk); if (ret < 0) { quota_error(dquot->dq_sb, "Can't insert quota " "data block (%u) to free entry list", blk); goto out_buf; } } else { ret = write_blk(info, blk, buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't write quota " "data block %u", blk); goto out_buf; } } } dquot->dq_off = 0; /* Quota is now unattached */ out_buf: kfree(buf); return ret; } /* Remove reference to dquot from tree */ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint *blks, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); int ret = 0; uint newblk; __le32 *ref = (__le32 *)buf; int i; if (!buf) return -ENOMEM; ret = read_blk(info, blks[depth], buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read quota data block %u", blks[depth]); goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); ret = do_check_range(dquot->dq_sb, "block", newblk, QT_TREEOFF, info->dqi_blocks - 1); if (ret) goto out_buf; for (i = 0; i <= depth; i++) if (newblk == blks[i]) { quota_error(dquot->dq_sb, "Cycle in quota tree detected: block %u index %u", blks[depth], get_index(info, dquot->dq_id, depth)); ret = -EIO; goto out_buf; } if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); blks[depth + 1] = 0; } else { blks[depth + 1] = newblk; ret = remove_tree(info, dquot, blks, depth + 1); } if (ret >= 0 && !blks[depth + 1]) { ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0); /* Block got empty? */ for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++) ; /* Don't put the root block into the free block list */ if (i == (info->dqi_usable_bs >> 2) && blks[depth] != QT_TREEOFF) { put_free_dqblk(info, buf, blks[depth]); blks[depth] = 0; } else { ret = write_blk(info, blks[depth], buf); if (ret < 0) quota_error(dquot->dq_sb, "Can't write quota tree block %u", blks[depth]); } } out_buf: kfree(buf); return ret; } /* Delete dquot from tree */ int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF }; if (!dquot->dq_off) /* Even not allocated? */ return 0; if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) { quota_error(dquot->dq_sb, "Quota tree depth too big!"); return -EIO; } return remove_tree(info, dquot, blks, 0); } EXPORT_SYMBOL(qtree_delete_dquot); /* Find entry in block */ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk) { char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); loff_t ret = 0; int i; char *ddquot; if (!buf) return -ENOMEM; ret = read_blk(info, blk, buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read quota tree " "block %u", blk); goto out_buf; } ddquot = buf + sizeof(struct qt_disk_dqdbheader); for (i = 0; i < qtree_dqstr_in_blk(info); i++) { if (info->dqi_ops->is_id(ddquot, dquot)) break; ddquot += info->dqi_entry_size; } if (i == qtree_dqstr_in_blk(info)) { quota_error(dquot->dq_sb, "Quota for id %u referenced but not present", from_kqid(&init_user_ns, dquot->dq_id)); ret = -EIO; goto out_buf; } else { ret = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size; } out_buf: kfree(buf); return ret; } /* Find entry for given id in the tree */ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint *blks, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); loff_t ret = 0; __le32 *ref = (__le32 *)buf; uint blk; int i; if (!buf) return -ENOMEM; ret = read_blk(info, blks[depth], buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read quota tree block %u", blks[depth]); goto out_buf; } ret = 0; blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? */ goto out_buf; ret = do_check_range(dquot->dq_sb, "block", blk, QT_TREEOFF, info->dqi_blocks - 1); if (ret) goto out_buf; /* Check for cycles in the tree */ for (i = 0; i <= depth; i++) if (blk == blks[i]) { quota_error(dquot->dq_sb, "Cycle in quota tree detected: block %u index %u", blks[depth], get_index(info, dquot->dq_id, depth)); ret = -EIO; goto out_buf; } blks[depth + 1] = blk; if (depth < info->dqi_qtree_depth - 1) ret = find_tree_dqentry(info, dquot, blks, depth + 1); else ret = find_block_dqentry(info, dquot, blk); out_buf: kfree(buf); return ret; } /* Find entry for given id in the tree - wrapper function */ static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot) { uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF }; if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) { quota_error(dquot->dq_sb, "Quota tree depth too big!"); return -EIO; } return find_tree_dqentry(info, dquot, blks, 0); } int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { int type = dquot->dq_id.type; struct super_block *sb = dquot->dq_sb; loff_t offset; char *ddquot; int ret = 0; #ifdef __QUOTA_QT_PARANOIA /* Invalidated quota? */ if (!sb_dqopt(dquot->dq_sb)->files[type]) { quota_error(sb, "Quota invalidated while reading!"); return -EIO; } #endif /* Do we know offset of the dquot entry in the quota file? */ if (!dquot->dq_off) { offset = find_dqentry(info, dquot); if (offset <= 0) { /* Entry not present? */ if (offset < 0) quota_error(sb,"Can't read quota structure " "for id %u", from_kqid(&init_user_ns, dquot->dq_id)); dquot->dq_off = 0; set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); ret = offset; goto out; } dquot->dq_off = offset; } ddquot = kmalloc(info->dqi_entry_size, GFP_KERNEL); if (!ddquot) return -ENOMEM; ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size, dquot->dq_off); if (ret != info->dqi_entry_size) { if (ret >= 0) ret = -EIO; quota_error(sb, "Error while reading quota structure for id %u", from_kqid(&init_user_ns, dquot->dq_id)); set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); kfree(ddquot); goto out; } spin_lock(&dquot->dq_dqb_lock); info->dqi_ops->disk2mem_dqblk(dquot, ddquot); if (!dquot->dq_dqb.dqb_bhardlimit && !dquot->dq_dqb.dqb_bsoftlimit && !dquot->dq_dqb.dqb_ihardlimit && !dquot->dq_dqb.dqb_isoftlimit) set_bit(DQ_FAKE_B, &dquot->dq_flags); spin_unlock(&dquot->dq_dqb_lock); kfree(ddquot); out: dqstats_inc(DQST_READS); return ret; } EXPORT_SYMBOL(qtree_read_dquot); /* Check whether dquot should not be deleted. We know we are * the only one operating on dquot (thanks to dq_lock) */ int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) return qtree_delete_dquot(info, dquot); return 0; } EXPORT_SYMBOL(qtree_release_dquot); static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id, unsigned int blk, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); __le32 *ref = (__le32 *)buf; ssize_t ret; unsigned int epb = info->dqi_usable_bs >> 2; unsigned int level_inc = 1; int i; if (!buf) return -ENOMEM; for (i = depth; i < info->dqi_qtree_depth - 1; i++) level_inc *= epb; ret = read_blk(info, blk, buf); if (ret < 0) { quota_error(info->dqi_sb, "Can't read quota tree block %u", blk); goto out_buf; } for (i = __get_index(info, *id, depth); i < epb; i++) { uint blk_no = le32_to_cpu(ref[i]); if (blk_no == 0) { *id += level_inc; continue; } ret = do_check_range(info->dqi_sb, "block", blk_no, 0, info->dqi_blocks - 1); if (ret) goto out_buf; if (depth == info->dqi_qtree_depth - 1) { ret = 0; goto out_buf; } ret = find_next_id(info, id, blk_no, depth + 1); if (ret != -ENOENT) break; } if (i == epb) { ret = -ENOENT; goto out_buf; } out_buf: kfree(buf); return ret; } int qtree_get_next_id(struct qtree_mem_dqinfo *info, struct kqid *qid) { qid_t id = from_kqid(&init_user_ns, *qid); int ret; ret = find_next_id(info, &id, QT_TREEOFF, 0); if (ret < 0) return ret; *qid = make_kqid(&init_user_ns, qid->type, id); return 0; } EXPORT_SYMBOL(qtree_get_next_id);
63 1 62 11 11 55 19 1 39 39 35 10 25 25 4437 4433 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 /* Copyright (c) 2018, Mellanox Technologies All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <crypto/aead.h> #include <linux/highmem.h> #include <linux/module.h> #include <linux/netdevice.h> #include <net/dst.h> #include <net/inet_connection_sock.h> #include <net/tcp.h> #include <net/tls.h> #include <linux/skbuff_ref.h> #include "tls.h" #include "trace.h" /* device_offload_lock is used to synchronize tls_dev_add * against NETDEV_DOWN notifications. */ static DECLARE_RWSEM(device_offload_lock); static struct workqueue_struct *destruct_wq __read_mostly; static LIST_HEAD(tls_device_list); static LIST_HEAD(tls_device_down_list); static DEFINE_SPINLOCK(tls_device_lock); static struct page *dummy_page; static void tls_device_free_ctx(struct tls_context *ctx) { if (ctx->tx_conf == TLS_HW) kfree(tls_offload_ctx_tx(ctx)); if (ctx->rx_conf == TLS_HW) kfree(tls_offload_ctx_rx(ctx)); tls_ctx_free(NULL, ctx); } static void tls_device_tx_del_task(struct work_struct *work) { struct tls_offload_context_tx *offload_ctx = container_of(work, struct tls_offload_context_tx, destruct_work); struct tls_context *ctx = offload_ctx->ctx; struct net_device *netdev; /* Safe, because this is the destroy flow, refcount is 0, so * tls_device_down can't store this field in parallel. */ netdev = rcu_dereference_protected(ctx->netdev, !refcount_read(&ctx->refcount)); netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); dev_put(netdev); ctx->netdev = NULL; tls_device_free_ctx(ctx); } static void tls_device_queue_ctx_destruction(struct tls_context *ctx) { struct net_device *netdev; unsigned long flags; bool async_cleanup; spin_lock_irqsave(&tls_device_lock, flags); if (unlikely(!refcount_dec_and_test(&ctx->refcount))) { spin_unlock_irqrestore(&tls_device_lock, flags); return; } list_del(&ctx->list); /* Remove from tls_device_list / tls_device_down_list */ /* Safe, because this is the destroy flow, refcount is 0, so * tls_device_down can't store this field in parallel. */ netdev = rcu_dereference_protected(ctx->netdev, !refcount_read(&ctx->refcount)); async_cleanup = netdev && ctx->tx_conf == TLS_HW; if (async_cleanup) { struct tls_offload_context_tx *offload_ctx = tls_offload_ctx_tx(ctx); /* queue_work inside the spinlock * to make sure tls_device_down waits for that work. */ queue_work(destruct_wq, &offload_ctx->destruct_work); } spin_unlock_irqrestore(&tls_device_lock, flags); if (!async_cleanup) tls_device_free_ctx(ctx); } /* We assume that the socket is already connected */ static struct net_device *get_netdev_for_sock(struct sock *sk) { struct net_device *dev, *lowest_dev = NULL; struct dst_entry *dst; rcu_read_lock(); dst = __sk_dst_get(sk); dev = dst ? dst_dev_rcu(dst) : NULL; if (likely(dev)) { lowest_dev = netdev_sk_get_lowest_dev(dev, sk); dev_hold(lowest_dev); } rcu_read_unlock(); return lowest_dev; } static void destroy_record(struct tls_record_info *record) { int i; for (i = 0; i < record->num_frags; i++) __skb_frag_unref(&record->frags[i], false); kfree(record); } static void delete_all_records(struct tls_offload_context_tx *offload_ctx) { struct tls_record_info *info, *temp; list_for_each_entry_safe(info, temp, &offload_ctx->records_list, list) { list_del(&info->list); destroy_record(info); } offload_ctx->retransmit_hint = NULL; } static void tls_tcp_clean_acked(struct sock *sk, u32 acked_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_record_info *info, *temp; struct tls_offload_context_tx *ctx; u64 deleted_records = 0; unsigned long flags; if (!tls_ctx) return; ctx = tls_offload_ctx_tx(tls_ctx); spin_lock_irqsave(&ctx->lock, flags); info = ctx->retransmit_hint; if (info && !before(acked_seq, info->end_seq)) ctx->retransmit_hint = NULL; list_for_each_entry_safe(info, temp, &ctx->records_list, list) { if (before(acked_seq, info->end_seq)) break; list_del(&info->list); destroy_record(info); deleted_records++; } ctx->unacked_record_sn += deleted_records; spin_unlock_irqrestore(&ctx->lock, flags); } /* At this point, there should be no references on this * socket and no in-flight SKBs associated with this * socket, so it is safe to free all the resources. */ void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); tls_ctx->sk_destruct(sk); if (tls_ctx->tx_conf == TLS_HW) { if (ctx->open_record) destroy_record(ctx->open_record); delete_all_records(ctx); crypto_free_aead(ctx->aead_send); clean_acked_data_disable(tcp_sk(sk)); } tls_device_queue_ctx_destruction(tls_ctx); } EXPORT_SYMBOL_GPL(tls_device_sk_destruct); void tls_device_free_resources_tx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); tls_free_partial_record(sk, tls_ctx); } void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); trace_tls_device_tx_resync_req(sk, got_seq, exp_seq); WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags)); } EXPORT_SYMBOL_GPL(tls_offload_tx_resync_request); static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, u32 seq) { struct net_device *netdev; int err = 0; u8 *rcd_sn; tcp_write_collapse_fence(sk); rcd_sn = tls_ctx->tx.rec_seq; trace_tls_device_tx_resync_send(sk, seq, rcd_sn); down_read(&device_offload_lock); netdev = rcu_dereference_protected(tls_ctx->netdev, lockdep_is_held(&device_offload_lock)); if (netdev) err = netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_TX); up_read(&device_offload_lock); if (err) return; clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags); } static void tls_append_frag(struct tls_record_info *record, struct page_frag *pfrag, int size) { skb_frag_t *frag; frag = &record->frags[record->num_frags - 1]; if (skb_frag_page(frag) == pfrag->page && skb_frag_off(frag) + skb_frag_size(frag) == pfrag->offset) { skb_frag_size_add(frag, size); } else { ++frag; skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, size); ++record->num_frags; get_page(pfrag->page); } pfrag->offset += size; record->len += size; } static int tls_push_record(struct sock *sk, struct tls_context *ctx, struct tls_offload_context_tx *offload_ctx, struct tls_record_info *record, int flags) { struct tls_prot_info *prot = &ctx->prot_info; struct tcp_sock *tp = tcp_sk(sk); skb_frag_t *frag; int i; record->end_seq = tp->write_seq + record->len; list_add_tail_rcu(&record->list, &offload_ctx->records_list); offload_ctx->open_record = NULL; if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags)) tls_device_resync_tx(sk, ctx, tp->write_seq); tls_advance_record_sn(sk, prot, &ctx->tx); for (i = 0; i < record->num_frags; i++) { frag = &record->frags[i]; sg_unmark_end(&offload_ctx->sg_tx_data[i]); sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag), skb_frag_size(frag), skb_frag_off(frag)); sk_mem_charge(sk, skb_frag_size(frag)); get_page(skb_frag_page(frag)); } sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]); /* all ready, send */ return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); } static void tls_device_record_close(struct sock *sk, struct tls_context *ctx, struct tls_record_info *record, struct page_frag *pfrag, unsigned char record_type) { struct tls_prot_info *prot = &ctx->prot_info; struct page_frag dummy_tag_frag; /* append tag * device will fill in the tag, we just need to append a placeholder * use socket memory to improve coalescing (re-using a single buffer * increases frag count) * if we can't allocate memory now use the dummy page */ if (unlikely(pfrag->size - pfrag->offset < prot->tag_size) && !skb_page_frag_refill(prot->tag_size, pfrag, sk->sk_allocation)) { dummy_tag_frag.page = dummy_page; dummy_tag_frag.offset = 0; pfrag = &dummy_tag_frag; } tls_append_frag(record, pfrag, prot->tag_size); /* fill prepend */ tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]), record->len - prot->overhead_size, record_type); } static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { struct tls_record_info *record; skb_frag_t *frag; record = kmalloc(sizeof(*record), GFP_KERNEL); if (!record) return -ENOMEM; frag = &record->frags[0]; skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, prepend_size); get_page(pfrag->page); pfrag->offset += prepend_size; record->num_frags = 1; record->len = prepend_size; offload_ctx->open_record = record; return 0; } static int tls_do_allocation(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { int ret; if (!offload_ctx->open_record) { if (unlikely(!skb_page_frag_refill(prepend_size, pfrag, sk->sk_allocation))) { READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return -ENOMEM; } ret = tls_create_new_record(offload_ctx, pfrag, prepend_size); if (ret) return ret; if (pfrag->size > pfrag->offset) return 0; } if (!sk_page_frag_refill(sk, pfrag)) return -ENOMEM; return 0; } static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) { size_t pre_copy, nocache; pre_copy = ~((unsigned long)addr - 1) & (SMP_CACHE_BYTES - 1); if (pre_copy) { pre_copy = min(pre_copy, bytes); if (copy_from_iter(addr, pre_copy, i) != pre_copy) return -EFAULT; bytes -= pre_copy; addr += pre_copy; } nocache = round_down(bytes, SMP_CACHE_BYTES); if (copy_from_iter_nocache(addr, nocache, i) != nocache) return -EFAULT; bytes -= nocache; addr += nocache; if (bytes && copy_from_iter(addr, bytes, i) != bytes) return -EFAULT; return 0; } static int tls_push_data(struct sock *sk, struct iov_iter *iter, size_t size, int flags, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); struct tls_record_info *record; int tls_push_record_flags; struct page_frag *pfrag; size_t orig_size = size; u32 max_open_record_len; bool more = false; bool done = false; int copy, rc = 0; long timeo; if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SPLICE_PAGES | MSG_EOR)) return -EOPNOTSUPP; if ((flags & (MSG_MORE | MSG_EOR)) == (MSG_MORE | MSG_EOR)) return -EINVAL; if (unlikely(sk->sk_err)) return -sk->sk_err; flags |= MSG_SENDPAGE_DECRYPTED; tls_push_record_flags = flags | MSG_MORE; timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); if (tls_is_partially_sent_record(tls_ctx)) { rc = tls_push_partial_record(sk, tls_ctx, flags); if (rc < 0) return rc; } pfrag = sk_page_frag(sk); /* TLS_HEADER_SIZE is not counted as part of the TLS record, and * we need to leave room for an authentication tag. */ max_open_record_len = TLS_MAX_PAYLOAD_SIZE + prot->prepend_size; do { rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); if (unlikely(rc)) { rc = sk_stream_wait_memory(sk, &timeo); if (!rc) continue; record = ctx->open_record; if (!record) break; handle_error: if (record_type != TLS_RECORD_TYPE_DATA) { /* avoid sending partial * record with type != * application_data */ size = orig_size; destroy_record(record); ctx->open_record = NULL; } else if (record->len > prot->prepend_size) { goto last_record; } break; } record = ctx->open_record; copy = min_t(size_t, size, max_open_record_len - record->len); if (copy && (flags & MSG_SPLICE_PAGES)) { struct page_frag zc_pfrag; struct page **pages = &zc_pfrag.page; size_t off; rc = iov_iter_extract_pages(iter, &pages, copy, 1, 0, &off); if (rc <= 0) { if (rc == 0) rc = -EIO; goto handle_error; } copy = rc; if (WARN_ON_ONCE(!sendpage_ok(zc_pfrag.page))) { iov_iter_revert(iter, copy); rc = -EIO; goto handle_error; } zc_pfrag.offset = off; zc_pfrag.size = copy; tls_append_frag(record, &zc_pfrag, copy); } else if (copy) { copy = min_t(size_t, copy, pfrag->size - pfrag->offset); rc = tls_device_copy_data(page_address(pfrag->page) + pfrag->offset, copy, iter); if (rc) goto handle_error; tls_append_frag(record, pfrag, copy); } size -= copy; if (!size) { last_record: tls_push_record_flags = flags; if (flags & MSG_MORE) { more = true; break; } done = true; } if (done || record->len >= max_open_record_len || (record->num_frags >= MAX_SKB_FRAGS - 1)) { tls_device_record_close(sk, tls_ctx, record, pfrag, record_type); rc = tls_push_record(sk, tls_ctx, ctx, record, tls_push_record_flags); if (rc < 0) break; } } while (!done); tls_ctx->pending_open_record_frags = more; if (orig_size - size > 0) rc = orig_size - size; return rc; } int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { unsigned char record_type = TLS_RECORD_TYPE_DATA; struct tls_context *tls_ctx = tls_get_ctx(sk); int rc; if (!tls_ctx->zerocopy_sendfile) msg->msg_flags &= ~MSG_SPLICE_PAGES; mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (unlikely(msg->msg_controllen)) { rc = tls_process_cmsg(sk, msg, &record_type); if (rc) goto out; } rc = tls_push_data(sk, &msg->msg_iter, size, msg->msg_flags, record_type); out: release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); return rc; } void tls_device_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct iov_iter iter = {}; if (!tls_is_partially_sent_record(tls_ctx)) return; mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (tls_is_partially_sent_record(tls_ctx)) { iov_iter_bvec(&iter, ITER_SOURCE, NULL, 0, 0); tls_push_data(sk, &iter, 0, 0, TLS_RECORD_TYPE_DATA); } release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); } struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { u64 record_sn = context->hint_record_sn; struct tls_record_info *info, *last; info = context->retransmit_hint; if (!info || before(seq, info->end_seq - info->len)) { /* if retransmit_hint is irrelevant start * from the beginning of the list */ info = list_first_entry_or_null(&context->records_list, struct tls_record_info, list); if (!info) return NULL; /* send the start_marker record if seq number is before the * tls offload start marker sequence number. This record is * required to handle TCP packets which are before TLS offload * started. * And if it's not start marker, look if this seq number * belongs to the list. */ if (likely(!tls_record_is_start_marker(info))) { /* we have the first record, get the last record to see * if this seq number belongs to the list. */ last = list_last_entry(&context->records_list, struct tls_record_info, list); if (!between(seq, tls_record_start_seq(info), last->end_seq)) return NULL; } record_sn = context->unacked_record_sn; } /* We just need the _rcu for the READ_ONCE() */ rcu_read_lock(); list_for_each_entry_from_rcu(info, &context->records_list, list) { if (before(seq, info->end_seq)) { if (!context->retransmit_hint || after(info->end_seq, context->retransmit_hint->end_seq)) { context->hint_record_sn = record_sn; context->retransmit_hint = info; } *p_record_sn = record_sn; goto exit_rcu_unlock; } record_sn++; } info = NULL; exit_rcu_unlock: rcu_read_unlock(); return info; } EXPORT_SYMBOL(tls_get_record); static int tls_device_push_pending_record(struct sock *sk, int flags) { struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, NULL, 0, 0); return tls_push_data(sk, &iter, 0, flags, TLS_RECORD_TYPE_DATA); } void tls_device_write_space(struct sock *sk, struct tls_context *ctx) { if (tls_is_partially_sent_record(ctx)) { gfp_t sk_allocation = sk->sk_allocation; WARN_ON_ONCE(sk->sk_write_pending); sk->sk_allocation = GFP_ATOMIC; tls_push_partial_record(sk, ctx, MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_DECRYPTED); sk->sk_allocation = sk_allocation; } } static void tls_device_resync_rx(struct tls_context *tls_ctx, struct sock *sk, u32 seq, u8 *rcd_sn) { struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); struct net_device *netdev; trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type); rcu_read_lock(); netdev = rcu_dereference(tls_ctx->netdev); if (netdev) netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_RX); rcu_read_unlock(); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC); } static bool tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async, s64 resync_req, u32 *seq, u16 *rcd_delta) { u32 is_async = resync_req & RESYNC_REQ_ASYNC; u32 req_seq = resync_req >> 32; u32 req_end = req_seq + ((resync_req >> 16) & 0xffff); u16 i; *rcd_delta = 0; if (is_async) { /* shouldn't get to wraparound: * too long in async stage, something bad happened */ if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) { tls_offload_rx_resync_async_request_cancel(resync_async); return false; } /* asynchronous stage: log all headers seq such that * req_seq <= seq <= end_seq, and wait for real resync request */ if (before(*seq, req_seq)) return false; if (!after(*seq, req_end) && resync_async->loglen < TLS_DEVICE_RESYNC_ASYNC_LOGMAX) resync_async->log[resync_async->loglen++] = *seq; resync_async->rcd_delta++; return false; } /* synchronous stage: check against the logged entries and * proceed to check the next entries if no match was found */ for (i = 0; i < resync_async->loglen; i++) if (req_seq == resync_async->log[i] && atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) { *rcd_delta = resync_async->rcd_delta - i; *seq = req_seq; resync_async->loglen = 0; resync_async->rcd_delta = 0; return true; } resync_async->loglen = 0; resync_async->rcd_delta = 0; if (req_seq == *seq && atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) return true; return false; } void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; u16 rcd_delta; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) return; if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) return; prot = &tls_ctx->prot_info; rx_ctx = tls_offload_ctx_rx(tls_ctx); memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); switch (rx_ctx->resync_type) { case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ: resync_req = atomic64_read(&rx_ctx->resync_req); req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; is_req_pending = resync_req; if (likely(!is_req_pending) || req_seq != seq || !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) return; break; case TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT: if (likely(!rx_ctx->resync_nh_do_now)) return; /* head of next rec is already in, note that the sock_inq will * include the currently parsed message when called from parser */ sock_data = tcp_inq(sk); if (sock_data > rcd_len) { trace_tls_device_rx_resync_nh_delay(sk, sock_data, rcd_len); return; } rx_ctx->resync_nh_do_now = 0; seq += rcd_len; tls_bigint_increment(rcd_sn, prot->rec_seq_size); break; case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC: resync_req = atomic64_read(&rx_ctx->resync_async->req); is_req_pending = resync_req; if (likely(!is_req_pending)) return; if (!tls_device_rx_resync_async(rx_ctx->resync_async, resync_req, &seq, &rcd_delta)) return; tls_bigint_subtract(rcd_sn, rcd_delta); break; } tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); } static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, struct tls_offload_context_rx *ctx, struct sock *sk, struct sk_buff *skb) { struct strp_msg *rxm; /* device will request resyncs by itself based on stream scan */ if (ctx->resync_type != TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT) return; /* already scheduled */ if (ctx->resync_nh_do_now) return; /* seen decrypted fragments since last fully-failed record */ if (ctx->resync_nh_reset) { ctx->resync_nh_reset = 0; ctx->resync_nh.decrypted_failed = 1; ctx->resync_nh.decrypted_tgt = TLS_DEVICE_RESYNC_NH_START_IVAL; return; } if (++ctx->resync_nh.decrypted_failed <= ctx->resync_nh.decrypted_tgt) return; /* doing resync, bump the next target in case it fails */ if (ctx->resync_nh.decrypted_tgt < TLS_DEVICE_RESYNC_NH_MAX_IVAL) ctx->resync_nh.decrypted_tgt *= 2; else ctx->resync_nh.decrypted_tgt += TLS_DEVICE_RESYNC_NH_MAX_IVAL; rxm = strp_msg(skb); /* head of next rec is already in, parser will sync for us */ if (tcp_inq(sk) > rxm->full_len) { trace_tls_device_rx_resync_nh_schedule(sk); ctx->resync_nh_do_now = 1; } else { struct tls_prot_info *prot = &tls_ctx->prot_info; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); tls_bigint_increment(rcd_sn, prot->rec_seq_size); tls_device_resync_rx(tls_ctx, sk, tcp_sk(sk)->copied_seq, rcd_sn); } } static int tls_device_reencrypt(struct sock *sk, struct tls_context *tls_ctx) { struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); const struct tls_cipher_desc *cipher_desc; int err, offset, copy, data_len, pos; struct sk_buff *skb, *skb_iter; struct scatterlist sg[1]; struct strp_msg *rxm; char *orig_buf, *buf; cipher_desc = get_cipher_desc(tls_ctx->crypto_recv.info.cipher_type); DEBUG_NET_WARN_ON_ONCE(!cipher_desc || !cipher_desc->offloadable); rxm = strp_msg(tls_strp_msg(sw_ctx)); orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv, sk->sk_allocation); if (!orig_buf) return -ENOMEM; buf = orig_buf; err = tls_strp_msg_cow(sw_ctx); if (unlikely(err)) goto free_buf; skb = tls_strp_msg(sw_ctx); rxm = strp_msg(skb); offset = rxm->offset; sg_init_table(sg, 1); sg_set_buf(&sg[0], buf, rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv); err = skb_copy_bits(skb, offset, buf, TLS_HEADER_SIZE + cipher_desc->iv); if (err) goto free_buf; /* We are interested only in the decrypted data not the auth */ err = decrypt_skb(sk, sg); if (err != -EBADMSG) goto free_buf; else err = 0; data_len = rxm->full_len - cipher_desc->tag; if (skb_pagelen(skb) > offset) { copy = min_t(int, skb_pagelen(skb) - offset, data_len); if (skb->decrypted) { err = skb_store_bits(skb, offset, buf, copy); if (err) goto free_buf; } offset += copy; buf += copy; } pos = skb_pagelen(skb); skb_walk_frags(skb, skb_iter) { int frag_pos; /* Practically all frags must belong to msg if reencrypt * is needed with current strparser and coalescing logic, * but strparser may "get optimized", so let's be safe. */ if (pos + skb_iter->len <= offset) goto done_with_frag; if (pos >= data_len + rxm->offset) break; frag_pos = offset - pos; copy = min_t(int, skb_iter->len - frag_pos, data_len + rxm->offset - offset); if (skb_iter->decrypted) { err = skb_store_bits(skb_iter, frag_pos, buf, copy); if (err) goto free_buf; } offset += copy; buf += copy; done_with_frag: pos += skb_iter->len; } free_buf: kfree(orig_buf); return err; } int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) { struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); struct sk_buff *skb = tls_strp_msg(sw_ctx); struct strp_msg *rxm = strp_msg(skb); int is_decrypted, is_encrypted; if (!tls_strp_msg_mixed_decrypted(sw_ctx)) { is_decrypted = skb->decrypted; is_encrypted = !is_decrypted; } else { is_decrypted = 0; is_encrypted = 0; } trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len, tls_ctx->rx.rec_seq, rxm->full_len, is_encrypted, is_decrypted); if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) { if (likely(is_encrypted || is_decrypted)) return is_decrypted; /* After tls_device_down disables the offload, the next SKB will * likely have initial fragments decrypted, and final ones not * decrypted. We need to reencrypt that single SKB. */ return tls_device_reencrypt(sk, tls_ctx); } /* Return immediately if the record is either entirely plaintext or * entirely ciphertext. Otherwise handle reencrypt partially decrypted * record. */ if (is_decrypted) { ctx->resync_nh_reset = 1; return is_decrypted; } if (is_encrypted) { tls_device_core_ctrl_rx_resync(tls_ctx, ctx, sk, skb); return 0; } ctx->resync_nh_reset = 1; return tls_device_reencrypt(sk, tls_ctx); } static void tls_device_attach(struct tls_context *ctx, struct sock *sk, struct net_device *netdev) { if (sk->sk_destruct != tls_device_sk_destruct) { refcount_set(&ctx->refcount, 1); dev_hold(netdev); RCU_INIT_POINTER(ctx->netdev, netdev); spin_lock_irq(&tls_device_lock); list_add_tail(&ctx->list, &tls_device_list); spin_unlock_irq(&tls_device_lock); ctx->sk_destruct = sk->sk_destruct; smp_store_release(&sk->sk_destruct, tls_device_sk_destruct); } } static struct tls_offload_context_tx *alloc_offload_ctx_tx(struct tls_context *ctx) { struct tls_offload_context_tx *offload_ctx; __be64 rcd_sn; offload_ctx = kzalloc(sizeof(*offload_ctx), GFP_KERNEL); if (!offload_ctx) return NULL; INIT_WORK(&offload_ctx->destruct_work, tls_device_tx_del_task); INIT_LIST_HEAD(&offload_ctx->records_list); spin_lock_init(&offload_ctx->lock); sg_init_table(offload_ctx->sg_tx_data, ARRAY_SIZE(offload_ctx->sg_tx_data)); /* start at rec_seq - 1 to account for the start marker record */ memcpy(&rcd_sn, ctx->tx.rec_seq, sizeof(rcd_sn)); offload_ctx->unacked_record_sn = be64_to_cpu(rcd_sn) - 1; offload_ctx->ctx = ctx; return offload_ctx; } int tls_set_device_offload(struct sock *sk) { struct tls_record_info *start_marker_record; struct tls_offload_context_tx *offload_ctx; const struct tls_cipher_desc *cipher_desc; struct tls_crypto_info *crypto_info; struct tls_prot_info *prot; struct net_device *netdev; struct tls_context *ctx; char *iv, *rec_seq; int rc; ctx = tls_get_ctx(sk); prot = &ctx->prot_info; if (ctx->priv_ctx_tx) return -EEXIST; netdev = get_netdev_for_sock(sk); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); return -EINVAL; } if (!(netdev->features & NETIF_F_HW_TLS_TX)) { rc = -EOPNOTSUPP; goto release_netdev; } crypto_info = &ctx->crypto_send.info; if (crypto_info->version != TLS_1_2_VERSION) { rc = -EOPNOTSUPP; goto release_netdev; } cipher_desc = get_cipher_desc(crypto_info->cipher_type); if (!cipher_desc || !cipher_desc->offloadable) { rc = -EINVAL; goto release_netdev; } rc = init_prot_info(prot, crypto_info, cipher_desc); if (rc) goto release_netdev; iv = crypto_info_iv(crypto_info, cipher_desc); rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc); memcpy(ctx->tx.iv + cipher_desc->salt, iv, cipher_desc->iv); memcpy(ctx->tx.rec_seq, rec_seq, cipher_desc->rec_seq); start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL); if (!start_marker_record) { rc = -ENOMEM; goto release_netdev; } offload_ctx = alloc_offload_ctx_tx(ctx); if (!offload_ctx) { rc = -ENOMEM; goto free_marker_record; } rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info); if (rc) goto free_offload_ctx; start_marker_record->end_seq = tcp_sk(sk)->write_seq; start_marker_record->len = 0; start_marker_record->num_frags = 0; list_add_tail(&start_marker_record->list, &offload_ctx->records_list); clean_acked_data_enable(tcp_sk(sk), &tls_tcp_clean_acked); ctx->push_pending_record = tls_device_push_pending_record; /* TLS offload is greatly simplified if we don't send * SKBs where only part of the payload needs to be encrypted. * So mark the last skb in the write queue as end of record. */ tcp_write_collapse_fence(sk); /* Avoid offloading if the device is down * We don't want to offload new flows after * the NETDEV_DOWN event * * device_offload_lock is taken in tls_devices's NETDEV_DOWN * handler thus protecting from the device going down before * ctx was added to tls_device_list. */ down_read(&device_offload_lock); if (!(netdev->flags & IFF_UP)) { rc = -EINVAL; goto release_lock; } ctx->priv_ctx_tx = offload_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, &ctx->crypto_send.info, tcp_sk(sk)->write_seq); trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_TX, tcp_sk(sk)->write_seq, rec_seq, rc); if (rc) goto release_lock; tls_device_attach(ctx, sk, netdev); up_read(&device_offload_lock); /* following this assignment tls_is_skb_tx_device_offloaded * will return true and the context might be accessed * by the netdev's xmit function. */ smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb); dev_put(netdev); return 0; release_lock: up_read(&device_offload_lock); clean_acked_data_disable(tcp_sk(sk)); crypto_free_aead(offload_ctx->aead_send); free_offload_ctx: kfree(offload_ctx); ctx->priv_ctx_tx = NULL; free_marker_record: kfree(start_marker_record); release_netdev: dev_put(netdev); return rc; } int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) { struct tls12_crypto_info_aes_gcm_128 *info; struct tls_offload_context_rx *context; struct net_device *netdev; int rc = 0; if (ctx->crypto_recv.info.version != TLS_1_2_VERSION) return -EOPNOTSUPP; netdev = get_netdev_for_sock(sk); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); return -EINVAL; } if (!(netdev->features & NETIF_F_HW_TLS_RX)) { rc = -EOPNOTSUPP; goto release_netdev; } /* Avoid offloading if the device is down * We don't want to offload new flows after * the NETDEV_DOWN event * * device_offload_lock is taken in tls_devices's NETDEV_DOWN * handler thus protecting from the device going down before * ctx was added to tls_device_list. */ down_read(&device_offload_lock); if (!(netdev->flags & IFF_UP)) { rc = -EINVAL; goto release_lock; } context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) { rc = -ENOMEM; goto release_lock; } context->resync_nh_reset = 1; ctx->priv_ctx_rx = context; rc = tls_set_sw_offload(sk, 0, NULL); if (rc) goto release_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); info = (void *)&ctx->crypto_recv.info; trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_RX, tcp_sk(sk)->copied_seq, info->rec_seq, rc); if (rc) goto free_sw_resources; tls_device_attach(ctx, sk, netdev); up_read(&device_offload_lock); dev_put(netdev); return 0; free_sw_resources: up_read(&device_offload_lock); tls_sw_free_resources_rx(sk); down_read(&device_offload_lock); release_ctx: ctx->priv_ctx_rx = NULL; release_lock: up_read(&device_offload_lock); release_netdev: dev_put(netdev); return rc; } void tls_device_offload_cleanup_rx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct net_device *netdev; down_read(&device_offload_lock); netdev = rcu_dereference_protected(tls_ctx->netdev, lockdep_is_held(&device_offload_lock)); if (!netdev) goto out; netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, TLS_OFFLOAD_CTX_DIR_RX); if (tls_ctx->tx_conf != TLS_HW) { dev_put(netdev); rcu_assign_pointer(tls_ctx->netdev, NULL); } else { set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags); } out: up_read(&device_offload_lock); tls_sw_release_resources_rx(sk); } static int tls_device_down(struct net_device *netdev) { struct tls_context *ctx, *tmp; unsigned long flags; LIST_HEAD(list); /* Request a write lock to block new offload attempts */ down_write(&device_offload_lock); spin_lock_irqsave(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) { struct net_device *ctx_netdev = rcu_dereference_protected(ctx->netdev, lockdep_is_held(&device_offload_lock)); if (ctx_netdev != netdev || !refcount_inc_not_zero(&ctx->refcount)) continue; list_move(&ctx->list, &list); } spin_unlock_irqrestore(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &list, list) { /* Stop offloaded TX and switch to the fallback. * tls_is_skb_tx_device_offloaded will return false. */ WRITE_ONCE(ctx->sk->sk_validate_xmit_skb, tls_validate_xmit_skb_sw); /* Stop the RX and TX resync. * tls_dev_resync must not be called after tls_dev_del. */ rcu_assign_pointer(ctx->netdev, NULL); /* Start skipping the RX resync logic completely. */ set_bit(TLS_RX_DEV_DEGRADED, &ctx->flags); /* Sync with inflight packets. After this point: * TX: no non-encrypted packets will be passed to the driver. * RX: resync requests from the driver will be ignored. */ synchronize_net(); /* Release the offload context on the driver side. */ if (ctx->tx_conf == TLS_HW) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); if (ctx->rx_conf == TLS_HW && !test_bit(TLS_RX_DEV_CLOSED, &ctx->flags)) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_RX); dev_put(netdev); /* Move the context to a separate list for two reasons: * 1. When the context is deallocated, list_del is called. * 2. It's no longer an offloaded context, so we don't want to * run offload-specific code on this context. */ spin_lock_irqsave(&tls_device_lock, flags); list_move_tail(&ctx->list, &tls_device_down_list); spin_unlock_irqrestore(&tls_device_lock, flags); /* Device contexts for RX and TX will be freed in on sk_destruct * by tls_device_free_ctx. rx_conf and tx_conf stay in TLS_HW. * Now release the ref taken above. */ if (refcount_dec_and_test(&ctx->refcount)) { /* sk_destruct ran after tls_device_down took a ref, and * it returned early. Complete the destruction here. */ list_del(&ctx->list); tls_device_free_ctx(ctx); } } up_write(&device_offload_lock); flush_workqueue(destruct_wq); return NOTIFY_DONE; } static int tls_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!dev->tlsdev_ops && !(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: if (netif_is_bond_master(dev)) return NOTIFY_DONE; if ((dev->features & NETIF_F_HW_TLS_RX) && !dev->tlsdev_ops->tls_dev_resync) return NOTIFY_BAD; if (dev->tlsdev_ops && dev->tlsdev_ops->tls_dev_add && dev->tlsdev_ops->tls_dev_del) return NOTIFY_DONE; else return NOTIFY_BAD; case NETDEV_DOWN: return tls_device_down(dev); } return NOTIFY_DONE; } static struct notifier_block tls_dev_notifier = { .notifier_call = tls_dev_event, }; int __init tls_device_init(void) { int err; dummy_page = alloc_page(GFP_KERNEL); if (!dummy_page) return -ENOMEM; destruct_wq = alloc_workqueue("ktls_device_destruct", WQ_PERCPU, 0); if (!destruct_wq) { err = -ENOMEM; goto err_free_dummy; } err = register_netdevice_notifier(&tls_dev_notifier); if (err) goto err_destroy_wq; return 0; err_destroy_wq: destroy_workqueue(destruct_wq); err_free_dummy: put_page(dummy_page); return err; } void __exit tls_device_cleanup(void) { unregister_netdevice_notifier(&tls_dev_notifier); destroy_workqueue(destruct_wq); clean_acked_data_flush(); put_page(dummy_page); }
57 5 56 36 10 32 49 49 1 25153 25153 63 6186 44 6178 6186 2 2 2573 438 65 7 43 62 36 48 62 62 32 52 57 57 57 378 375 377 378 379 19739 19741 19746 19740 37 19745 8960 8964 8968 8959 8965 25 352 350 351 350 165 450 445 444 447 4 450 826 827 5448 2 2 2 2 21 21 16 10 17 36 3 35 32 16 1 1 1 1 21 8 22 18 20 32 32 32 1 32 1 3 10 10 32 32 10 32 32 32 10 5 19561 19566 5 19551 15292 15284 34 15161 3321 3 3 3 3 36 36 36 4 52 51 52 7 52 51 52 1 7 4 41 9 17 1 1 11 28 25 28 63 18 4 32 16 43 19 31 53 63 63 4 19 32 51 19 35 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 // SPDX-License-Identifier: GPL-2.0 /* * Kernel timekeeping code and accessor functions. Based on code from * timer.c, moved in commit 8524070b7982. */ #include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/kobject.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/nmi.h> #include <linux/sched.h> #include <linux/sched/loadavg.h> #include <linux/sched/clock.h> #include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> #include <linux/timex.h> #include <linux/tick.h> #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> #include <linux/compiler.h> #include <linux/audit.h> #include <linux/random.h> #include <vdso/auxclock.h> #include "tick-internal.h" #include "ntp_internal.h" #include "timekeeping_internal.h" #define TK_CLEAR_NTP (1 << 0) #define TK_CLOCK_WAS_SET (1 << 1) #define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET) enum timekeeping_adv_mode { /* Update timekeeper when a tick has passed */ TK_ADV_TICK, /* Update timekeeper on a direct frequency change */ TK_ADV_FREQ }; /* * The most important data for readout fits into a single 64 byte * cache line. */ struct tk_data { seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; struct timekeeper shadow_timekeeper; raw_spinlock_t lock; } ____cacheline_aligned; static struct tk_data timekeeper_data[TIMEKEEPERS_MAX]; /* The core timekeeper */ #define tk_core (timekeeper_data[TIMEKEEPER_CORE]) #ifdef CONFIG_POSIX_AUX_CLOCKS static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) { return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts); } static inline bool tk_is_aux(const struct timekeeper *tk) { return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST; } #else static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) { return false; } static inline bool tk_is_aux(const struct timekeeper *tk) { return false; } #endif static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs) { tk->offs_aux = offs; tk->monotonic_to_aux = ktime_to_timespec64(offs); } /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; /** * struct tk_fast - NMI safe timekeeper * @seq: Sequence counter for protecting updates. The lowest bit * is the index for the tk_read_base array * @base: tk_read_base array. Access is indexed by the lowest bit of * @seq. * * See @update_fast_timekeeper() below. */ struct tk_fast { seqcount_latch_t seq; struct tk_read_base base[2]; }; /* Suspend-time cycles value for halted fast timekeeper. */ static u64 cycles_at_suspend; static u64 dummy_clock_read(struct clocksource *cs) { if (timekeeping_suspended) return cycles_at_suspend; return local_clock(); } static struct clocksource dummy_clock = { .read = dummy_clock_read, }; /* * Boot time initialization which allows local_clock() to be utilized * during early boot when clocksources are not available. local_clock() * returns nanoseconds already so no conversion is required, hence mult=1 * and shift=0. When the first proper clocksource is installed then * the fast time keepers are updated with the correct values. */ #define FAST_TK_INIT \ { \ .clock = &dummy_clock, \ .mask = CLOCKSOURCE_MASK(64), \ .mult = 1, \ .shift = 0, \ } static struct tk_fast tk_fast_mono ____cacheline_aligned = { .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq), .base[0] = FAST_TK_INIT, .base[1] = FAST_TK_INIT, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq), .base[0] = FAST_TK_INIT, .base[1] = FAST_TK_INIT, }; #ifdef CONFIG_POSIX_AUX_CLOCKS static __init void tk_aux_setup(void); static void tk_aux_update_clocksource(void); static void tk_aux_advance(void); #else static inline void tk_aux_setup(void) { } static inline void tk_aux_update_clocksource(void) { } static inline void tk_aux_advance(void) { } #endif unsigned long timekeeper_lock_irqsave(void) { unsigned long flags; raw_spin_lock_irqsave(&tk_core.lock, flags); return flags; } void timekeeper_unlock_irqrestore(unsigned long flags) { raw_spin_unlock_irqrestore(&tk_core.lock, flags); } /* * Multigrain timestamps require tracking the latest fine-grained timestamp * that has been issued, and never returning a coarse-grained timestamp that is * earlier than that value. * * mg_floor represents the latest fine-grained time that has been handed out as * a file timestamp on the system. This is tracked as a monotonic ktime_t, and * converted to a realtime clock value on an as-needed basis. * * Maintaining mg_floor ensures the multigrain interfaces never issue a * timestamp earlier than one that has been previously issued. * * The exception to this rule is when there is a backward realtime clock jump. If * such an event occurs, a timestamp can appear to be earlier than a previous one. */ static __cacheline_aligned_in_smp atomic64_t mg_floor; static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; tk->raw_sec++; } } static inline struct timespec64 tk_xtime(const struct timekeeper *tk) { struct timespec64 ts; ts.tv_sec = tk->xtime_sec; ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); return ts; } static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk) { struct timespec64 ts; ts.tv_sec = tk->xtime_sec; ts.tv_nsec = tk->coarse_nsec; return ts; } /* * Update the nanoseconds part for the coarse time keepers. They can't rely * on xtime_nsec because xtime_nsec could be adjusted by a small negative * amount when the multiplication factor of the clock is adjusted, which * could cause the coarse clocks to go slightly backwards. See * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse * clockids which only is updated when the clock has been set or we have * accumulated time. */ static inline void tk_update_coarse_nsecs(struct timekeeper *tk) { tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; } static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_update_coarse_nsecs(tk); } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); tk_update_coarse_nsecs(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) { struct timespec64 tmp; /* * Verify consistency of: offset_real = -wall_to_monotonic * before modifying anything */ set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, -tk->wall_to_monotonic.tv_nsec); WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); tk->wall_to_monotonic = wtm; set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); /* Paired with READ_ONCE() in ktime_mono_to_any() */ WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp)); WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0))); } static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { /* Paired with READ_ONCE() in ktime_mono_to_any() */ WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta)); /* * Timespec representation for VDSO update to avoid 64bit division * on every update. */ tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } /* * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the * seqcount ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if * the wrong clocksource is passed to the wrong read function. * This isn't necessary to use when holding the tk_core.lock or doing * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). */ static inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); return clock->read(clock); } /** * tk_setup_internals - Set up internals to use clocksource clock. * * @tk: The target timekeeper to setup. * @clock: Pointer to clocksource. * * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment * pair and interval request. * * Unless you're the timekeeping code, you should not be using this! */ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) { u64 interval; u64 tmp, ntpinterval; struct clocksource *old_clock; ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.mask = clock->mask; tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); tk->tkr_raw.clock = clock; tk->tkr_raw.mask = clock->mask; tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; ntpinterval = tmp; tmp += clock->mult/2; do_div(tmp, clock->mult); if (tmp == 0) tmp = 1; interval = (u64) tmp; tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; tk->raw_interval = interval * clock->mult; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) { tk->tkr_mono.xtime_nsec >>= -shift_change; tk->tkr_raw.xtime_nsec >>= -shift_change; } else { tk->tkr_mono.xtime_nsec <<= shift_change; tk->tkr_raw.xtime_nsec <<= shift_change; } } tk->tkr_mono.shift = clock->shift; tk->tkr_raw.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; tk->ntp_tick = ntpinterval << tk->ntp_error_shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ tk->tkr_mono.mult = clock->mult; tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; tk->skip_second_overflow = 0; } /* Timekeeper helper functions. */ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) { return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); } static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; /* * This detects both negative motion and the case where the delta * overflows the multiplication with tkr->mult. */ if (unlikely(delta > tkr->clock->max_cycles)) { /* * Handle clocksource inconsistency between CPUs to prevent * time from going backwards by checking for the MSB of the * mask being set in the delta. */ if (delta & ~(mask >> 1)) return tkr->xtime_nsec >> tkr->shift; return delta_to_ns_safe(tkr, delta); } return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr)); } /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update * @tkf: Pointer to NMI safe timekeeper * * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. * * Employ the latch technique; see @write_seqcount_latch. * * So if a NMI hits the update of base[0] then it will use base[1] * which is still consistent. In the worst case this can result is a * slightly wrong timestamp (a few nanoseconds). See * @ktime_get_mono_fast_ns. */ static void update_fast_timekeeper(const struct tk_read_base *tkr, struct tk_fast *tkf) { struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ write_seqcount_latch_begin(&tkf->seq); /* Update base[0] */ memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); write_seqcount_latch_end(&tkf->seq); } static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; unsigned int seq; u64 now; do { seq = read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); now += timekeeping_get_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } /** * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic * * This timestamp is not guaranteed to be monotonic across an update. * The timestamp is calculated by: * * now = base_mono + clock_delta * slope * * So if the update lowers the slope, readers who are forced to the * not yet updated second array are still using the old steeper slope. * * tmono * ^ * | o n * | o n * | u * | o * |o * |12345678---> reader order * * o = old slope * u = update * n = new slope * * So reader 6 will observe time going backwards versus reader 5. * * While other CPUs are likely to be able to observe that, the only way * for a CPU local observation is when an NMI hits in the middle of * the update. Timestamps taken from that NMI context might be ahead * of the following timestamps. Callers need to be aware of that and * deal with it. */ u64 notrace ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); /** * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw * * Contrary to ktime_get_mono_fast_ns() this is always correct because the * conversion factor is not affected by NTP/PTP correction. */ u64 notrace ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); /** * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset * protected with seqcounts. This has the following minor side effects: * * (1) Its possible that a timestamp be taken after the boot offset is updated * but before the timekeeper is updated. If this happens, the new boot offset * is added to the old timekeeping making the clock appear to update slightly * earlier: * CPU 0 CPU 1 * timekeeping_inject_sleeptime64() * __timekeeping_inject_sleeptime(tk, delta); * timestamp(); * timekeeping_update_staged(tkd, TK_CLEAR_NTP...); * * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. Since the tk->offs_boot update is a rare event, this * should be a rare occurrence which postprocessing should be able to handle. * * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns() * apply as well. */ u64 notrace ktime_get_boot_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot))); } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); /** * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock. * * The same limitations as described for ktime_get_boot_fast_ns() apply. The * mono time and the TAI offset are not read atomically which may yield wrong * readouts. However, an update of the TAI offset is an rare event e.g., caused * by settime or adjtimex with an offset. The user of this function has to deal * with the possibility of wrong timestamps in post processing. */ u64 notrace ktime_get_tai_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai))); } EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); /** * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. * * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. */ u64 ktime_get_real_fast_ns(void) { struct tk_fast *tkf = &tk_fast_mono; struct tk_read_base *tkr; u64 baser, delta; unsigned int seq; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); baser = ktime_to_ns(tkr->base_real); delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); return baser + delta; } EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * * It generally is unsafe to access the clocksource after timekeeping has been * suspended, so take a snapshot of the readout base of @tk and use it as the * fast timekeeper's readout base while suspended. It will return the same * number of cycles every time until timekeeping is resumed at which time the * proper readout base for the fast timekeeper will be restored automatically. */ static void halt_fast_timekeeper(const struct timekeeper *tk) { static struct tk_read_base tkr_dummy; const struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tk_clock_read(tkr); tkr_dummy.clock = &dummy_clock; tkr_dummy.base_real = tkr->base + tk->offs_real; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) { raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); } /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener * @nb: Pointer to the notifier block to register */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { struct timekeeper *tk = &tk_core.timekeeper; int ret; guard(raw_spinlock_irqsave)(&tk_core.lock); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); update_pvclock_gtod(tk, true); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener * @nb: Pointer to the notifier block to unregister */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { guard(raw_spinlock_irqsave)(&tk_core.lock); return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* * tk_update_leap_state - helper to update the next_leap_ktime */ static inline void tk_update_leap_state(struct timekeeper *tk) { tk->next_leap_ktime = ntp_get_next_leap(tk->id); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); } /* * Leap state update for both shadow and the real timekeeper * Separate to spare a full memcpy() of the timekeeper. */ static void tk_update_leap_state_all(struct tk_data *tkd) { write_seqcount_begin(&tkd->seq); tk_update_leap_state(&tkd->shadow_timekeeper); tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime; write_seqcount_end(&tkd->seq); } /* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) { u64 seconds; u32 nsec; /* * The xtime based monotonic readout is: * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); * The ktime based monotonic readout is: * nsec = base_mono + now(); * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec */ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater/equal one second. Take * this into account before updating tk->ktime_sec. */ nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; /* Update the monotonic raw base */ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } /* * Restore the shadow timekeeper from the real timekeeper. */ static void timekeeping_restore_shadow(struct tk_data *tkd) { lockdep_assert_held(&tkd->lock); memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper)); } static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { struct timekeeper *tk = &tkd->shadow_timekeeper; lockdep_assert_held(&tkd->lock); /* * Block out readers before running the updates below because that * updates VDSO and other time related infrastructure. Not blocking * the readers might let a reader see time going backwards when * reading from the VDSO after the VDSO update and then reading in * the kernel from the timekeeper before that got updated. */ write_seqcount_begin(&tkd->seq); if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(tk->id); } tk_update_leap_state(tk); tk_update_ktime_data(tk); tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; if (tk->id == TIMEKEEPER_CORE) { update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); } else if (tk_is_aux(tk)) { vdso_time_update_aux(tk); } if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; /* * Update the real timekeeper. * * We could avoid this memcpy() by switching pointers, but that has * the downside that the reader side does not longer benefit from * the cacheline optimized data layout of the timekeeper and requires * another indirection. */ memcpy(&tkd->timekeeper, tk, sizeof(*tk)); write_seqcount_end(&tkd->seq); } /** * timekeeping_forward_now - update clock to the current time * @tk: Pointer to the timekeeper to update * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ static void timekeeping_forward_now(struct timekeeper *tk) { u64 cycle_now, delta; cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; while (delta > 0) { u64 max = tk->tkr_mono.clock->max_cycles; u64 incr = delta < max ? delta : max; tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult; tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult; tk_normalize_xtime(tk); delta -= incr; } tk_update_coarse_nsecs(tk); } /** * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Returns the time of day in a timespec64 (WARN if suspended). */ void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_real_ts64); ktime_t ktime_get(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get); u32 ktime_get_resolution_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u32 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); return nsecs; } EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; ktime_t ktime_get_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t base, *offset = offsets[offs]; unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = tk->coarse_nsec; } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); /** * ktime_mono_to_any() - convert monotonic time to any other time * @tmono: time to convert. * @offs: which offset to use */ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { ktime_t *offset = offsets[offs]; unsigned int seq; ktime_t tconv; if (IS_ENABLED(CONFIG_64BIT)) { /* * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and * tk_update_sleep_time(). */ return ktime_add(tmono, READ_ONCE(*offset)); } do { seq = read_seqcount_begin(&tk_core.seq); tconv = ktime_add(tmono, *offset); } while (read_seqcount_retry(&tk_core.seq, seq)); return tconv; } EXPORT_SYMBOL_GPL(ktime_mono_to_any); /** * ktime_get_raw - Returns the raw monotonic time in ktime_t format */ ktime_t ktime_get_raw(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_raw.base; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_raw); /** * ktime_get_ts64 - get the monotonic clock in timespec64 format * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result * in normalized timespec64 format in the variable pointed to by @ts. */ void ktime_get_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono; unsigned int seq; u64 nsec; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(&tk->tkr_mono); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; timespec64_add_ns(ts, nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts64); /** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * * Returns the seconds portion of CLOCK_MONOTONIC with a single non * serialized read. tk->ktime_sec is of type 'unsigned long' so this * works on both 32 and 64 bit systems. On 32 bit systems the readout * covers ~136 years of uptime which should be enough to prevent * premature wrap arounds. */ time64_t ktime_get_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; WARN_ON(timekeeping_suspended); return tk->ktime_sec; } EXPORT_SYMBOL_GPL(ktime_get_seconds); /** * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME * * Returns the wall clock seconds since 1970. * * For 64bit systems the fast access to tk->xtime_sec is preserved. On * 32bit systems the access must be protected with the sequence * counter to provide "atomic" access to the 64bit tk->xtime_sec * value. */ time64_t ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; time64_t seconds; unsigned int seq; if (IS_ENABLED(CONFIG_64BIT)) return tk->xtime_sec; do { seq = read_seqcount_begin(&tk_core.seq); seconds = tk->xtime_sec; } while (read_seqcount_retry(&tk_core.seq, seq)); return seconds; } EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds * * The same as ktime_get_real_seconds() but without the sequence counter * protection. This function is used in restricted contexts like the x86 MCE * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half * completed modification and only to be used for such critical contexts. * * Returns: Racy snapshot of the CLOCK_REALTIME seconds value */ noinstr time64_t __ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; return tk->xtime_sec; } /** * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter * @systime_snapshot: pointer to struct receiving the system time snapshot */ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base_raw; ktime_t base_real; ktime_t base_boot; u64 nsec_raw; u64 nsec_real; u64 now; WARN_ON_ONCE(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); now = tk_clock_read(&tk->tkr_mono); systime_snapshot->cs_id = tk->tkr_mono.clock->id; systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_boot = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_boot); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); } while (read_seqcount_retry(&tk_core.seq, seq)); systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } EXPORT_SYMBOL_GPL(ktime_get_snapshot); /* Scale base by mult/div checking for overflow */ static int scale64_check_overflow(u64 mult, u64 div, u64 *base) { u64 tmp, rem; tmp = div64_u64_rem(*base, div, &rem); if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) return -EOVERFLOW; tmp *= mult; rem = div64_u64(rem * mult, div); *base = tmp + rem; return 0; } /** * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval * @history: Snapshot representing start of history * @partial_history_cycles: Cycle offset into history (fractional part) * @total_history_cycles: Total history length in cycles * @discontinuity: True indicates clock was set on history period * @ts: Cross timestamp that should be adjusted using * partial/total ratio * * Helper function used by get_device_system_crosststamp() to correct the * crosstimestamp corresponding to the start of the current interval to the * system counter value (timestamp point) provided by the driver. The * total_history_* quantities are the total history starting at the provided * reference point and ending at the start of the current interval. The cycle * count between the driver timestamp point and the start of the current * interval is partial_history_cycles. */ static int adjust_historical_crosststamp(struct system_time_snapshot *history, u64 partial_history_cycles, u64 total_history_cycles, bool discontinuity, struct system_device_crosststamp *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 corr_raw, corr_real; bool interp_forward; int ret; if (total_history_cycles == 0 || partial_history_cycles == 0) return 0; /* Interpolate shortest distance from beginning or end of history */ interp_forward = partial_history_cycles > total_history_cycles / 2; partial_history_cycles = interp_forward ? total_history_cycles - partial_history_cycles : partial_history_cycles; /* * Scale the monotonic raw time delta by: * partial_history_cycles / total_history_cycles */ corr_raw = (u64)ktime_to_ns( ktime_sub(ts->sys_monoraw, history->raw)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_raw); if (ret) return ret; /* * If there is a discontinuity in the history, scale monotonic raw * correction by: * mult(real)/mult(raw) yielding the realtime correction * Otherwise, calculate the realtime correction similar to monotonic * raw calculation */ if (discontinuity) { corr_real = mul_u64_u32_div (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); } else { corr_real = (u64)ktime_to_ns( ktime_sub(ts->sys_realtime, history->real)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_real); if (ret) return ret; } /* Fixup monotonic raw and real time time values */ if (interp_forward) { ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); ts->sys_realtime = ktime_add_ns(history->real, corr_real); } else { ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); } return 0; } /* * timestamp_in_interval - true if ts is chronologically in [start, end] * * True if ts occurs chronologically at or after start, and before or at end. */ static bool timestamp_in_interval(u64 start, u64 end, u64 ts) { if (ts >= start && ts <= end) return true; if (start > end && (ts >= start || ts <= end)) return true; return false; } static bool convert_clock(u64 *val, u32 numerator, u32 denominator) { u64 rem, res; if (!numerator || !denominator) return false; res = div64_u64_rem(*val, denominator, &rem) * numerator; *val = res + div_u64(rem * numerator, denominator); return true; } static bool convert_base_to_cs(struct system_counterval_t *scv) { struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; struct clocksource_base *base; u32 num, den; /* The timestamp was taken from the time keeper clock source */ if (cs->id == scv->cs_id) return true; /* * Check whether cs_id matches the base clock. Prevent the compiler from * re-evaluating @base as the clocksource might change concurrently. */ base = READ_ONCE(cs->base); if (!base || base->id != scv->cs_id) return false; num = scv->use_nsecs ? cs->freq_khz : base->numerator; den = scv->use_nsecs ? USEC_PER_SEC : base->denominator; if (!convert_clock(&scv->cycles, num, den)) return false; scv->cycles += base->offset; return true; } static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id) { struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; struct clocksource_base *base; /* * Check whether base_id matches the base clock. Prevent the compiler from * re-evaluating @base as the clocksource might change concurrently. */ base = READ_ONCE(cs->base); if (!base || base->id != base_id) return false; *cycles -= base->offset; if (!convert_clock(cycles, base->denominator, base->numerator)) return false; return true; } static bool convert_ns_to_cs(u64 *delta) { struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta)) return false; *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult); return true; } /** * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp * @treal: CLOCK_REALTIME timestamp to convert * @base_id: base clocksource id * @cycles: pointer to store the converted base clock timestamp * * Converts a supplied, future realtime clock value to the corresponding base clock value. * * Return: true if the conversion is successful, false otherwise. */ bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 delta; do { seq = read_seqcount_begin(&tk_core.seq); if ((u64)treal < tk->tkr_mono.base_real) return false; delta = (u64)treal - tk->tkr_mono.base_real; if (!convert_ns_to_cs(&delta)) return false; *cycles = tk->tkr_mono.cycle_last + delta; if (!convert_cs_to_base(cycles, base_id)) return false; } while (read_seqcount_retry(&tk_core.seq, seq)); return true; } EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and * system counter from the device driver * @ctx: Context passed to get_time_fn() * @history_begin: Historical reference point used to interpolate system * time when counter provided by the driver is before the current interval * @xtstamp: Receives simultaneously captured system and device time * * Reads a timestamp from a device and correlates it to system time */ int get_device_system_crosststamp(int (*get_time_fn) (ktime_t *device_time, struct system_counterval_t *sys_counterval, void *ctx), void *ctx, struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { struct system_counterval_t system_counterval = {}; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; unsigned int seq; bool do_interp; int ret; do { seq = read_seqcount_begin(&tk_core.seq); /* * Try to synchronously capture device time and a system * counter value calling back into the device driver */ ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); if (ret) return ret; /* * Verify that the clocksource ID associated with the captured * system counter value is the same as for the currently * installed timekeeper clocksource */ if (system_counterval.cs_id == CSID_GENERIC || !convert_base_to_cs(&system_counterval)) return -ENODEV; cycles = system_counterval.cycles; /* * Check whether the system counter value provided by the * device driver is on the current timekeeping interval. */ now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; if (!timestamp_in_interval(interval_start, now, cycles)) { clock_was_set_seq = tk->clock_was_set_seq; cs_was_changed_seq = tk->cs_was_changed_seq; cycles = interval_start; do_interp = true; } else { do_interp = false; } base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles); } while (read_seqcount_retry(&tk_core.seq, seq)); xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); /* * Interpolate if necessary, adjusting back from the start of the * current interval */ if (do_interp) { u64 partial_history_cycles, total_history_cycles; bool discontinuity; /* * Check that the counter value is not before the provided * history reference and that the history doesn't cross a * clocksource change */ if (!history_begin || !timestamp_in_interval(history_begin->cycles, cycles, system_counterval.cycles) || history_begin->cs_was_changed_seq != cs_was_changed_seq) return -EINVAL; partial_history_cycles = cycles - system_counterval.cycles; total_history_cycles = cycles - history_begin->cycles; discontinuity = history_begin->clock_was_set_seq != clock_was_set_seq; ret = adjust_historical_crosststamp(history_begin, partial_history_cycles, total_history_cycles, discontinuity, xtstamp); if (ret) return ret; } return 0; } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** * timekeeping_clocksource_has_base - Check whether the current clocksource * is based on given a base clock * @id: base clocksource ID * * Note: The return value is a snapshot which can become invalid right * after the function returns. * * Return: true if the timekeeper clocksource has a base clock with @id, * false otherwise */ bool timekeeping_clocksource_has_base(enum clocksource_ids id) { /* * This is a snapshot, so no point in using the sequence * count. Just prevent the compiler from re-evaluating @base as the * clocksource might change concurrently. */ struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base); return base ? base->id == id : false; } EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); /** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * * Sets the time of day to the new time and update NTP and notify hrtimers */ int do_settimeofday64(const struct timespec64 *ts) { struct timespec64 ts_delta, xt; if (!timespec64_valid_settod(ts)) return -EINVAL; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; timekeeping_forward_now(tks); xt = tk_xtime(tks); ts_delta = timespec64_sub(*ts, xt); if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) { timekeeping_restore_shadow(&tk_core); return -EINVAL; } tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta)); tk_set_xtime(tks, ts); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); audit_tk_injoffset(ts_delta); add_device_randomness(ts, sizeof(*ts)); return 0; } EXPORT_SYMBOL(do_settimeofday64); static inline bool timekeeper_is_core_tk(struct timekeeper *tk) { return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE; } /** * __timekeeping_inject_offset - Adds or subtracts from the current time. * @tkd: Pointer to the timekeeper to modify * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts) { struct timekeeper *tks = &tkd->shadow_timekeeper; struct timespec64 tmp; if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; timekeeping_forward_now(tks); if (timekeeper_is_core_tk(tks)) { /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tks), *ts); if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { timekeeping_restore_shadow(tkd); return -EINVAL; } tk_xtime_add(tks, ts); tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); } else { struct tk_read_base *tkr_mono = &tks->tkr_mono; ktime_t now, offs; /* Get the current time */ now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono)); /* Add the relative offset change */ offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts)); /* Prevent that the resulting time becomes negative */ if (ktime_add(now, offs) < 0) { timekeeping_restore_shadow(tkd); return -EINVAL; } tk_update_aux_offs(tks, offs); } timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); return 0; } static int timekeeping_inject_offset(const struct timespec64 *ts) { int ret; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) ret = __timekeeping_inject_offset(&tk_core, ts); /* Signal hrtimers about time change */ if (!ret) clock_was_set(CLOCK_SET_WALL); return ret; } /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. */ int persistent_clock_is_local; /* * Adjust the time obtained from the CMOS to be UTC time instead of * local time. * * This is ugly, but preferable to the alternatives. Otherwise we * would either need to write a program to do it in /etc/rc (and risk * confusion if the program gets run more than once; it would also be * hard to make the program warp the clock precisely n hours) or * compile in the timezone information into the kernel. Bad, bad.... * * - TYT, 1992-01-01 * * The best thing to do is to keep the CMOS clock in universal time (UTC) * as real UNIX machines always do it. This avoids all headaches about * daylight saving times and warping kernel clocks. */ void timekeeping_warp_clock(void) { if (sys_tz.tz_minuteswest != 0) { struct timespec64 adjust; persistent_clock_is_local = 1; adjust.tv_sec = sys_tz.tz_minuteswest * 60; adjust.tv_nsec = 0; timekeeping_inject_offset(&adjust); } } /* * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } /* * change_clocksource - Swaps clocksources if a new one is available * * Accumulates current time interval and initializes new clocksource */ static int change_clocksource(void *data) { struct clocksource *new = data, *old = NULL; /* * If the clocksource is in a module, get a module reference. * Succeeds for built-in code (owner == NULL) as well. Abort if the * reference can't be acquired. */ if (!try_module_get(new->owner)) return 0; /* Abort if the device can't be enabled */ if (new->enable && new->enable(new) != 0) { module_put(new->owner); return 0; } scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; timekeeping_forward_now(tks); old = tks->tkr_mono.clock; tk_setup_internals(tks, new); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } tk_aux_update_clocksource(); if (old) { if (old->disable) old->disable(old); module_put(old->owner); } return 0; } /** * timekeeping_notify - Install a new clock source * @clock: pointer to the clock source * * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &tk_core.timekeeper; if (tk->tkr_mono.clock == clock) return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); return tk->tkr_mono.clock == clock ? 0 : -1; } /** * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely un-modified by ntp) */ void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->raw_sec; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_raw_ts64); /** * ktime_get_clock_ts64 - Returns time of a clock in a timespec * @id: POSIX clock ID of the clock to read * @ts: Pointer to the timespec64 to be set * * The timestamp is invalidated (@ts->sec is set to -1) if the * clock @id is not available. */ void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts) { /* Invalidate time stamp */ ts->tv_sec = -1; ts->tv_nsec = 0; switch (id) { case CLOCK_REALTIME: ktime_get_real_ts64(ts); return; case CLOCK_MONOTONIC: ktime_get_ts64(ts); return; case CLOCK_MONOTONIC_RAW: ktime_get_raw_ts64(ts); return; case CLOCK_AUX ... CLOCK_AUX_LAST: if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) ktime_get_aux_ts64(id, ts); return; default: WARN_ON_ONCE(1); } } EXPORT_SYMBOL_GPL(ktime_get_clock_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; int ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * timekeeping_max_deferment - Returns max time the clocksource can be deferred */ u64 timekeeping_max_deferment(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->max_idle_ns; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * read_persistent_clock64 - Return time from the persistent clock. * @ts: Pointer to the storage for the readout value * * Weak dummy function for arches that do not yet support it. * Reads the time from the battery backed persistent clock. * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ void __weak read_persistent_clock64(struct timespec64 *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; } /** * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset * from the boot. * @wall_time: current time as returned by persistent clock * @boot_offset: offset that is defined as wall_time - boot_time * * Weak dummy function for arches that do not yet support it. * * The default function calculates offset based on the current value of * local_clock(). This way architectures that support sched_clock() but don't * support dedicated boot time clock will provide the best estimate of the * boot time. */ void __weak __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, struct timespec64 *boot_offset) { read_persistent_clock64(wall_time); *boot_offset = ns_to_timespec64(local_clock()); } static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid; } /* * Flag reflecting whether timekeeping_resume() has injected sleeptime. * * The flag starts of false and is only set when a suspend reaches * timekeeping_suspend(), timekeeping_resume() sets it to false when the * timekeeper clocksource is not stopping across suspend and has been * used to update sleep time. If the timekeeper clocksource has stopped * then the flag stays true and is used by the RTC resume code to decide * whether sleeptime must be injected and if so the flag gets false then. * * If a suspend fails before reaching timekeeping_resume() then the flag * stays false and prevents erroneous sleeptime injection. */ static bool suspend_timing_needed; /* Flag for if there is a persistent clock on this platform */ static bool persistent_clock_exists; /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ void __init timekeeping_init(void) { struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); tk_aux_setup(); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; } else if (timespec64_to_ns(&wall_time) != 0) { pr_warn("Persistent clock returned invalid value"); wall_time = (struct timespec64){0}; } if (timespec64_compare(&wall_time, &boot_offset) < 0) boot_offset = (struct timespec64){0}; /* * We want set wall_to_mono, so the following is true: * wall time + wall_to_mono = boot time */ wall_to_mono = timespec64_sub(boot_offset, wall_time); guard(raw_spinlock_irqsave)(&tk_core.lock); ntp_init(); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); tk_setup_internals(tks, clock); tk_set_xtime(tks, &wall_time); tks->raw_sec = 0; tk_set_wall_to_mono(tks, wall_to_mono); timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); } /* time in seconds when suspend began for persistent clock */ static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @tk: Pointer to the timekeeper to be updated * @delta: Pointer to the delta value in timespec64 format * * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. */ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, const struct timespec64 *delta) { if (!timespec64_valid_strict(delta)) { printk_deferred(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) /* * We have three kinds of time sources to use for sleep time * injection, the preference order is: * 1) non-stop clocksource * 2) persistent clock (ie: RTC accessible when irqs are off) * 3) RTC * * 1) and 2) are used by timekeeping, 3) by RTC subsystem. * If system has neither 1) nor 2), 3) will be used finally. * * * If timekeeping has injected sleeptime via either 1) or 2), * 3) becomes needless, so in this case we don't need to call * rtc_resume(), and this is what timekeeping_rtc_skipresume() * means. */ bool timekeeping_rtc_skipresume(void) { return !suspend_timing_needed; } /* * 1) can be determined whether to use or not only when doing * timekeeping_resume() which is invoked after rtc_suspend(), * so we can't skip rtc_suspend() surely if system has 1). * * But if system has 2), 2) will definitely be used, so in this * case we don't need to call rtc_suspend(), and this is what * timekeeping_rtc_skipsuspend() means. */ bool timekeeping_rtc_skipsuspend(void) { return persistent_clock_exists; } /** * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec64 delta value * * This hook is for architectures that cannot support read_persistent_clock64 * because their RTC/persistent clock is only accessible when irqs are enabled. * and also don't have an effective nonstop clocksource. * * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. */ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) { scoped_guard(raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; suspend_timing_needed = false; timekeeping_forward_now(tks); __timekeeping_inject_sleeptime(tks, delta); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); } #endif /** * timekeeping_resume - Resumes the generic timekeeping subsystem. */ void timekeeping_resume(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock = tks->tkr_mono.clock; struct timespec64 ts_new, ts_delta; bool inject_sleeptime = false; u64 cycle_now, nsec; unsigned long flags; read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); raw_spin_lock_irqsave(&tk_core.lock, flags); /* * After system resumes, we need to calculate the suspended time and * compensate it for the OS time. There are 3 sources that could be * used: Nonstop clocksource during suspend, persistent clock and rtc * device. * * One specific platform may have 1 or 2 or all of them, and the * preference will be: * suspend-nonstop clocksource -> persistent clock -> rtc * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ cycle_now = tk_clock_read(&tks->tkr_mono); nsec = clocksource_stop_suspend_timing(clock, cycle_now); if (nsec > 0) { ts_delta = ns_to_timespec64(nsec); inject_sleeptime = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); inject_sleeptime = true; } if (inject_sleeptime) { suspend_timing_needed = false; __timekeeping_inject_sleeptime(tks, &ts_delta); } /* Re-base the last cycle value */ tks->tkr_mono.cycle_last = cycle_now; tks->tkr_raw.cycle_last = cycle_now; tks->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); raw_spin_unlock_irqrestore(&tk_core.lock, flags); touch_softlockup_watchdog(); /* Resume the clockevent device(s) and hrtimers */ tick_resume(); /* Notify timerfd as resume is equivalent to clock_was_set() */ timerfd_resume(); } int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct timespec64 delta, delta_delta; static struct timespec64 old_delta; struct clocksource *curr_clock; unsigned long flags; u64 cycle_now; read_persistent_clock64(&timekeeping_suspend_time); /* * On some systems the persistent_clock can not be detected at * timekeeping_init by its return value, so if we see a valid * value returned, update the persistent_clock_exists flag. */ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) persistent_clock_exists = true; suspend_timing_needed = true; raw_spin_lock_irqsave(&tk_core.lock, flags); timekeeping_forward_now(tks); timekeeping_suspended = 1; /* * Since we've called forward_now, cycle_last stores the value * just read from the current clocksource. Save this to potentially * use in suspend timing. */ curr_clock = tks->tkr_mono.clock; cycle_now = tks->tkr_mono.cycle_last; clocksource_start_suspend_timing(curr_clock, cycle_now); if (persistent_clock_exists) { /* * To avoid drift caused by repeated suspend/resumes, * which each can add ~1 second drift error, * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time); delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* * if delta_delta is too large, assume time correction * has occurred and set old_delta to the current delta. */ old_delta = delta; } else { /* Otherwise try to adjust old_system to compensate */ timekeeping_suspend_time = timespec64_add(timekeeping_suspend_time, delta_delta); } } timekeeping_update_from_shadow(&tk_core, 0); halt_fast_timekeeper(tks); raw_spin_unlock_irqrestore(&tk_core.lock, flags); tick_suspend(); clocksource_suspend(); clockevents_suspend(); return 0; } /* sysfs resume/suspend bits for timekeeping */ static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; static int __init timekeeping_init_ops(void) { register_syscore_ops(&timekeeping_syscore_ops); return 0; } device_initcall(timekeeping_init_ops); /* * Apply a multiplier adjustment to the timekeeper */ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, s64 offset, s32 mult_adj) { s64 interval = tk->cycle_interval; if (mult_adj == 0) { return; } else if (mult_adj == -1) { interval = -interval; offset = -offset; } else if (mult_adj != 1) { interval *= mult_adj; offset *= mult_adj; } /* * So the following can be confusing. * * To keep things simple, lets assume mult_adj == 1 for now. * * When mult_adj != 1, remember that the interval and offset values * have been appropriately scaled so the math is the same. * * The basic idea here is that we're increasing the multiplier * by one, this causes the xtime_interval to be incremented by * one cycle_interval. This is because: * xtime_interval = cycle_interval * mult * So if mult is being incremented by one: * xtime_interval = cycle_interval * (mult + 1) * Its the same as: * xtime_interval = (cycle_interval * mult) + cycle_interval * Which can be shortened to: * xtime_interval += cycle_interval * * So offset stores the non-accumulated cycles. Thus the current * time (in shifted nanoseconds) is: * now = (offset * adj) + xtime_nsec * Now, even though we're adjusting the clock frequency, we have * to keep time consistent. In other words, we can't jump back * in time, and we also want to avoid jumping forward in time. * * So given the same offset value, we need the time to be the same * both before and after the freq adjustment. * now = (offset * adj_1) + xtime_nsec_1 * now = (offset * adj_2) + xtime_nsec_2 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_2) + xtime_nsec_2 * And we know: * adj_2 = adj_1 + 1 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * (adj_1+1)) + xtime_nsec_2 * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_1) + offset + xtime_nsec_2 * Canceling the sides: * xtime_nsec_1 = offset + xtime_nsec_2 * Which gives us: * xtime_nsec_2 = xtime_nsec_1 - offset * Which simplifies to: * xtime_nsec -= offset */ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ WARN_ON_ONCE(1); return; } tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; tk->tkr_mono.xtime_nsec -= offset; } /* * Adjust the timekeeper's multiplier to the correct frequency * and also to reduce the accumulated error value. */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { u64 ntp_tl = ntp_tick_length(tk->id); u32 mult; /* * Determine the multiplier from the current NTP tick length. * Avoid expensive division when the tick length doesn't change. */ if (likely(tk->ntp_tick == ntp_tl)) { mult = tk->tkr_mono.mult - tk->ntp_err_mult; } else { tk->ntp_tick = ntp_tl; mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - tk->xtime_remainder, tk->cycle_interval); } /* * If the clock is behind the NTP time, increase the multiplier by 1 * to catch up with it. If it's ahead and there was a remainder in the * tick division, the clock will slow down. Otherwise it will stay * ahead until the tick length changes to a non-divisible value. */ tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0; mult += tk->ntp_err_mult; timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult); if (unlikely(tk->tkr_mono.clock->maxadj && (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) > tk->tkr_mono.clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); } /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource * in the code above, its possible the required corrective factor to * xtime_nsec could cause it to underflow. * * Now, since we have already accumulated the second and the NTP * subsystem has been notified via second_overflow(), we need to skip * the next update. */ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec--; tk->skip_second_overflow = 1; } } /* * accumulate_nsecs_to_secs - Accumulates nsecs into secs * * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; unsigned int clock_set = 0; while (tk->tkr_mono.xtime_nsec >= nsecps) { int leap; tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; /* * Skip NTP update if this second was accumulated before, * i.e. xtime_nsec underflowed in timekeeping_adjust() */ if (unlikely(tk->skip_second_overflow)) { tk->skip_second_overflow = 0; continue; } /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->id, tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; tk->xtime_sec += leap; ts.tv_sec = leap; ts.tv_nsec = 0; tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts)); __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); clock_set = TK_CLOCK_WAS_SET; } } return clock_set; } /* * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into * a shifted interval nanoseconds. Allows for O(log) accumulation * loop. * * Returns the unconsumed cycles. */ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, u32 shift, unsigned int *clock_set) { u64 interval = tk->cycle_interval << shift; u64 snsec_per_sec; /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) return offset; /* Accumulate one shifted interval */ offset -= interval; tk->tkr_mono.cycle_last += interval; tk->tkr_raw.cycle_last += interval; tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { tk->tkr_raw.xtime_nsec -= snsec_per_sec; tk->raw_sec++; } /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << (tk->ntp_error_shift + shift); return offset; } /* * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode) { struct timekeeper *tk = &tkd->shadow_timekeeper; struct timekeeper *real_tk = &tkd->timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; u64 offset, orig_offset; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); orig_offset = offset; /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, * we calculate the largest doubling multiple of cycle_intervals * that is smaller than the offset. We then accumulate that * chunk in one go, and then try to consume the next smaller * doubled multiple. */ shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); if (offset < tk->cycle_interval<<shift) shift--; } /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); /* * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ clock_set |= accumulate_nsecs_to_secs(tk); /* * To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls * making small negative adjustments to the base xtime_nsec * value, only update the coarse clocks if we accumulated time */ if (orig_offset != offset) tk_update_coarse_nsecs(tk); timekeeping_update_from_shadow(tkd, clock_set); return !!clock_set; } static bool timekeeping_advance(enum timekeeping_adv_mode mode) { guard(raw_spinlock_irqsave)(&tk_core.lock); return __timekeeping_advance(&tk_core, mode); } /** * update_wall_time - Uses the current clocksource to increment the wall time * * It also updates the enabled auxiliary clock timekeepers */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); tk_aux_advance(); } /** * getboottime64 - Return the real time of system boot. * @ts: pointer to the timespec64 to be set * * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). */ void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); *ts = ktime_to_timespec64(t); } EXPORT_SYMBOL_GPL(getboottime64); void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime_coarse(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); /** * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor * @ts: timespec64 to be filled * * Fetch the global mg_floor value, convert it to realtime and compare it * to the current coarse-grained time. Fill @ts with whichever is * latest. Note that this is a filesystem-specific interface and should be * avoided outside of that context. */ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 floor = atomic64_read(&mg_floor); ktime_t f_real, offset, coarse; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime_coarse(tk); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); coarse = timespec64_to_ktime(*ts); f_real = ktime_add(floor, offset); if (ktime_after(f_real, coarse)) *ts = ktime_to_timespec64(f_real); } /** * ktime_get_real_ts64_mg - attempt to update floor value and return result * @ts: pointer to the timespec to be set * * Get a monotonic fine-grained time value and attempt to swap it into * mg_floor. If that succeeds then accept the new floor value. If it fails * then another task raced in during the interim time and updated the * floor. Since any update to the floor must be later than the previous * floor, either outcome is acceptable. * * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(), * and determining that the resulting coarse-grained timestamp did not effect * a change in ctime. Any more recent floor value would effect a change to * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure. * * @ts will be filled with the latest floor value, regardless of the outcome of * the cmpxchg. Note that this is a filesystem specific interface and should be * avoided outside of that context. */ void ktime_get_real_ts64_mg(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t old = atomic64_read(&mg_floor); ktime_t offset, mono; unsigned int seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; mono = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); mono = ktime_add_ns(mono, nsecs); /* * Attempt to update the floor with the new time value. As any * update must be later then the existing floor, and would effect * a change to ctime from the perspective of the current task, * accept the resulting floor value regardless of the outcome of * the swap. */ if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); timekeeping_inc_mg_floor_swaps(); } else { /* * Another task changed mg_floor since "old" was fetched. * "old" has been updated with the latest value of "mg_floor". * That value is newer than the previous floor value, which * is enough to effect a change to ctime. Accept it. */ *ts = ktime_to_timespec64(ktime_add(old, offset)); } } void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime_coarse(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); /* * Must hold jiffies_lock */ void do_timer(unsigned long ticks) { jiffies_64 += ticks; calc_global_load(); } /** * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the * sequence number in @cwsseq and timekeeper.clock_was_set_seq are * different. * * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); base = ktime_add_ns(base, nsecs); if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } /* Handle leapsecond insertion adjustments */ if (unlikely(base >= tk->next_leap_ktime)) *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); } while (read_seqcount_retry(&tk_core.seq, seq)); return base; } /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; if (!(txc->modes & ADJ_OFFSET_READONLY) && !capable(CAP_SYS_TIME)) return -EPERM; } else { /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; /* * if the quartz is off by more than 10% then * something is VERY wrong! */ if (txc->modes & ADJ_TICK && (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; } if (txc->modes & ADJ_SETOFFSET) { /* In order to inject time, you gotta be super-user! */ if (!capable(CAP_SYS_TIME)) return -EPERM; /* * Validate if a timespec/timeval used to inject a time * offset is valid. Offsets can be positive or negative, so * we don't check tv_sec. The value of the timeval/timespec * is the sum of its fields,but *NOTE*: * The field tv_usec/tv_nsec must always be non-negative and * we can't have more nanoseconds/microseconds than a second. */ if (txc->time.tv_usec < 0) return -EINVAL; if (txc->modes & ADJ_NANO) { if (txc->time.tv_usec >= NSEC_PER_SEC) return -EINVAL; } else { if (txc->time.tv_usec >= USEC_PER_SEC) return -EINVAL; } } /* * Check for potential multiplication overflows that can * only happen on 64-bit systems: */ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { if (LLONG_MIN / PPM_SCALE > txc->freq) return -EINVAL; if (LLONG_MAX / PPM_SCALE < txc->freq) return -EINVAL; } if (aux_clock) { /* Auxiliary clocks are similar to TAI and do not have leap seconds */ if (txc->status & (STA_INS | STA_DEL)) return -EINVAL; /* No TAI offset setting */ if (txc->modes & ADJ_TAI) return -EINVAL; /* No PPS support either */ if (txc->status & (STA_PPSFREQ | STA_PPSTIME)) return -EINVAL; } return 0; } /** * random_get_entropy_fallback - Returns the raw clock source value, * used by random.c for platforms with no valid random_get_entropy(). */ unsigned long random_get_entropy_fallback(void) { struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; struct clocksource *clock = READ_ONCE(tkr->clock); if (unlikely(timekeeping_suspended || !clock)) return 0; return clock->read(clock); } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); struct adjtimex_result { struct audit_ntp_data ad; struct timespec64 delta; bool clock_set; }; static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, struct adjtimex_result *result) { struct timekeeper *tks = &tkd->shadow_timekeeper; bool aux_clock = !timekeeper_is_core_tk(tks); struct timespec64 ts; s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ ret = timekeeping_validate_timex(txc, aux_clock); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); if (!aux_clock) ktime_get_real_ts64(&ts); else tk_get_aux_ts64(tkd->timekeeper.id, &ts); add_device_randomness(&ts, sizeof(ts)); guard(raw_spinlock_irqsave)(&tkd->lock); if (!tks->clock_valid) return -ENODEV; if (txc->modes & ADJ_SETOFFSET) { result->delta.tv_sec = txc->time.tv_sec; result->delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) result->delta.tv_nsec *= 1000; ret = __timekeeping_inject_offset(tkd, &result->delta); if (ret) return ret; result->clock_set = true; } orig_tai = tai = tks->tai_offset; ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tks, tai); timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); result->clock_set = true; } else { tk_update_leap_state_all(&tk_core); } /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); return ret; } /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function * @txc: Pointer to kernel_timex structure containing NTP parameters */ int do_adjtimex(struct __kernel_timex *txc) { struct adjtimex_result result = { }; int ret; ret = __do_adjtimex(&tk_core, txc, &result); if (ret < 0) return ret; if (txc->modes & ADJ_SETOFFSET) audit_tk_injoffset(result.delta); audit_ntp_log(&result.ad); if (result.clock_set) clock_was_set(CLOCK_SET_WALL); ntp_notify_cmos_timer(result.delta.tv_sec != 0); return ret; } /* * Invoked from NTP with the time keeper lock held, so lockless access is * fine. */ long ktime_get_ntp_seconds(unsigned int id) { return timekeeper_data[id].timekeeper.xtime_sec; } #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function * @phase_ts: Pointer to timespec64 structure representing phase timestamp * @raw_ts: Pointer to timespec64 structure representing raw timestamp */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { guard(raw_spinlock_irqsave)(&tk_core.lock); __hardpps(phase_ts, raw_ts); } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ #ifdef CONFIG_POSIX_AUX_CLOCKS #include "posix-timers.h" /* * Bitmap for the activated auxiliary timekeepers to allow lockless quick * checks in the hot paths without touching extra cache lines. If set, then * the state of the corresponding timekeeper has to be re-checked under * timekeeper::lock. */ static unsigned long aux_timekeepers; static inline unsigned int clockid_to_tkid(unsigned int id) { return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX; } static inline struct tk_data *aux_get_tk_data(clockid_t id) { if (!clockid_aux_valid(id)) return NULL; return &timekeeper_data[clockid_to_tkid(id)]; } /* Invoked from timekeeping after a clocksource change */ static void tk_aux_update_clocksource(void) { unsigned long active = READ_ONCE(aux_timekeepers); unsigned int id; for_each_set_bit(id, &active, BITS_PER_LONG) { struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; struct timekeeper *tks = &tkd->shadow_timekeeper; guard(raw_spinlock_irqsave)(&tkd->lock); if (!tks->clock_valid) continue; timekeeping_forward_now(tks); tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock); timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); } } static void tk_aux_advance(void) { unsigned long active = READ_ONCE(aux_timekeepers); unsigned int id; /* Lockless quick check to avoid extra cache lines */ for_each_set_bit(id, &active, BITS_PER_LONG) { struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; guard(raw_spinlock)(&aux_tkd->lock); if (aux_tkd->shadow_timekeeper.clock_valid) __timekeeping_advance(aux_tkd, TK_ADV_TICK); } } /** * ktime_get_aux - Get time for a AUX clock * @id: ID of the clock to read (CLOCK_AUX...) * @kt: Pointer to ktime_t to store the time stamp * * Returns: True if the timestamp is valid, false otherwise */ bool ktime_get_aux(clockid_t id, ktime_t *kt) { struct tk_data *aux_tkd = aux_get_tk_data(id); struct timekeeper *aux_tk; unsigned int seq; ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); if (!aux_tkd) return false; aux_tk = &aux_tkd->timekeeper; do { seq = read_seqcount_begin(&aux_tkd->seq); if (!aux_tk->clock_valid) return false; base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux); nsecs = timekeeping_get_ns(&aux_tk->tkr_mono); } while (read_seqcount_retry(&aux_tkd->seq, seq)); *kt = ktime_add_ns(base, nsecs); return true; } EXPORT_SYMBOL_GPL(ktime_get_aux); /** * ktime_get_aux_ts64 - Get time for a AUX clock * @id: ID of the clock to read (CLOCK_AUX...) * @ts: Pointer to timespec64 to store the time stamp * * Returns: True if the timestamp is valid, false otherwise */ bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts) { ktime_t now; if (!ktime_get_aux(id, &now)) return false; *ts = ktime_to_timespec64(now); return true; } EXPORT_SYMBOL_GPL(ktime_get_aux_ts64); static int aux_get_res(clockid_t id, struct timespec64 *tp) { if (!clockid_aux_valid(id)) return -ENODEV; tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC; tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC; return 0; } static int aux_get_timespec(clockid_t id, struct timespec64 *tp) { return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV; } static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) { struct tk_data *aux_tkd = aux_get_tk_data(id); struct timekeeper *aux_tks; ktime_t tnow, nsecs; if (!timespec64_valid_settod(tnew)) return -EINVAL; if (!aux_tkd) return -ENODEV; aux_tks = &aux_tkd->shadow_timekeeper; guard(raw_spinlock_irq)(&aux_tkd->lock); if (!aux_tks->clock_valid) return -ENODEV; /* Forward the timekeeper base time */ timekeeping_forward_now(aux_tks); /* * Get the updated base time. tkr_mono.base has not been * updated yet, so do that first. That makes the update * in timekeeping_update_from_shadow() redundant, but * that's harmless. After that @tnow can be calculated * by using tkr_mono::cycle_last, which has been set * by timekeeping_forward_now(). */ tk_update_ktime_data(aux_tks); nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last); tnow = ktime_add(aux_tks->tkr_mono.base, nsecs); /* * Calculate the new AUX offset as delta to @tnow ("monotonic"). * That avoids all the tk::xtime back and forth conversions as * xtime ("realtime") is not applicable for auxiliary clocks and * kept in sync with "monotonic". */ tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow)); timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); return 0; } static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc) { struct tk_data *aux_tkd = aux_get_tk_data(id); struct adjtimex_result result = { }; if (!aux_tkd) return -ENODEV; /* * @result is ignored for now as there are neither hrtimers nor a * RTC related to auxiliary clocks for now. */ return __do_adjtimex(aux_tkd, txc, &result); } const struct k_clock clock_aux = { .clock_getres = aux_get_res, .clock_get_timespec = aux_get_timespec, .clock_set = aux_clock_set, .clock_adj = aux_clock_adj, }; static void aux_clock_enable(clockid_t id) { struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw; struct tk_data *aux_tkd = aux_get_tk_data(id); struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper; /* Prevent the core timekeeper from changing. */ guard(raw_spinlock_irq)(&tk_core.lock); /* * Setup the auxiliary clock assuming that the raw core timekeeper * clock frequency conversion is close enough. Userspace has to * adjust for the deviation via clock_adjtime(2). */ guard(raw_spinlock_nested)(&aux_tkd->lock); /* Remove leftovers of a previous registration */ memset(aux_tks, 0, sizeof(*aux_tks)); /* Restore the timekeeper id */ aux_tks->id = aux_tkd->timekeeper.id; /* Setup the timekeeper based on the current system clocksource */ tk_setup_internals(aux_tks, tkr_raw->clock); /* Mark it valid and set it live */ aux_tks->clock_valid = true; timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); } static void aux_clock_disable(clockid_t id) { struct tk_data *aux_tkd = aux_get_tk_data(id); guard(raw_spinlock_irq)(&aux_tkd->lock); aux_tkd->shadow_timekeeper.clock_valid = false; timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); } static DEFINE_MUTEX(aux_clock_mutex); static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { /* Lazy atoi() as name is "0..7" */ int id = kobj->name[0] & 0x7; bool enable; if (!capable(CAP_SYS_TIME)) return -EPERM; if (kstrtobool(buf, &enable) < 0) return -EINVAL; guard(mutex)(&aux_clock_mutex); if (enable == test_bit(id, &aux_timekeepers)) return count; if (enable) { aux_clock_enable(CLOCK_AUX + id); set_bit(id, &aux_timekeepers); } else { aux_clock_disable(CLOCK_AUX + id); clear_bit(id, &aux_timekeepers); } return count; } static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned long active = READ_ONCE(aux_timekeepers); /* Lazy atoi() as name is "0..7" */ int id = kobj->name[0] & 0x7; return sysfs_emit(buf, "%d\n", test_bit(id, &active)); } static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable); static struct attribute *aux_clock_enable_attrs[] = { &aux_clock_enable_attr.attr, NULL }; static const struct attribute_group aux_clock_enable_attr_group = { .attrs = aux_clock_enable_attrs, }; static int __init tk_aux_sysfs_init(void) { struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); int ret = -ENOMEM; if (!tko) return ret; auxo = kobject_create_and_add("aux_clocks", tko); if (!auxo) goto err_clean; for (int i = 0; i < MAX_AUX_CLOCKS; i++) { char id[2] = { [0] = '0' + i, }; struct kobject *clk = kobject_create_and_add(id, auxo); if (!clk) { ret = -ENOMEM; goto err_clean; } ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); if (ret) goto err_clean; } return 0; err_clean: kobject_put(auxo); kobject_put(tko); return ret; } late_initcall(tk_aux_sysfs_init); static __init void tk_aux_setup(void) { for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) tkd_basic_setup(&timekeeper_data[i], i, false); } #endif /* CONFIG_POSIX_AUX_CLOCKS */
89 3 5 62 1 2 2 2 16 2 11 3 5 1 7 10 29 92 18 63 63 80 80 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 // SPDX-License-Identifier: GPL-2.0 /* * mm/fadvise.c * * Copyright (C) 2002, Linus Torvalds * * 11Jan2003 Andrew Morton * Initial version. */ #include <linux/kernel.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/backing-dev.h> #include <linux/fadvise.h> #include <linux/writeback.h> #include <linux/syscalls.h> #include <linux/swap.h> #include <asm/unistd.h> #include "internal.h" /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. */ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) { struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; loff_t endbyte; /* inclusive */ pgoff_t start_index; pgoff_t end_index; unsigned long nrpages; inode = file_inode(file); if (S_ISFIFO(inode->i_mode)) return -ESPIPE; mapping = file->f_mapping; if (!mapping || len < 0) return -EINVAL; bdi = inode_to_bdi(mapping->host); if (IS_DAX(inode) || (bdi == &noop_backing_dev_info)) { switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_WILLNEED: case POSIX_FADV_NOREUSE: case POSIX_FADV_DONTNEED: /* no bad return value, but ignore advice */ break; default: return -EINVAL; } return 0; } /* * Careful about overflows. Len == 0 means "as much as possible". Use * unsigned math because signed overflows are undefined and UBSan * complains. */ endbyte = (u64)offset + (u64)len; if (!len || endbyte < len) endbyte = LLONG_MAX; else endbyte--; /* inclusive */ switch (advice) { case POSIX_FADV_NORMAL: file->f_ra.ra_pages = bdi->ra_pages; spin_lock(&file->f_lock); file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE); spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: spin_lock(&file->f_lock); file->f_mode |= FMODE_RANDOM; spin_unlock(&file->f_lock); break; case POSIX_FADV_SEQUENTIAL: file->f_ra.ra_pages = bdi->ra_pages * 2; spin_lock(&file->f_lock); file->f_mode &= ~FMODE_RANDOM; spin_unlock(&file->f_lock); break; case POSIX_FADV_WILLNEED: /* First and last PARTIAL page! */ start_index = offset >> PAGE_SHIFT; end_index = endbyte >> PAGE_SHIFT; /* Careful about overflow on the "+1" */ nrpages = end_index - start_index + 1; if (!nrpages) nrpages = ~0UL; force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: spin_lock(&file->f_lock); file->f_mode |= FMODE_NOREUSE; spin_unlock(&file->f_lock); break; case POSIX_FADV_DONTNEED: filemap_flush_range(mapping, offset, endbyte); /* * First and last FULL page! Partial pages are deliberately * preserved on the expectation that it is better to preserve * needed memory than to discard unneeded memory. */ start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT; end_index = (endbyte >> PAGE_SHIFT); /* * The page at end_index will be inclusively discarded according * by invalidate_mapping_pages(), so subtracting 1 from * end_index means we will skip the last page. But if endbyte * is page aligned or is at the end of file, we should not skip * that page - discarding the last page is safe enough. */ if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK && endbyte != inode->i_size - 1) { /* First page is tricky as 0 - 1 = -1, but pgoff_t * is unsigned, so the end_index >= start_index * check below would be true and we'll discard the whole * file cache which is not what was asked. */ if (end_index == 0) break; end_index--; } if (end_index >= start_index) { unsigned long nr_failed = 0; /* * It's common to FADV_DONTNEED right after * the read or write that instantiates the * pages, in which case there will be some * sitting on the local LRU cache. Try to * avoid the expensive remote drain and the * second cache tree walk below by flushing * them out right away. */ lru_add_drain(); mapping_try_invalidate(mapping, start_index, end_index, &nr_failed); /* * The failures may be due to the folio being * in the LRU cache of a remote CPU. Drain all * caches and try again. */ if (nr_failed) { lru_add_drain_all(); invalidate_mapping_pages(mapping, start_index, end_index); } } break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL(generic_fadvise); int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) { if (file->f_op->fadvise) return file->f_op->fadvise(file, offset, len, advice); return generic_fadvise(file, offset, len, advice); } EXPORT_SYMBOL(vfs_fadvise); #ifdef CONFIG_ADVISE_SYSCALLS int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fadvise(fd_file(f), offset, len, advice); } SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) { return ksys_fadvise64_64(fd, offset, len, advice); } #ifdef __ARCH_WANT_SYS_FADVISE64 SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice) { return ksys_fadvise64_64(fd, offset, len, advice); } #endif #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FADVISE64_64) COMPAT_SYSCALL_DEFINE6(fadvise64_64, int, fd, compat_arg_u64_dual(offset), compat_arg_u64_dual(len), int, advice) { return ksys_fadvise64_64(fd, compat_arg_u64_glue(offset), compat_arg_u64_glue(len), advice); } #endif #endif
535 535 10 121 2 247 2 7 83 535 534 535 535 3 3 3 3 3 3 3 3 5 18 2 14 1685 1687 28 1684 1686 1686 1686 964 1690 117 1684 1685 1384 88 90 151 151 151 151 151 151 151 45 61 61 61 60 59 58 57 56 54 54 53 51 1 51 49 46 4 46 44 1 45 15 8 7 3 3 3 201 159 3 39 3 170 2 28 4 173 23 195 183 12 184 12 179 3 13 188 5 189 5 180 15 188 185 5 178 10 2 12 171 5 3 5 13 159 153 13 148 6 6 10 2 157 2 137 8 8 6 7 148 1 148 23 127 148 1 138 5 6 137 4 7 56 56 45 24 14 1 1 1 1 1 1 3 1 1 5 117 1 51 1 1 23 91 92 27 31 53 4 64 11 60 11 62 25 25 17 55 10 51 28 62 2 65 65 28 118 96 1 46 1 1 31 62 1 61 4 2 26 27 1 28 27 1 27 2 2 2 1 28 28 34 95 7 83 94 86 8 86 6 6 88 5 3 28 77 87 7 94 8 84 11 91 3 91 3 90 2 87 9 93 1 88 8 90 10 89 89 5 93 4 4 4 2 1 4 3 7 2 2 15 12 3 2 2 2 2 2 2 89 90 88 2801 2798 2799 1655 1653 1 1654 4440 4442 4380 2691 2691 116 116 116 1541 1541 535 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 // SPDX-License-Identifier: GPL-2.0-only /* * net/core/fib_rules.c Generic Routing Rules * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/inet_dscp.h> #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> #include <linux/indirect_call_wrapper.h> #if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES) #ifdef CONFIG_IP_MULTIPLE_TABLES #define INDIRECT_CALL_MT(f, f2, f1, ...) \ INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) #endif #elif defined(CONFIG_IP_MULTIPLE_TABLES) #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__) #endif static const struct fib_kuid_range fib_kuid_range_unset = { KUIDT_INIT(0), KUIDT_INIT(~0), }; bool fib_rule_matchall(const struct fib_rule *rule) { if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) || rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) return false; if (fib_rule_port_range_set(&rule->sport_range)) return false; if (fib_rule_port_range_set(&rule->dport_range)) return false; return true; } EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table) { struct fib_rule *r; r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; refcount_set(&r->refcnt, 1); r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; /* The lock is not required here, the list in unreachable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); return 0; } EXPORT_SYMBOL(fib_default_rule_add); static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; if (!list_empty(&ops->rules_list)) { pos = ops->rules_list.next; if (pos->next != &ops->rules_list) { rule = list_entry(pos->next, struct fib_rule, list); if (rule->pref) return rule->pref - 1; } } return 0; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); static struct fib_rules_ops *lookup_rules_ops(const struct net *net, int family) { struct fib_rules_ops *ops; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (ops->family == family) { if (!try_module_get(ops->owner)) ops = NULL; rcu_read_unlock(); return ops; } } rcu_read_unlock(); return NULL; } static void rules_ops_put(struct fib_rules_ops *ops) { if (ops) module_put(ops->owner); } static void flush_route_cache(struct fib_rules_ops *ops) { if (ops->flush_cache) ops->flush_cache(ops); } static int __fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; struct fib_rules_ops *o; struct net *net; net = ops->fro_net; if (ops->rule_size < sizeof(struct fib_rule)) return -EINVAL; if (ops->match == NULL || ops->configure == NULL || ops->compare == NULL || ops->fill == NULL || ops->action == NULL) return -EINVAL; spin_lock(&net->rules_mod_lock); list_for_each_entry(o, &net->rules_ops, list) if (ops->family == o->family) goto errout; list_add_tail_rcu(&ops->list, &net->rules_ops); err = 0; errout: spin_unlock(&net->rules_mod_lock); return err; } struct fib_rules_ops * fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) { struct fib_rules_ops *ops; int err; ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&ops->rules_list); ops->fro_net = net; err = __fib_rules_register(ops); if (err) { kfree(ops); ops = ERR_PTR(err); } return ops; } EXPORT_SYMBOL_GPL(fib_rules_register); static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); if (ops->delete) ops->delete(rule); fib_rule_put(rule); } } void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; spin_lock(&net->rules_mod_lock); list_del_rcu(&ops->list); spin_unlock(&net->rules_mod_lock); fib_rules_cleanup_ops(ops); kfree_rcu(ops, rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); static int uid_range_set(struct fib_kuid_range *range) { return uid_valid(range->start) && uid_valid(range->end); } static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) { struct fib_rule_uid_range *in; struct fib_kuid_range out; in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); out.start = make_kuid(current_user_ns(), in->start); out.end = make_kuid(current_user_ns(), in->end); return out; } static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) { struct fib_rule_uid_range out = { from_kuid_munged(current_user_ns(), range->start), from_kuid_munged(current_user_ns(), range->end) }; return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); } static int nla_get_port_range(struct nlattr *pattr, struct fib_rule_port_range *port_range) { const struct fib_rule_port_range *pr = nla_data(pattr); if (!fib_rule_port_range_valid(pr)) return -EINVAL; port_range->start = pr->start; port_range->end = pr->end; return 0; } static int nla_put_port_range(struct sk_buff *skb, int attrtype, struct fib_rule_port_range *range) { return nla_put(skb, attrtype, sizeof(*range), range); } static bool fib_rule_iif_match(const struct fib_rule *rule, int iifindex, const struct flowi *fl) { u8 iif_is_l3_master = READ_ONCE(rule->iif_is_l3_master); return iif_is_l3_master ? l3mdev_fib_rule_iif_match(fl, iifindex) : fl->flowi_iif == iifindex; } static bool fib_rule_oif_match(const struct fib_rule *rule, int oifindex, const struct flowi *fl) { u8 oif_is_l3_master = READ_ONCE(rule->oif_is_l3_master); return oif_is_l3_master ? l3mdev_fib_rule_oif_match(fl, oifindex) : fl->flowi_oif == oifindex; } static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { int iifindex, oifindex, ret = 0; iifindex = READ_ONCE(rule->iifindex); if (iifindex && !fib_rule_iif_match(rule, iifindex, fl)) goto out; oifindex = READ_ONCE(rule->oifindex); if (oifindex && !fib_rule_oif_match(rule, oifindex, fl)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) goto out; if (uid_lt(fl->flowi_uid, rule->uid_range.start) || uid_gt(fl->flowi_uid, rule->uid_range.end)) goto out; ret = INDIRECT_CALL_MT(ops->match, fib6_rule_match, fib4_rule_match, rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; } int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { struct fib_rule *rule; int err; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: if (!fib_rule_match(rule, ops, fl, flags, arg)) continue; if (rule->action == FR_ACT_GOTO) { struct fib_rule *target; target = rcu_dereference(rule->ctarget); if (target == NULL) { continue; } else { rule = target; goto jumped; } } else if (rule->action == FR_ACT_NOP) continue; else err = INDIRECT_CALL_MT(ops->action, fib6_rule_action, fib4_rule_action, rule, fl, flags, arg); if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, fib6_rule_suppress, fib4_rule_suppress, rule, flags, arg)) continue; if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(refcount_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } break; } } err = -ESRCH; out: rcu_read_unlock(); return err; } EXPORT_SYMBOL_GPL(fib_rules_lookup); static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_rule *rule, int family, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, .info.extack = extack, .rule = rule, }; return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, enum fib_event_type event_type, struct fib_rule *rule, struct fib_rules_ops *ops, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = ops->family, .info.extack = extack, .rule = rule, }; ASSERT_RTNL_NET(net); /* Paired with READ_ONCE() in fib_rules_seq() */ WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1); return call_fib_notifiers(net, event_type, &info.info); } /* Called with rcu_read_lock() */ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; list_for_each_entry_rcu(rule, &ops->rules_list, list) { err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family, extack); if (err) break; } rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); unsigned int fib_rules_seq_read(const struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; ops = lookup_rules_ops(net, family); if (!ops) return 0; /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */ fib_rules_seq = READ_ONCE(ops->fib_rules_seq); rules_ops_put(ops); return fib_rules_seq; } EXPORT_SYMBOL_GPL(fib_rules_seq_read); static struct fib_rule *rule_find(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule, bool user_priority) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (rule->action && r->action != rule->action) continue; if (rule->table && r->table != rule->table) continue; if (user_priority && r->pref != rule->pref) continue; if (rule->iifname[0] && memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (rule->oifname[0] && memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (rule->mark && r->mark != rule->mark) continue; if (rule->suppress_ifgroup != -1 && r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (rule->suppress_prefixlen != -1 && r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; if (rule->tun_id && r->tun_id != rule->tun_id) continue; if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; if (uid_range_set(&rule->uid_range) && (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end))) continue; if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; if (rule->proto && r->proto != rule->proto) continue; if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (rule->sport_mask && r->sport_mask != rule->sport_mask) continue; if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (rule->dport_mask && r->dport_mask != rule->dport_mask) continue; if (!ops->compare(r, frh, tb)) continue; return r; } return NULL; } #ifdef CONFIG_NET_L3_MASTER_DEV static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { nlrule->l3mdev = nla_get_u8(nla); if (nlrule->l3mdev != 1) { NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); return -1; } return 0; } #else static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); return -1; } #endif static int fib_nl2rule_port_mask(const struct nlattr *mask_attr, const struct fib_rule_port_range *range, u16 *port_mask, struct netlink_ext_ack *extack) { if (!fib_rule_port_range_valid(range)) { NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Cannot specify port mask without port value"); return -EINVAL; } if (fib_rule_port_is_range(range)) { NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Cannot specify port mask for port range"); return -EINVAL; } if (range->start & ~nla_get_u16(mask_attr)) { NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask"); return -EINVAL; } *port_mask = nla_get_u16(mask_attr); return 0; } static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, struct nlattr *tb[], struct fib_rule **rule, bool *user_priority) { struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rule *nlrule = NULL; int err = -EINVAL; if (frh->src_len) if (!tb[FRA_SRC] || frh->src_len > (ops->addr_size * 8) || nla_len(tb[FRA_SRC]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid source address"); goto errout; } if (frh->dst_len) if (!tb[FRA_DST] || frh->dst_len > (ops->addr_size * 8) || nla_len(tb[FRA_DST]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid dst address"); goto errout; } nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (!nlrule) { err = -ENOMEM; goto errout; } refcount_set(&nlrule->refcnt, 1); nlrule->fr_net = net; if (tb[FRA_PRIORITY]) { nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); *user_priority = true; } nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC); if (tb[FRA_IIFNAME]) { nlrule->iifindex = -1; nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); } if (tb[FRA_OIFNAME]) { nlrule->oifindex = -1; nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); } if (tb[FRA_FWMARK]) { nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); if (nlrule->mark) /* compatibility: if the mark value is non-zero all bits * are compared unless a mask is explicitly specified. */ nlrule->mark_mask = 0xFFFFFFFF; } if (tb[FRA_FWMASK]) nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); if (tb[FRA_TUN_ID]) nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); if (tb[FRA_L3MDEV] && fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) goto errout_free; nlrule->action = frh->action; nlrule->flags = frh->flags; nlrule->table = frh_get_table(frh, tb); if (tb[FRA_SUPPRESS_PREFIXLEN]) nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); else nlrule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); else nlrule->suppress_ifgroup = -1; if (tb[FRA_GOTO]) { if (nlrule->action != FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; } nlrule->target = nla_get_u32(tb[FRA_GOTO]); } else if (nlrule->action == FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; } if (nlrule->l3mdev && nlrule->table) { NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); goto errout_free; } if (tb[FRA_UID_RANGE]) { if (current_user_ns() != net->user_ns) { err = -EPERM; NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; } nlrule->uid_range = nla_get_kuid_range(tb); if (!uid_range_set(&nlrule->uid_range) || !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { NL_SET_ERR_MSG(extack, "Invalid uid range"); goto errout_free; } } else { nlrule->uid_range = fib_kuid_range_unset; } if (tb[FRA_IP_PROTO]) nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); if (tb[FRA_SPORT_RANGE]) { err = nla_get_port_range(tb[FRA_SPORT_RANGE], &nlrule->sport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } if (!fib_rule_port_is_range(&nlrule->sport_range)) nlrule->sport_mask = U16_MAX; } if (tb[FRA_SPORT_MASK]) { err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK], &nlrule->sport_range, &nlrule->sport_mask, extack); if (err) goto errout_free; } if (tb[FRA_DPORT_RANGE]) { err = nla_get_port_range(tb[FRA_DPORT_RANGE], &nlrule->dport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } if (!fib_rule_port_is_range(&nlrule->dport_range)) nlrule->dport_mask = U16_MAX; } if (tb[FRA_DPORT_MASK]) { err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK], &nlrule->dport_range, &nlrule->dport_mask, extack); if (err) goto errout_free; } *rule = nlrule; return 0; errout_free: kfree(nlrule); errout: return err; } static int fib_nl2rule_rtnl(struct fib_rule *nlrule, struct fib_rules_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack) { if (!tb[FRA_PRIORITY]) nlrule->pref = fib_default_rule_pref(ops); /* Backward jumps are prohibited to avoid endless loops */ if (tb[FRA_GOTO] && nlrule->target <= nlrule->pref) { NL_SET_ERR_MSG(extack, "Backward goto not supported"); return -EINVAL; } if (tb[FRA_IIFNAME]) { struct net_device *dev; dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname); if (dev) { nlrule->iifindex = dev->ifindex; nlrule->iif_is_l3_master = netif_is_l3_master(dev); } } if (tb[FRA_OIFNAME]) { struct net_device *dev; dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname); if (dev) { nlrule->oifindex = dev->ifindex; nlrule->oif_is_l3_master = netif_is_l3_master(dev); } } return 0; } static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (r->action != rule->action) continue; if (r->table != rule->table) continue; if (r->pref != rule->pref) continue; if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (r->mark != rule->mark) continue; if (r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (r->mark_mask != rule->mark_mask) continue; if (r->tun_id != rule->tun_id) continue; if (r->l3mdev != rule->l3mdev) continue; if (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end)) continue; if (r->ip_proto != rule->ip_proto) continue; if (r->proto != rule->proto) continue; if (!fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (r->sport_mask != rule->sport_mask) continue; if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (r->dport_mask != rule->dport_mask) continue; if (!ops->compare(r, frh, tb)) continue; return 1; } return 0; } static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_PRIORITY] = { .type = NLA_U32 }, [FRA_FWMARK] = { .type = NLA_U32 }, [FRA_FLOW] = { .type = NLA_U32 }, [FRA_TUN_ID] = { .type = NLA_U64 }, [FRA_FWMASK] = { .type = NLA_U32 }, [FRA_TABLE] = { .type = NLA_U32 }, [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, [FRA_GOTO] = { .type = NLA_U32 }, [FRA_L3MDEV] = { .type = NLA_U8 }, [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, [FRA_PROTOCOL] = { .type = NLA_U8 }, [FRA_IP_PROTO] = { .type = NLA_U8 }, [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), [FRA_FLOWLABEL] = { .type = NLA_BE32 }, [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 }, [FRA_SPORT_MASK] = { .type = NLA_U16 }, [FRA_DPORT_MASK] = { .type = NLA_U16 }, [FRA_DSCP_MASK] = NLA_POLICY_MASK(NLA_U8, INET_DSCP_MASK >> 2), }; int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, bool rtnl_held) { struct fib_rule *rule = NULL, *r, *last = NULL; int err = -EINVAL, unresolved = 0; struct fib_rules_ops *ops = NULL; struct nlattr *tb[FRA_MAX + 1]; bool user_priority = false; struct fib_rule_hdr *frh; frh = nlmsg_payload(nlh, sizeof(*frh)); if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (!ops) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(net, nlh, extack, ops, tb, &rule, &user_priority); if (err) goto errout; if (!rtnl_held) rtnl_net_lock(net); err = fib_nl2rule_rtnl(rule, ops, tb, extack); if (err) goto errout_free; if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; goto errout_free; } err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); if (err < 0) goto errout_free; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { RCU_INIT_POINTER(rule->ctarget, r); break; } } if (rcu_dereference_protected(rule->ctarget, 1) == NULL) unresolved = 1; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; last = r; } if (last) list_add_rcu(&rule->list, &last->list); else list_add_rcu(&rule->list, &ops->rules_list); if (ops->unresolved_rules) { /* * There are unresolved goto rules in the list, check if * any of them are pointing to this new rule. */ list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref && rtnl_dereference(r->ctarget) == NULL) { rcu_assign_pointer(r->ctarget, rule); if (--ops->unresolved_rules == 0) break; } } } if (rule->action == FR_ACT_GOTO) ops->nr_goto_rules++; if (unresolved) ops->unresolved_rules++; if (rule->tun_id) ip_tunnel_need_metadata(); fib_rule_get(rule); if (!rtnl_held) rtnl_net_unlock(net); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: if (!rtnl_held) rtnl_net_unlock(net); kfree(rule); errout: rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_newrule); static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return fib_newrule(sock_net(skb->sk), skb, nlh, extack, false); } int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, bool rtnl_held) { struct fib_rule *rule = NULL, *nlrule = NULL; struct fib_rules_ops *ops = NULL; struct nlattr *tb[FRA_MAX+1]; bool user_priority = false; struct fib_rule_hdr *frh; int err = -EINVAL; frh = nlmsg_payload(nlh, sizeof(*frh)); if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(net, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) goto errout; if (!rtnl_held) rtnl_net_lock(net); err = fib_nl2rule_rtnl(nlrule, ops, tb, extack); if (err) goto errout_free; rule = rule_find(ops, frh, tb, nlrule, user_priority); if (!rule) { err = -ENOENT; goto errout_free; } if (rule->flags & FIB_RULE_PERMANENT) { err = -EPERM; goto errout_free; } if (ops->delete) { err = ops->delete(rule); if (err) goto errout_free; } if (rule->tun_id) ip_tunnel_unneed_metadata(); list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { ops->nr_goto_rules--; if (rtnl_dereference(rule->ctarget) == NULL) ops->unresolved_rules--; } /* * Check if this rule is a target to any of them. If so, * adjust to the next one with the same preference or * disable them. As this operation is eventually very * expensive, it is only performed if goto rules, except * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { struct fib_rule *n, *r; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) n = NULL; list_for_each_entry(r, &ops->rules_list, list) { if (rtnl_dereference(r->ctarget) != rule) continue; rcu_assign_pointer(r->ctarget, n); if (!n) ops->unresolved_rules++; } } call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); if (!rtnl_held) rtnl_net_unlock(net); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); kfree(nlrule); return 0; errout_free: if (!rtnl_held) rtnl_net_unlock(net); kfree(nlrule); errout: rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_delrule); static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return fib_delrule(sock_net(skb->sk), skb, nlh, extack, false); } static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ + nla_total_size(sizeof(struct fib_kuid_range)) + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */ + nla_total_size(2) /* FRA_SPORT_MASK */ + nla_total_size(2); /* FRA_DPORT_MASK */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); return payload; } static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, u32 pid, u32 seq, int type, int flags, struct fib_rules_ops *ops) { struct nlmsghdr *nlh; struct fib_rule_hdr *frh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags); if (nlh == NULL) return -EMSGSIZE; frh = nlmsg_data(nlh); frh->family = ops->family; frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) goto nla_put_failure; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; if (READ_ONCE(rule->iifindex) == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; if (READ_ONCE(rule->oifindex) == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } if ((rule->pref && nla_put_u32(skb, FRA_PRIORITY, rule->pref)) || (rule->mark && nla_put_u32(skb, FRA_FWMARK, rule->mark)) || ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || (uid_range_set(&rule->uid_range) && nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || (rule->sport_mask && nla_put_u16(skb, FRA_SPORT_MASK, rule->sport_mask)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || (rule->dport_mask && nla_put_u16(skb, FRA_DPORT_MASK, rule->dport_mask)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) goto nla_put_failure; } if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, struct fib_rules_ops *ops) { int idx = 0; struct fib_rule *rule; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWRULE, NLM_F_MULTI, ops); if (err) break; skip: idx++; } rcu_read_unlock(); cb->args[1] = idx; rules_ops_put(ops); return err; } static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib_rule_hdr *frh; frh = nlmsg_payload(nlh, sizeof(*frh)); if (!frh) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for fib rule dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); return -EINVAL; } return 0; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; int err, idx = 0, family; if (cb->strict_check) { err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; } family = rtnl_msg_family(nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ ops = lookup_rules_ops(net, family); if (ops == NULL) return -EAFNOSUPPORT; return dump_rules(skb, cb, ops); } err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; err = dump_rules(skb, cb, ops); if (err < 0) break; cb->args[1] = 0; skip: idx++; } rcu_read_unlock(); cb->args[0] = idx; return err; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid) { struct net *net; struct sk_buff *skb; int err = -ENOMEM; net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); if (skb == NULL) goto errout; err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops); if (err < 0) { /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) { WRITE_ONCE(rule->iifindex, dev->ifindex); WRITE_ONCE(rule->iif_is_l3_master, netif_is_l3_master(dev)); } if (rule->oifindex == -1 && strcmp(dev->name, rule->oifname) == 0) { WRITE_ONCE(rule->oifindex, dev->ifindex); WRITE_ONCE(rule->oif_is_l3_master, netif_is_l3_master(dev)); } } } static void detach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) { WRITE_ONCE(rule->iifindex, -1); WRITE_ONCE(rule->iif_is_l3_master, false); } if (rule->oifindex == dev->ifindex) { WRITE_ONCE(rule->oifindex, -1); WRITE_ONCE(rule->oif_is_l3_master, false); } } } static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct fib_rules_ops *ops; ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_CHANGENAME: list_for_each_entry(ops, &net->rules_ops, list) { detach_rules(&ops->rules_list, dev); attach_rules(&ops->rules_list, dev); } break; case NETDEV_UNREGISTER: list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; } return NOTIFY_DONE; } static struct notifier_block fib_rules_notifier = { .notifier_call = fib_rules_event, }; static int __net_init fib_rules_net_init(struct net *net) { INIT_LIST_HEAD(&net->rules_ops); spin_lock_init(&net->rules_mod_lock); return 0; } static void __net_exit fib_rules_net_exit(struct net *net) { WARN_ON_ONCE(!list_empty(&net->rules_ops)); } static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, .exit = fib_rules_net_exit, }; static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule, .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule, .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; static int __init fib_rules_init(void) { int err; rtnl_register_many(fib_rules_rtnl_msg_handlers); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) goto fail; err = register_netdevice_notifier(&fib_rules_notifier); if (err < 0) goto fail_unregister; return 0; fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: rtnl_unregister_many(fib_rules_rtnl_msg_handlers); return err; } subsys_initcall(fib_rules_init);
1 1 535 535 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 // SPDX-License-Identifier: GPL-2.0-only /* * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/slab.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("ip6tables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_FILTER, }; static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ static bool forward = true; module_param(forward, bool, 0000); static int ip6table_filter_table_init(struct net *net) { struct ip6t_replace *repl; int err; repl = ip6t_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ip6t_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : NF_DROP - 1; err = ip6t_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); return err; } static int __net_init ip6table_filter_net_init(struct net *net) { if (!forward) return ip6table_filter_table_init(net); return 0; } static void __net_exit ip6table_filter_net_pre_exit(struct net *net) { ip6t_unregister_table_pre_exit(net, "filter"); } static void __net_exit ip6table_filter_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "filter"); } static struct pernet_operations ip6table_filter_net_ops = { .init = ip6table_filter_net_init, .pre_exit = ip6table_filter_net_pre_exit, .exit = ip6table_filter_net_exit, }; static int __init ip6table_filter_init(void) { int ret = xt_register_template(&packet_filter, ip6table_filter_table_init); if (ret < 0) return ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ip6t_do_table); if (IS_ERR(filter_ops)) { xt_unregister_template(&packet_filter); return PTR_ERR(filter_ops); } ret = register_pernet_subsys(&ip6table_filter_net_ops); if (ret < 0) { xt_unregister_template(&packet_filter); kfree(filter_ops); return ret; } return ret; } static void __exit ip6table_filter_fini(void) { unregister_pernet_subsys(&ip6table_filter_net_ops); xt_unregister_template(&packet_filter); kfree(filter_ops); } module_init(ip6table_filter_init); module_exit(ip6table_filter_fini);
193 194 195 2 2 4 4 4 6 6 4437 4266 195 7 2 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 // SPDX-License-Identifier: GPL-2.0-or-later /* * "LAPB via ethernet" driver release 001 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * This is a "pseudo" network driver to allow LAPB over Ethernet. * * This driver can use any ethernet destination address, and can be * limited to accept frames from one dedicated ethernet card only. * * History * LAPBETH 001 Jonathan Naylor Cloned from bpqether.c * 2000-10-29 Henner Eisen lapb_data_indication() return status. * 2000-11-14 Henner Eisen dev_hold/put, NETDEV_GOING_DOWN support */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/stat.h> #include <linux/module.h> #include <linux/lapb.h> #include <linux/init.h> #include <net/netdev_lock.h> #include <net/x25device.h> static const u8 bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; /* If this number is made larger, check that the temporary string buffer * in lapbeth_new_device is large enough to store the probe device name. */ #define MAXLAPBDEV 100 struct lapbethdev { struct list_head node; struct net_device *ethdev; /* link to ethernet device */ struct net_device *axdev; /* lapbeth device (lapb#) */ bool up; spinlock_t up_lock; /* Protects "up" */ struct sk_buff_head rx_queue; struct napi_struct napi; }; static LIST_HEAD(lapbeth_devices); static void lapbeth_connected(struct net_device *dev, int reason); static void lapbeth_disconnected(struct net_device *dev, int reason); /* ------------------------------------------------------------------------ */ /* Get the LAPB device for the ethernet device */ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev) { struct lapbethdev *lapbeth; list_for_each_entry_rcu(lapbeth, &lapbeth_devices, node, lockdep_rtnl_is_held()) { if (lapbeth->ethdev == dev) return lapbeth; } return NULL; } static __inline__ int dev_is_ethdev(struct net_device *dev) { return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev); } /* ------------------------------------------------------------------------ */ static int lapbeth_napi_poll(struct napi_struct *napi, int budget) { struct lapbethdev *lapbeth = container_of(napi, struct lapbethdev, napi); struct sk_buff *skb; int processed = 0; for (; processed < budget; ++processed) { skb = skb_dequeue(&lapbeth->rx_queue); if (!skb) break; netif_receive_skb_core(skb); } if (processed < budget) napi_complete(napi); return processed; } /* Receive a LAPB frame via an ethernet interface. */ static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { int len, err; struct lapbethdev *lapbeth; if (dev_net(dev) != &init_net) goto drop; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (!pskb_may_pull(skb, 2)) goto drop; rcu_read_lock(); lapbeth = lapbeth_get_x25_dev(dev); if (!lapbeth) goto drop_unlock_rcu; spin_lock_bh(&lapbeth->up_lock); if (!lapbeth->up) goto drop_unlock; len = skb->data[0] + skb->data[1] * 256; dev->stats.rx_packets++; dev->stats.rx_bytes += len; skb_pull(skb, 2); /* Remove the length bytes */ skb_trim(skb, len); /* Set the length of the data */ err = lapb_data_received(lapbeth->axdev, skb); if (err != LAPB_OK) { printk(KERN_DEBUG "lapbether: lapb_data_received err - %d\n", err); goto drop_unlock; } out: spin_unlock_bh(&lapbeth->up_lock); rcu_read_unlock(); return 0; drop_unlock: kfree_skb(skb); goto out; drop_unlock_rcu: rcu_read_unlock(); drop: kfree_skb(skb); return 0; } static int lapbeth_data_indication(struct net_device *dev, struct sk_buff *skb) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; if (skb_cow(skb, 1)) { kfree_skb(skb); return NET_RX_DROP; } skb_push(skb, 1); ptr = skb->data; *ptr = X25_IFACE_DATA; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); return NET_RX_SUCCESS; } /* Send a LAPB frame via an ethernet interface */ static netdev_tx_t lapbeth_xmit(struct sk_buff *skb, struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; spin_lock_bh(&lapbeth->up_lock); if (!lapbeth->up) goto drop; /* There should be a pseudo header of 1 byte added by upper layers. * Check to make sure it is there before reading it. */ if (skb->len < 1) goto drop; switch (skb->data[0]) { case X25_IFACE_DATA: break; case X25_IFACE_CONNECT: err = lapb_connect_request(dev); if (err == LAPB_CONNECTED) lapbeth_connected(dev, LAPB_OK); else if (err != LAPB_OK) pr_err("lapb_connect_request error: %d\n", err); goto drop; case X25_IFACE_DISCONNECT: err = lapb_disconnect_request(dev); if (err == LAPB_NOTCONNECTED) lapbeth_disconnected(dev, LAPB_OK); else if (err != LAPB_OK) pr_err("lapb_disconnect_request err: %d\n", err); fallthrough; default: goto drop; } skb_pull(skb, 1); err = lapb_data_request(dev, skb); if (err != LAPB_OK) { pr_err("lapb_data_request error - %d\n", err); goto drop; } out: spin_unlock_bh(&lapbeth->up_lock); return NETDEV_TX_OK; drop: kfree_skb(skb); goto out; } static void lapbeth_data_transmit(struct net_device *ndev, struct sk_buff *skb) { struct lapbethdev *lapbeth = netdev_priv(ndev); unsigned char *ptr; struct net_device *dev; int size = skb->len; ptr = skb_push(skb, 2); *ptr++ = size % 256; *ptr++ = size / 256; ndev->stats.tx_packets++; ndev->stats.tx_bytes += size; skb->dev = dev = lapbeth->ethdev; skb->protocol = htons(ETH_P_DEC); skb_reset_network_header(skb); dev_hard_header(skb, dev, ETH_P_DEC, bcast_addr, NULL, 0); dev_queue_xmit(skb); } static void lapbeth_connected(struct net_device *dev, int reason) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; struct sk_buff *skb = __dev_alloc_skb(1, GFP_ATOMIC | __GFP_NOMEMALLOC); if (!skb) return; ptr = skb_put(skb, 1); *ptr = X25_IFACE_CONNECT; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); } static void lapbeth_disconnected(struct net_device *dev, int reason) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; struct sk_buff *skb = __dev_alloc_skb(1, GFP_ATOMIC | __GFP_NOMEMALLOC); if (!skb) return; ptr = skb_put(skb, 1); *ptr = X25_IFACE_DISCONNECT; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); } /* Set AX.25 callsign */ static int lapbeth_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; dev_addr_set(dev, sa->sa_data); return 0; } static const struct lapb_register_struct lapbeth_callbacks = { .connect_confirmation = lapbeth_connected, .connect_indication = lapbeth_connected, .disconnect_confirmation = lapbeth_disconnected, .disconnect_indication = lapbeth_disconnected, .data_indication = lapbeth_data_indication, .data_transmit = lapbeth_data_transmit, }; /* open/close a device */ static int lapbeth_open(struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; napi_enable(&lapbeth->napi); err = lapb_register(dev, &lapbeth_callbacks); if (err != LAPB_OK) { napi_disable(&lapbeth->napi); pr_err("lapb_register error: %d\n", err); return -ENODEV; } spin_lock_bh(&lapbeth->up_lock); lapbeth->up = true; spin_unlock_bh(&lapbeth->up_lock); return 0; } static int lapbeth_close(struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; spin_lock_bh(&lapbeth->up_lock); lapbeth->up = false; spin_unlock_bh(&lapbeth->up_lock); err = lapb_unregister(dev); if (err != LAPB_OK) pr_err("lapb_unregister error: %d\n", err); napi_disable(&lapbeth->napi); return 0; } /* ------------------------------------------------------------------------ */ static const struct net_device_ops lapbeth_netdev_ops = { .ndo_open = lapbeth_open, .ndo_stop = lapbeth_close, .ndo_start_xmit = lapbeth_xmit, .ndo_set_mac_address = lapbeth_set_mac_address, }; static void lapbeth_setup(struct net_device *dev) { netdev_lockdep_set_classes(dev); dev->netdev_ops = &lapbeth_netdev_ops; dev->needs_free_netdev = true; dev->type = ARPHRD_X25; dev->hard_header_len = 0; dev->mtu = 1000; dev->addr_len = 0; } /* Setup a new device. */ static int lapbeth_new_device(struct net_device *dev) { struct net_device *ndev; struct lapbethdev *lapbeth; int rc = -ENOMEM; ASSERT_RTNL(); if (dev->type != ARPHRD_ETHER) return -EINVAL; ndev = alloc_netdev(sizeof(*lapbeth), "lapb%d", NET_NAME_UNKNOWN, lapbeth_setup); if (!ndev) goto out; /* When transmitting data: * first this driver removes a pseudo header of 1 byte, * then the lapb module prepends an LAPB header of at most 3 bytes, * then this driver prepends a length field of 2 bytes, * then the underlying Ethernet device prepends its own header. */ ndev->needed_headroom = -1 + 3 + 2 + dev->hard_header_len + dev->needed_headroom; ndev->needed_tailroom = dev->needed_tailroom; lapbeth = netdev_priv(ndev); lapbeth->axdev = ndev; dev_hold(dev); lapbeth->ethdev = dev; lapbeth->up = false; spin_lock_init(&lapbeth->up_lock); skb_queue_head_init(&lapbeth->rx_queue); netif_napi_add_weight(ndev, &lapbeth->napi, lapbeth_napi_poll, 16); rc = -EIO; if (register_netdevice(ndev)) goto fail; list_add_rcu(&lapbeth->node, &lapbeth_devices); rc = 0; out: return rc; fail: dev_put(dev); free_netdev(ndev); goto out; } /* Free a lapb network device. */ static void lapbeth_free_device(struct lapbethdev *lapbeth) { dev_put(lapbeth->ethdev); list_del_rcu(&lapbeth->node); unregister_netdevice(lapbeth->axdev); } /* Handle device status changes. * * Called from notifier with RTNL held. */ static int lapbeth_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct lapbethdev *lapbeth; struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev_net(dev) != &init_net) return NOTIFY_DONE; if (!dev_is_ethdev(dev) && !lapbeth_get_x25_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: /* New ethernet device -> new LAPB interface */ if (!lapbeth_get_x25_dev(dev)) lapbeth_new_device(dev); break; case NETDEV_GOING_DOWN: /* ethernet device closes -> close LAPB interface */ lapbeth = lapbeth_get_x25_dev(dev); if (lapbeth) dev_close(lapbeth->axdev); break; case NETDEV_UNREGISTER: /* ethernet device disappears -> remove LAPB interface */ lapbeth = lapbeth_get_x25_dev(dev); if (lapbeth) lapbeth_free_device(lapbeth); break; } return NOTIFY_DONE; } /* ------------------------------------------------------------------------ */ static struct packet_type lapbeth_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_DEC), .func = lapbeth_rcv, }; static struct notifier_block lapbeth_dev_notifier = { .notifier_call = lapbeth_device_event, }; static const char banner[] __initconst = KERN_INFO "LAPB Ethernet driver version 0.02\n"; static int __init lapbeth_init_driver(void) { dev_add_pack(&lapbeth_packet_type); register_netdevice_notifier(&lapbeth_dev_notifier); printk(banner); return 0; } module_init(lapbeth_init_driver); static void __exit lapbeth_cleanup_driver(void) { struct lapbethdev *lapbeth; struct list_head *entry, *tmp; dev_remove_pack(&lapbeth_packet_type); unregister_netdevice_notifier(&lapbeth_dev_notifier); rtnl_lock(); list_for_each_safe(entry, tmp, &lapbeth_devices) { lapbeth = list_entry(entry, struct lapbethdev, node); dev_put(lapbeth->ethdev); unregister_netdevice(lapbeth->axdev); } rtnl_unlock(); } module_exit(lapbeth_cleanup_driver); MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The unofficial LAPB over Ethernet driver"); MODULE_LICENSE("GPL");
2509 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Access to user system call parameters and results * * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. * * See asm-generic/syscall.h for descriptions of what we must do here. */ #ifndef _ASM_X86_SYSCALL_H #define _ASM_X86_SYSCALL_H #include <uapi/linux/audit.h> #include <linux/sched.h> #include <linux/err.h> #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> /* This is used purely for kernel/trace/trace_syscalls.c */ typedef long (*sys_call_ptr_t)(const struct pt_regs *); extern const sys_call_ptr_t sys_call_table[]; /* * These may not exist, but still put the prototypes in so we * can use IS_ENABLED(). */ extern long ia32_sys_call(const struct pt_regs *, unsigned int nr); extern long x32_sys_call(const struct pt_regs *, unsigned int nr); extern long x64_sys_call(const struct pt_regs *, unsigned int nr); /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons * sign-extend the low 32 bits. */ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { return regs->orig_ax; } static inline void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) { regs->orig_ax = nr; } static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { regs->ax = regs->orig_ax; } static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { unsigned long error = regs->ax; #ifdef CONFIG_IA32_EMULATION /* * TS_COMPAT is set for 32-bit syscall entries and then * remains set until we return to user mode. */ if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED)) /* * Sign-extend the value so (int)-EFOO becomes (long)-EFOO * and will match correctly in comparisons. */ error = (long) (int) error; #endif return IS_ERR_VALUE(error) ? error : 0; } static inline long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs) { return regs->ax; } static inline void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val) { regs->ax = (long) error ?: val; } #ifdef CONFIG_X86_32 static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args) { args[0] = regs->bx; args[1] = regs->cx; args[2] = regs->dx; args[3] = regs->si; args[4] = regs->di; args[5] = regs->bp; } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, const unsigned long *args) { regs->bx = args[0]; regs->cx = args[1]; regs->dx = args[2]; regs->si = args[3]; regs->di = args[4]; regs->bp = args[5]; } static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_I386; } #else /* CONFIG_X86_64 */ static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args) { # ifdef CONFIG_IA32_EMULATION if (task->thread_info.status & TS_COMPAT) { *args++ = regs->bx; *args++ = regs->cx; *args++ = regs->dx; *args++ = regs->si; *args++ = regs->di; *args = regs->bp; } else # endif { *args++ = regs->di; *args++ = regs->si; *args++ = regs->dx; *args++ = regs->r10; *args++ = regs->r8; *args = regs->r9; } } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, const unsigned long *args) { # ifdef CONFIG_IA32_EMULATION if (task->thread_info.status & TS_COMPAT) { regs->bx = *args++; regs->cx = *args++; regs->dx = *args++; regs->si = *args++; regs->di = *args++; regs->bp = *args; } else # endif { regs->di = *args++; regs->si = *args++; regs->dx = *args++; regs->r10 = *args++; regs->r8 = *args++; regs->r9 = *args; } } static inline int syscall_get_arch(struct task_struct *task) { /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ return (IS_ENABLED(CONFIG_IA32_EMULATION) && task->thread_info.status & TS_COMPAT) ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } bool do_syscall_64(struct pt_regs *regs, int nr); void do_int80_emulation(struct pt_regs *regs); #endif /* CONFIG_X86_32 */ void do_int80_syscall_32(struct pt_regs *regs); bool do_fast_syscall_32(struct pt_regs *regs); bool do_SYSENTER_32(struct pt_regs *regs); #endif /* _ASM_X86_SYSCALL_H */
5 5 5 6 7 1 1 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 /* * net/tipc/diag.c: TIPC socket diag * * Copyright (c) 2018, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "socket.h" #include <linux/sock_diag.h> #include <linux/tipc_sockets_diag.h> static u64 __tipc_diag_gen_cookie(struct sock *sk) { u32 res[2]; sock_diag_save_cookie(sk, res); return *((u64 *)res); } static int __tipc_add_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk) { struct tipc_sock_diag_req *req = nlmsg_data(cb->nlh); struct nlmsghdr *nlh; int err; nlh = nlmsg_put_answer(skb, cb, SOCK_DIAG_BY_FAMILY, 0, NLM_F_MULTI); if (!nlh) return -EMSGSIZE; err = tipc_sk_fill_sock_diag(skb, cb, tsk, req->tidiag_states, __tipc_diag_gen_cookie); if (err) return err; nlmsg_end(skb, nlh); return 0; } static int tipc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { return tipc_nl_sk_walk(skb, cb, __tipc_add_sock_diag); } static int tipc_sock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct tipc_sock_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = tipc_dump_start, .dump = tipc_diag_dump, .done = tipc_dump_done, }; netlink_dump_start(net->diag_nlsk, skb, h, &c); return 0; } return -EOPNOTSUPP; } static const struct sock_diag_handler tipc_sock_diag_handler = { .owner = THIS_MODULE, .family = AF_TIPC, .dump = tipc_sock_diag_handler_dump, }; static int __init tipc_diag_init(void) { return sock_diag_register(&tipc_sock_diag_handler); } static void __exit tipc_diag_exit(void) { sock_diag_unregister(&tipc_sock_diag_handler); } module_init(tipc_diag_init); module_exit(tipc_diag_exit); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("TIPC socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_TIPC);
6264 6262 6262 6264 5863 5628 5864 1023 1025 1023 1025 642 142 586 611 503 1024 1023 5791 5795 5780 1721 44 1726 5794 5793 525 526 526 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 // SPDX-License-Identifier: GPL-2.0-or-later /* * lib/plist.c * * Descending-priority-sorted double-linked list * * (C) 2002-2003 Intel Corp * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>. * * 2001-2005 (c) MontaVista Software, Inc. * Daniel Walker <dwalker@mvista.com> * * (C) 2005 Thomas Gleixner <tglx@linutronix.de> * * Simplifications of the original code by * Oleg Nesterov <oleg@tv-sign.ru> * * Based on simple lists (include/linux/list.h). * * This file contains the add / del functions which are considered to * be too large to inline. See include/linux/plist.h for further * information. */ #include <linux/bug.h> #include <linux/plist.h> #ifdef CONFIG_DEBUG_PLIST static struct plist_head test_head; static void plist_check_prev_next(struct list_head *t, struct list_head *p, struct list_head *n) { WARN(n->prev != p || p->next != n, "top: %p, n: %p, p: %p\n" "prev: %p, n: %p, p: %p\n" "next: %p, n: %p, p: %p\n", t, t->next, t->prev, p, p->next, p->prev, n, n->next, n->prev); } static void plist_check_list(struct list_head *top) { struct list_head *prev = top, *next = top->next; plist_check_prev_next(top, prev, next); while (next != top) { WRITE_ONCE(prev, next); WRITE_ONCE(next, prev->next); plist_check_prev_next(top, prev, next); } } static void plist_check_head(struct plist_head *head) { if (!plist_head_empty(head)) plist_check_list(&plist_first(head)->prio_list); plist_check_list(&head->node_list); } #else # define plist_check_head(h) do { } while (0) #endif /** * plist_add - add @node to @head * * @node: &struct plist_node pointer * @head: &struct plist_head pointer */ void plist_add(struct plist_node *node, struct plist_head *head) { struct plist_node *first, *iter, *prev = NULL, *last, *reverse_iter; struct list_head *node_next = &head->node_list; plist_check_head(head); WARN_ON(!plist_node_empty(node)); WARN_ON(!list_empty(&node->prio_list)); if (plist_head_empty(head)) goto ins_node; first = iter = plist_first(head); last = reverse_iter = list_entry(first->prio_list.prev, struct plist_node, prio_list); do { if (node->prio < iter->prio) { node_next = &iter->node_list; break; } else if (node->prio >= reverse_iter->prio) { prev = reverse_iter; iter = list_entry(reverse_iter->prio_list.next, struct plist_node, prio_list); if (likely(reverse_iter != last)) node_next = &iter->node_list; break; } prev = iter; iter = list_entry(iter->prio_list.next, struct plist_node, prio_list); reverse_iter = list_entry(reverse_iter->prio_list.prev, struct plist_node, prio_list); } while (iter != first); if (!prev || prev->prio != node->prio) list_add_tail(&node->prio_list, &iter->prio_list); ins_node: list_add_tail(&node->node_list, node_next); plist_check_head(head); } /** * plist_del - Remove a @node from plist. * * @node: &struct plist_node pointer - entry to be removed * @head: &struct plist_head pointer - list head */ void plist_del(struct plist_node *node, struct plist_head *head) { plist_check_head(head); if (!list_empty(&node->prio_list)) { if (node->node_list.next != &head->node_list) { struct plist_node *next; next = list_entry(node->node_list.next, struct plist_node, node_list); /* add the next plist_node into prio_list */ if (list_empty(&next->prio_list)) list_add(&next->prio_list, &node->prio_list); } list_del_init(&node->prio_list); } list_del_init(&node->node_list); plist_check_head(head); } /** * plist_requeue - Requeue @node at end of same-prio entries. * * This is essentially an optimized plist_del() followed by * plist_add(). It moves an entry already in the plist to * after any other same-priority entries. * * @node: &struct plist_node pointer - entry to be moved * @head: &struct plist_head pointer - list head */ void plist_requeue(struct plist_node *node, struct plist_head *head) { struct plist_node *iter; struct list_head *node_next = &head->node_list; plist_check_head(head); BUG_ON(plist_head_empty(head)); BUG_ON(plist_node_empty(node)); if (node == plist_last(head)) return; iter = plist_next(node); if (node->prio != iter->prio) return; plist_del(node, head); /* * After plist_del(), iter is the replacement of the node. If the node * was on prio_list, take shortcut to find node_next instead of looping. */ if (!list_empty(&iter->prio_list)) { iter = list_entry(iter->prio_list.next, struct plist_node, prio_list); node_next = &iter->node_list; goto queue; } plist_for_each_continue(iter, head) { if (node->prio != iter->prio) { node_next = &iter->node_list; break; } } queue: list_add_tail(&node->node_list, node_next); plist_check_head(head); } #ifdef CONFIG_DEBUG_PLIST #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/module.h> #include <linux/init.h> static struct plist_node __initdata test_node[241]; static void __init plist_test_check(int nr_expect) { struct plist_node *first, *prio_pos, *node_pos; if (plist_head_empty(&test_head)) { BUG_ON(nr_expect != 0); return; } prio_pos = first = plist_first(&test_head); plist_for_each(node_pos, &test_head) { if (nr_expect-- < 0) break; if (node_pos == first) continue; if (node_pos->prio == prio_pos->prio) { BUG_ON(!list_empty(&node_pos->prio_list)); continue; } BUG_ON(prio_pos->prio > node_pos->prio); BUG_ON(prio_pos->prio_list.next != &node_pos->prio_list); prio_pos = node_pos; } BUG_ON(nr_expect != 0); BUG_ON(prio_pos->prio_list.next != &first->prio_list); } static void __init plist_test_requeue(struct plist_node *node) { plist_requeue(node, &test_head); if (node != plist_last(&test_head)) BUG_ON(node->prio == plist_next(node)->prio); } static int __init plist_test(void) { int nr_expect = 0, i, loop; unsigned int r = local_clock(); printk(KERN_DEBUG "start plist test\n"); plist_head_init(&test_head); for (i = 0; i < ARRAY_SIZE(test_node); i++) plist_node_init(test_node + i, 0); for (loop = 0; loop < 1000; loop++) { r = r * 193939 % 47629; i = r % ARRAY_SIZE(test_node); if (plist_node_empty(test_node + i)) { r = r * 193939 % 47629; test_node[i].prio = r % 99; plist_add(test_node + i, &test_head); nr_expect++; } else { plist_del(test_node + i, &test_head); nr_expect--; } plist_test_check(nr_expect); if (!plist_node_empty(test_node + i)) { plist_test_requeue(test_node + i); plist_test_check(nr_expect); } } for (i = 0; i < ARRAY_SIZE(test_node); i++) { if (plist_node_empty(test_node + i)) continue; plist_del(test_node + i, &test_head); nr_expect--; plist_test_check(nr_expect); } printk(KERN_DEBUG "end plist test\n"); /* Worst case test for plist_add() */ unsigned int test_data[241]; for (i = 0; i < ARRAY_SIZE(test_data); i++) test_data[i] = i; ktime_t start, end, time_elapsed = 0; plist_head_init(&test_head); for (i = 0; i < ARRAY_SIZE(test_node); i++) { plist_node_init(test_node + i, 0); test_node[i].prio = test_data[i]; } for (i = 0; i < ARRAY_SIZE(test_node); i++) { if (plist_node_empty(test_node + i)) { start = ktime_get(); plist_add(test_node + i, &test_head); end = ktime_get(); time_elapsed += (end - start); } } pr_debug("plist_add worst case test time elapsed %lld\n", time_elapsed); return 0; } module_init(plist_test); #endif
150 150 150 150 150 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 // SPDX-License-Identifier: GPL-2.0 /* Builtin firmware support */ #include <linux/firmware.h> #include "../firmware.h" /* Only if FW_LOADER=y */ #ifdef CONFIG_FW_LOADER struct builtin_fw { char *name; void *data; unsigned long size; }; extern struct builtin_fw __start_builtin_fw[]; extern struct builtin_fw __end_builtin_fw[]; static bool fw_copy_to_prealloc_buf(struct firmware *fw, void *buf, size_t size) { if (!buf) return true; if (size < fw->size) return false; memcpy(buf, fw->data, fw->size); return true; } /** * firmware_request_builtin() - load builtin firmware * @fw: pointer to firmware struct * @name: name of firmware file * * Some use cases in the kernel have a requirement so that no memory allocator * is involved as these calls take place early in boot process. An example is * the x86 CPU microcode loader. In these cases all the caller wants is to see * if the firmware was built-in and if so use it right away. This can be used * for such cases. * * This looks for the firmware in the built-in kernel. Only if the kernel was * built-in with the firmware you are looking for will this return successfully. * * Callers of this API do not need to use release_firmware() as the pointer to * the firmware is expected to be provided locally on the stack of the caller. **/ bool firmware_request_builtin(struct firmware *fw, const char *name) { struct builtin_fw *b_fw; if (!fw) return false; for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { if (strcmp(name, b_fw->name) == 0) { fw->size = b_fw->size; fw->data = b_fw->data; return true; } } return false; } EXPORT_SYMBOL_NS_GPL(firmware_request_builtin, "TEST_FIRMWARE"); /** * firmware_request_builtin_buf() - load builtin firmware into optional buffer * @fw: pointer to firmware struct * @name: name of firmware file * @buf: If set this lets you use a pre-allocated buffer so that the built-in * firmware into is copied into. This field can be NULL. It is used by * callers such as request_firmware_into_buf() and * request_partial_firmware_into_buf() * @size: if buf was provided, the max size of the allocated buffer available. * If the built-in firmware does not fit into the pre-allocated @buf this * call will fail. * * This looks for the firmware in the built-in kernel. Only if the kernel was * built-in with the firmware you are looking for will this call possibly * succeed. If you passed a @buf the firmware will be copied into it *iff* the * built-in firmware fits into the pre-allocated buffer size specified in * @size. * * This caller is to be used internally by the firmware_loader only. **/ bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size) { if (!firmware_request_builtin(fw, name)) return false; return fw_copy_to_prealloc_buf(fw, buf, size); } bool firmware_is_builtin(const struct firmware *fw) { struct builtin_fw *b_fw; for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) if (fw->data == b_fw->data) return true; return false; } #endif
99 1625 77 1625 1623 1173 14 1244 1145 1143 798 802 802 799 450 2 801 452 453 453 801 801 802 801 802 798 18 42 2 28 11 453 305 765 38 349 453 802 801 799 799 802 802 802 492 722 722 722 192 529 723 596 617 1173 1094 79 1 78 64 12 4 4 1 26 1143 1171 1709 2 1713 149 149 164 2 2 1 1 1 149 1 9 149 1 1 150 1 149 1712 1714 9 44 44 10 1141 1133 7 5 17 1 538 523 1 1 1 1 1 9 1 1 8 433 6 427 557 2 10 12 536 534 3 370 25 44 54 46 227 308 25 511 7 497 34 3 245 189 405 431 2 430 442 2 998 997 1 1 1 1 1 671 668 1 1026 2 1 673 138 213 8 2 1 1 6 3 1000 5 2 894 91 3 103 98 159 835 797 192 9 23 640 161 514 674 2 1 1 105 3 1 6 94 1 1 86 1 5 67 24 2 2 11 1 1 1 1 3 2 5 5 1 3 3 176 1 2 3 1 165 1 4 164 1 7 1 3 3 2 47 2 20 17 1 6 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/vmalloc.h> #include <linux/etherdevice.h> #include <linux/filter.h> #include <linux/rcupdate_trace.h> #include <linux/sched/signal.h> #include <net/bpf_sk_storage.h> #include <net/hotdata.h> #include <net/sock.h> #include <net/tcp.h> #include <net/net_namespace.h> #include <net/page_pool/helpers.h> #include <linux/error-injection.h> #include <linux/smp.h> #include <linux/sock_diag.h> #include <linux/netfilter.h> #include <net/netdev_rx_queue.h> #include <net/xdp.h> #include <net/netfilter/nf_bpf_link.h> #define CREATE_TRACE_POINTS #include <trace/events/bpf_test_run.h> struct bpf_test_timer { u32 i; u64 time_start, time_spent; }; static void bpf_test_timer_enter(struct bpf_test_timer *t) __acquires(rcu) { rcu_read_lock_dont_migrate(); t->time_start = ktime_get_ns(); } static void bpf_test_timer_leave(struct bpf_test_timer *t) __releases(rcu) { t->time_start = 0; rcu_read_unlock_migrate(); } static bool bpf_test_timer_continue(struct bpf_test_timer *t, int iterations, u32 repeat, int *err, u32 *duration) __must_hold(rcu) { t->i += iterations; if (t->i >= repeat) { /* We're done. */ t->time_spent += ktime_get_ns() - t->time_start; do_div(t->time_spent, t->i); *duration = t->time_spent > U32_MAX ? U32_MAX : (u32)t->time_spent; *err = 0; goto reset; } if (signal_pending(current)) { /* During iteration: we've been cancelled, abort. */ *err = -EINTR; goto reset; } if (need_resched()) { /* During iteration: we need to reschedule between runs. */ t->time_spent += ktime_get_ns() - t->time_start; bpf_test_timer_leave(t); cond_resched(); bpf_test_timer_enter(t); } /* Do another round. */ return true; reset: t->i = 0; return false; } /* We put this struct at the head of each page with a context and frame * initialised when the page is allocated, so we don't have to do this on each * repetition of the test run. */ struct xdp_page_head { struct xdp_buff orig_ctx; struct xdp_buff ctx; union { /* ::data_hard_start starts here */ DECLARE_FLEX_ARRAY(struct xdp_frame, frame); DECLARE_FLEX_ARRAY(u8, data); }; }; struct xdp_test_data { struct xdp_buff *orig_ctx; struct xdp_rxq_info rxq; struct net_device *dev; struct page_pool *pp; struct xdp_frame **frames; struct sk_buff **skbs; struct xdp_mem_info mem; u32 batch_size; u32 frame_cnt; }; /* tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c:%MAX_PKT_SIZE * must be updated accordingly this gets changed, otherwise BPF selftests * will fail. */ #define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head)) #define TEST_XDP_MAX_BATCH 256 static void xdp_test_run_init_page(netmem_ref netmem, void *arg) { struct xdp_page_head *head = phys_to_virt(page_to_phys(netmem_to_page(netmem))); struct xdp_buff *new_ctx, *orig_ctx; u32 headroom = XDP_PACKET_HEADROOM; struct xdp_test_data *xdp = arg; size_t frm_len, meta_len; struct xdp_frame *frm; void *data; orig_ctx = xdp->orig_ctx; frm_len = orig_ctx->data_end - orig_ctx->data_meta; meta_len = orig_ctx->data - orig_ctx->data_meta; headroom -= meta_len; new_ctx = &head->ctx; frm = head->frame; data = head->data; memcpy(data + headroom, orig_ctx->data_meta, frm_len); xdp_init_buff(new_ctx, TEST_XDP_FRAME_SIZE, &xdp->rxq); xdp_prepare_buff(new_ctx, data, headroom, frm_len, true); new_ctx->data = new_ctx->data_meta + meta_len; xdp_update_frame_from_buff(new_ctx, frm); frm->mem_type = new_ctx->rxq->mem.type; memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx)); } static int xdp_test_run_setup(struct xdp_test_data *xdp, struct xdp_buff *orig_ctx) { struct page_pool *pp; int err = -ENOMEM; struct page_pool_params pp_params = { .order = 0, .flags = 0, .pool_size = xdp->batch_size, .nid = NUMA_NO_NODE, .init_callback = xdp_test_run_init_page, .init_arg = xdp, }; xdp->frames = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); if (!xdp->frames) return -ENOMEM; xdp->skbs = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); if (!xdp->skbs) goto err_skbs; pp = page_pool_create(&pp_params); if (IS_ERR(pp)) { err = PTR_ERR(pp); goto err_pp; } /* will copy 'mem.id' into pp->xdp_mem_id */ err = xdp_reg_mem_model(&xdp->mem, MEM_TYPE_PAGE_POOL, pp); if (err) goto err_mmodel; xdp->pp = pp; /* We create a 'fake' RXQ referencing the original dev, but with an * xdp_mem_info pointing to our page_pool */ xdp_rxq_info_reg(&xdp->rxq, orig_ctx->rxq->dev, 0, 0); xdp->rxq.mem.type = MEM_TYPE_PAGE_POOL; xdp->rxq.mem.id = pp->xdp_mem_id; xdp->dev = orig_ctx->rxq->dev; xdp->orig_ctx = orig_ctx; return 0; err_mmodel: page_pool_destroy(pp); err_pp: kvfree(xdp->skbs); err_skbs: kvfree(xdp->frames); return err; } static void xdp_test_run_teardown(struct xdp_test_data *xdp) { xdp_unreg_mem_model(&xdp->mem); page_pool_destroy(xdp->pp); kfree(xdp->frames); kfree(xdp->skbs); } static bool frame_was_changed(const struct xdp_page_head *head) { /* xdp_scrub_frame() zeroes the data pointer, flags is the last field, * i.e. has the highest chances to be overwritten. If those two are * untouched, it's most likely safe to skip the context reset. */ return head->frame->data != head->orig_ctx.data || head->frame->flags != head->orig_ctx.flags; } static bool ctx_was_changed(struct xdp_page_head *head) { return head->orig_ctx.data != head->ctx.data || head->orig_ctx.data_meta != head->ctx.data_meta || head->orig_ctx.data_end != head->ctx.data_end; } static void reset_ctx(struct xdp_page_head *head) { if (likely(!frame_was_changed(head) && !ctx_was_changed(head))) return; head->ctx.data = head->orig_ctx.data; head->ctx.data_meta = head->orig_ctx.data_meta; head->ctx.data_end = head->orig_ctx.data_end; xdp_update_frame_from_buff(&head->ctx, head->frame); head->frame->mem_type = head->orig_ctx.rxq->mem.type; } static int xdp_recv_frames(struct xdp_frame **frames, int nframes, struct sk_buff **skbs, struct net_device *dev) { gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; int i, n; LIST_HEAD(list); n = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, nframes, (void **)skbs); if (unlikely(n == 0)) { for (i = 0; i < nframes; i++) xdp_return_frame(frames[i]); return -ENOMEM; } for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; skb = __xdp_build_skb_from_frame(xdpf, skb, dev); if (!skb) { xdp_return_frame(xdpf); continue; } list_add_tail(&skb->list, &list); } netif_receive_skb_list(&list); return 0; } static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, u32 repeat) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int err = 0, act, ret, i, nframes = 0, batch_sz; struct xdp_frame **frames = xdp->frames; struct bpf_redirect_info *ri; struct xdp_page_head *head; struct xdp_frame *frm; bool redirect = false; struct xdp_buff *ctx; struct page *page; batch_sz = min_t(u32, repeat, xdp->batch_size); local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); ri = bpf_net_ctx_get_ri(); xdp_set_return_frame_no_direct(); for (i = 0; i < batch_sz; i++) { page = page_pool_dev_alloc_pages(xdp->pp); if (!page) { err = -ENOMEM; goto out; } head = phys_to_virt(page_to_phys(page)); reset_ctx(head); ctx = &head->ctx; frm = head->frame; xdp->frame_cnt++; act = bpf_prog_run_xdp(prog, ctx); /* if program changed pkt bounds we need to update the xdp_frame */ if (unlikely(ctx_was_changed(head))) { ret = xdp_update_frame_from_buff(ctx, frm); if (ret) { xdp_return_buff(ctx); continue; } } switch (act) { case XDP_TX: /* we can't do a real XDP_TX since we're not in the * driver, so turn it into a REDIRECT back to the same * index */ ri->tgt_index = xdp->dev->ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; fallthrough; case XDP_REDIRECT: redirect = true; ret = xdp_do_redirect_frame(xdp->dev, ctx, frm, prog); if (ret) xdp_return_buff(ctx); break; case XDP_PASS: frames[nframes++] = frm; break; default: bpf_warn_invalid_xdp_action(NULL, prog, act); fallthrough; case XDP_DROP: xdp_return_buff(ctx); break; } } out: if (redirect) xdp_do_flush(); if (nframes) { ret = xdp_recv_frames(frames, nframes, xdp->skbs, xdp->dev); if (ret) err = ret; } xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); return err; } static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx, u32 repeat, u32 batch_size, u32 *time) { struct xdp_test_data xdp = { .batch_size = batch_size }; struct bpf_test_timer t = {}; int ret; if (!repeat) repeat = 1; ret = xdp_test_run_setup(&xdp, ctx); if (ret) return ret; bpf_test_timer_enter(&t); do { xdp.frame_cnt = 0; ret = xdp_test_run_batch(&xdp, prog, repeat - t.i); if (unlikely(ret < 0)) break; } while (bpf_test_timer_continue(&t, xdp.frame_cnt, repeat, &ret, time)); bpf_test_timer_leave(&t); xdp_test_run_teardown(&xdp); return ret; } static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct bpf_prog_array_item item = {.prog = prog}; struct bpf_run_ctx *old_ctx; struct bpf_cg_run_ctx run_ctx; struct bpf_test_timer t = {}; enum bpf_cgroup_storage_type stype; int ret; for_each_cgroup_storage_type(stype) { item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(item.cgroup_storage[stype])) { item.cgroup_storage[stype] = NULL; for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(item.cgroup_storage[stype]); return -ENOMEM; } } if (!repeat) repeat = 1; bpf_test_timer_enter(&t); old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); do { run_ctx.prog_item = &item; local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = bpf_prog_run(prog, ctx); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time)); bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(item.cgroup_storage[stype]); return ret; } static int bpf_test_finish(const union bpf_attr *kattr, union bpf_attr __user *uattr, const void *data, struct skb_shared_info *sinfo, u32 size, u32 retval, u32 duration) { void __user *data_out = u64_to_user_ptr(kattr->test.data_out); int err = -EFAULT; u32 copy_size = size; /* Clamp copy if the user has provided a size hint, but copy the full * buffer if not to retain old behaviour. */ if (kattr->test.data_size_out && copy_size > kattr->test.data_size_out) { copy_size = kattr->test.data_size_out; err = -ENOSPC; } if (data_out) { int len = sinfo ? copy_size - sinfo->xdp_frags_size : copy_size; if (len < 0) { err = -ENOSPC; goto out; } if (copy_to_user(data_out, data, len)) goto out; if (sinfo) { int i, offset = len; u32 data_len; for (i = 0; i < sinfo->nr_frags; i++) { skb_frag_t *frag = &sinfo->frags[i]; if (offset >= copy_size) { err = -ENOSPC; break; } data_len = min_t(u32, copy_size - offset, skb_frag_size(frag)); if (copy_to_user(data_out + offset, skb_frag_address(frag), data_len)) goto out; offset += data_len; } } } if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size))) goto out; if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) goto out; if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration))) goto out; if (err != -ENOSPC) err = 0; out: trace_bpf_test_finish(&err); return err; } /* Integer types of various sizes and pointer combinations cover variety of * architecture dependent calling conventions. 7+ can be supported in the * future. */ __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_fentry_test1(int a) { return a + 1; } EXPORT_SYMBOL_GPL(bpf_fentry_test1); noinline int bpf_fentry_test2(int a, u64 b) { return a + b; } noinline int bpf_fentry_test3(char a, int b, u64 c) { return a + b + c; } noinline int bpf_fentry_test4(void *a, char b, int c, u64 d) { return (long)a + b + c + d; } noinline int bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) { return a + (long)b + c + d + e; } noinline int bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) { return a + (long)b + c + d + (long)e + f; } struct bpf_fentry_test_t { struct bpf_fentry_test_t *a; }; noinline int bpf_fentry_test7(struct bpf_fentry_test_t *arg) { asm volatile ("" : "+r"(arg)); return (long)arg; } noinline int bpf_fentry_test8(struct bpf_fentry_test_t *arg) { return (long)arg->a; } __bpf_kfunc u32 bpf_fentry_test9(u32 *a) { return *a; } noinline int bpf_fentry_test10(const void *a) { return (long)a; } noinline void bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) { } __bpf_kfunc int bpf_modify_return_test(int a, int *b) { *b += 1; return a + *b; } __bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, void *e, char f, int g) { *b += 1; return a + *b + c + d + (long)e + f + g; } __bpf_kfunc int bpf_modify_return_test_tp(int nonce) { trace_bpf_trigger_tp(nonce); return nonce; } noinline int bpf_fentry_shadow_test(int a) { return a + 1; } struct prog_test_member1 { int a; }; struct prog_test_member { struct prog_test_member1 m; int c; }; struct prog_test_ref_kfunc { int a; int b; struct prog_test_member memb; struct prog_test_ref_kfunc *next; refcount_t cnt; }; __bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) { refcount_dec(&p->cnt); } __bpf_kfunc void bpf_kfunc_call_test_release_dtor(void *p) { bpf_kfunc_call_test_release(p); } CFI_NOSEAL(bpf_kfunc_call_test_release_dtor); __bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p) { } __bpf_kfunc void bpf_kfunc_call_memb_release_dtor(void *p) { } CFI_NOSEAL(bpf_kfunc_call_memb_release_dtor); __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) BTF_ID_FLAGS(func, bpf_modify_return_test2) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_test_modify_return_ids) static const struct btf_kfunc_id_set bpf_test_modify_return_set = { .owner = THIS_MODULE, .set = &bpf_test_modify_return_ids, }; BTF_KFUNCS_START(test_sk_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE) BTF_KFUNCS_END(test_sk_check_kfunc_ids) static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, u32 size, u32 headroom, u32 tailroom) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); void *data; if (user_size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); size = SKB_DATA_ALIGN(size); data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); if (copy_from_user(data + headroom, data_in, user_size)) { kfree(data); return ERR_PTR(-EFAULT); } return data; } int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct bpf_fentry_test_t arg = {}; u16 side_effect = 0, ret = 0; int b = 2, err = -EFAULT; u32 retval = 0; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 || bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 || bpf_fentry_test8(&arg) != 0 || bpf_fentry_test9(&retval) != 0 || bpf_fentry_test10((void *)0) != 0) goto out; break; case BPF_MODIFY_RETURN: ret = bpf_modify_return_test(1, &b); if (b != 2) side_effect++; b = 2; ret += bpf_modify_return_test2(1, &b, 3, 4, (void *)5, 6, 7); if (b != 2) side_effect++; break; default: goto out; } retval = ((u32)side_effect << 16) | ret; if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) goto out; err = 0; out: trace_bpf_test_finish(&err); return err; } struct bpf_raw_tp_test_run_info { struct bpf_prog *prog; void *ctx; u32 retval; }; static void __bpf_prog_test_run_raw_tp(void *data) { struct bpf_raw_tp_test_run_info *info = data; struct bpf_trace_run_ctx run_ctx = {}; struct bpf_run_ctx *old_run_ctx; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); rcu_read_lock(); info->retval = bpf_prog_run(info->prog, info->ctx); rcu_read_unlock(); bpf_reset_run_ctx(old_run_ctx); } int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); __u32 ctx_size_in = kattr->test.ctx_size_in; struct bpf_raw_tp_test_run_info info; int cpu = kattr->test.cpu, err = 0; int current_cpu; /* doesn't support data_in/out, ctx_out, duration, or repeat */ if (kattr->test.data_in || kattr->test.data_out || kattr->test.ctx_out || kattr->test.duration || kattr->test.repeat || kattr->test.batch_size) return -EINVAL; if (ctx_size_in < prog->aux->max_ctx_offset || ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64)) return -EINVAL; if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) return -EINVAL; if (ctx_size_in) { info.ctx = memdup_user(ctx_in, ctx_size_in); if (IS_ERR(info.ctx)) return PTR_ERR(info.ctx); } else { info.ctx = NULL; } info.prog = prog; current_cpu = get_cpu(); if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 || cpu == current_cpu) { __bpf_prog_test_run_raw_tp(&info); } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { /* smp_call_function_single() also checks cpu_online() * after csd_lock(). However, since cpu is from user * space, let's do an extra quick check to filter out * invalid value before smp_call_function_single(). */ err = -ENXIO; } else { err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp, &info, 1); } put_cpu(); if (!err && copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32))) err = -EFAULT; kfree(info.ctx); return err; } static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) { void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); u32 size = kattr->test.ctx_size_in; void *data; int err; if (!data_in && !data_out) return NULL; data = kzalloc(max_size, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); if (data_in) { err = bpf_check_uarg_tail_zero(USER_BPFPTR(data_in), max_size, size); if (err) { kfree(data); return ERR_PTR(err); } size = min_t(u32, max_size, size); if (copy_from_user(data, data_in, size)) { kfree(data); return ERR_PTR(-EFAULT); } } return data; } static int bpf_ctx_finish(const union bpf_attr *kattr, union bpf_attr __user *uattr, const void *data, u32 size) { void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); int err = -EFAULT; u32 copy_size = size; if (!data || !data_out) return 0; if (copy_size > kattr->test.ctx_size_out) { copy_size = kattr->test.ctx_size_out; err = -ENOSPC; } if (copy_to_user(data_out, data, copy_size)) goto out; if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size))) goto out; if (err != -ENOSPC) err = 0; out: return err; } /** * range_is_zero - test whether buffer is initialized * @buf: buffer to check * @from: check from this position * @to: check up until (excluding) this position * * This function returns true if the there is a non-zero byte * in the buf in the range [from,to). */ static inline bool range_is_zero(void *buf, size_t from, size_t to) { return !memchr_inv((u8 *)buf + from, 0, to - from); } static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; if (!__skb) return 0; /* make sure the fields we don't use are zeroed */ if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark))) return -EINVAL; /* mark is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark), offsetof(struct __sk_buff, priority))) return -EINVAL; /* priority is allowed */ /* ingress_ifindex is allowed */ /* ifindex is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, ifindex), offsetof(struct __sk_buff, cb))) return -EINVAL; /* cb is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), offsetof(struct __sk_buff, tstamp))) return -EINVAL; /* tstamp is allowed */ /* wire_len is allowed */ /* gso_segs is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), offsetof(struct __sk_buff, gso_size))) return -EINVAL; /* gso_size is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_size), offsetof(struct __sk_buff, hwtstamp))) return -EINVAL; /* hwtstamp is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, hwtstamp), sizeof(struct __sk_buff))) return -EINVAL; skb->mark = __skb->mark; skb->priority = __skb->priority; skb->skb_iif = __skb->ingress_ifindex; skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); if (__skb->wire_len == 0) { cb->pkt_len = skb->len; } else { if (__skb->wire_len < skb->len || __skb->wire_len > GSO_LEGACY_MAX_SIZE) return -EINVAL; cb->pkt_len = __skb->wire_len; } if (__skb->gso_segs > GSO_MAX_SEGS) return -EINVAL; skb_shinfo(skb)->gso_segs = __skb->gso_segs; skb_shinfo(skb)->gso_size = __skb->gso_size; skb_shinfo(skb)->hwtstamps.hwtstamp = __skb->hwtstamp; return 0; } static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; if (!__skb) return; __skb->mark = skb->mark; __skb->priority = skb->priority; __skb->ingress_ifindex = skb->skb_iif; __skb->ifindex = skb->dev->ifindex; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); __skb->wire_len = cb->pkt_len; __skb->gso_segs = skb_shinfo(skb)->gso_segs; __skb->hwtstamp = skb_shinfo(skb)->hwtstamps.hwtstamp; } static struct proto bpf_dummy_proto = { .name = "bpf_dummy", .owner = THIS_MODULE, .obj_size = sizeof(struct sock), }; int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool is_l2 = false, is_direct_pkt_access = false; struct net *net = current->nsproxy->net_ns; struct net_device *dev = net->loopback_dev; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct __sk_buff *ctx = NULL; u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; struct sock *sk; void *data; int ret; if ((kattr->test.flags & ~BPF_F_TEST_SKB_CHECKSUM_COMPLETE) || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (size < ETH_HLEN) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) return PTR_ERR(data); ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); if (IS_ERR(ctx)) { kfree(data); return PTR_ERR(ctx); } switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: is_l2 = true; fallthrough; case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_CGROUP_SKB: is_direct_pkt_access = true; break; default: break; } sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); if (!sk) { kfree(data); kfree(ctx); return -ENOMEM; } sock_init_data(NULL, sk); skb = slab_build_skb(data); if (!skb) { kfree(data); kfree(ctx); sk_free(sk); return -ENOMEM; } skb->sk = sk; skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); if (ctx && ctx->ifindex > 1) { dev = dev_get_by_index(net, ctx->ifindex); if (!dev) { ret = -ENODEV; goto out; } } skb->protocol = eth_type_trans(skb, dev); skb_reset_network_header(skb); switch (skb->protocol) { case htons(ETH_P_IP): sk->sk_family = AF_INET; if (sizeof(struct iphdr) <= skb_headlen(skb)) { sk->sk_rcv_saddr = ip_hdr(skb)->saddr; sk->sk_daddr = ip_hdr(skb)->daddr; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): sk->sk_family = AF_INET6; if (sizeof(struct ipv6hdr) <= skb_headlen(skb)) { sk->sk_v6_rcv_saddr = ipv6_hdr(skb)->saddr; sk->sk_v6_daddr = ipv6_hdr(skb)->daddr; } break; #endif default: break; } if (is_l2) __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); ret = convert___skb_to_skb(skb, ctx); if (ret) goto out; if (kattr->test.flags & BPF_F_TEST_SKB_CHECKSUM_COMPLETE) { const int off = skb_network_offset(skb); int len = skb->len - off; skb->csum = skb_checksum(skb, off, len, 0); skb->ip_summed = CHECKSUM_COMPLETE; } ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false); if (ret) goto out; if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { ret = -ENOMEM; goto out; } } memset(__skb_push(skb, hh_len), 0, hh_len); } if (kattr->test.flags & BPF_F_TEST_SKB_CHECKSUM_COMPLETE) { const int off = skb_network_offset(skb); int len = skb->len - off; __wsum csum; csum = skb_checksum(skb, off, len, 0); if (csum_fold(skb->csum) != csum_fold(csum)) { ret = -EBADMSG; goto out; } } convert_skb_to___skb(skb, ctx); size = skb->len; /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) size = skb_headlen(skb); ret = bpf_test_finish(kattr, uattr, skb->data, NULL, size, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct __sk_buff)); out: if (dev && dev != net->loopback_dev) dev_put(dev); kfree_skb(skb); sk_free(sk); kfree(ctx); return ret; } static int xdp_convert_md_to_buff(struct xdp_md *xdp_md, struct xdp_buff *xdp) { unsigned int ingress_ifindex, rx_queue_index; struct netdev_rx_queue *rxqueue; struct net_device *device; if (!xdp_md) return 0; if (xdp_md->egress_ifindex != 0) return -EINVAL; ingress_ifindex = xdp_md->ingress_ifindex; rx_queue_index = xdp_md->rx_queue_index; if (!ingress_ifindex && rx_queue_index) return -EINVAL; if (ingress_ifindex) { device = dev_get_by_index(current->nsproxy->net_ns, ingress_ifindex); if (!device) return -ENODEV; if (rx_queue_index >= device->real_num_rx_queues) goto free_dev; rxqueue = __netif_get_rx_queue(device, rx_queue_index); if (!xdp_rxq_info_is_reg(&rxqueue->xdp_rxq)) goto free_dev; xdp->rxq = &rxqueue->xdp_rxq; /* The device is now tracked in the xdp->rxq for later * dev_put() */ } xdp->data = xdp->data_meta + xdp_md->data; return 0; free_dev: dev_put(device); return -EINVAL; } static void xdp_convert_buff_to_md(struct xdp_buff *xdp, struct xdp_md *xdp_md) { if (!xdp_md) return; xdp_md->data = xdp->data - xdp->data_meta; xdp_md->data_end = xdp->data_end - xdp->data_meta; if (xdp_md->ingress_ifindex) dev_put(xdp->rxq->dev); } int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); u32 retval = 0, meta_sz = 0, duration, max_linear_sz, size; u32 linear_sz = kattr->test.data_size_in; u32 batch_size = kattr->test.batch_size; u32 headroom = XDP_PACKET_HEADROOM; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; struct skb_shared_info *sinfo; struct xdp_buff xdp = {}; int i, ret = -EINVAL; struct xdp_md *ctx; void *data; if (prog->expected_attach_type == BPF_XDP_DEVMAP || prog->expected_attach_type == BPF_XDP_CPUMAP) return -EINVAL; if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES) return -EINVAL; if (bpf_prog_is_dev_bound(prog->aux)) return -EINVAL; if (do_live) { if (!batch_size) batch_size = NAPI_POLL_WEIGHT; else if (batch_size > TEST_XDP_MAX_BATCH) return -E2BIG; headroom += sizeof(struct xdp_page_head); } else if (batch_size) { return -EINVAL; } ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md)); if (IS_ERR(ctx)) return PTR_ERR(ctx); if (ctx) { /* There can't be user provided data before the meta data */ if (ctx->data_meta || ctx->data_end > kattr->test.data_size_in || ctx->data > ctx->data_end || unlikely(xdp_metalen_invalid(ctx->data)) || (do_live && (kattr->test.data_out || kattr->test.ctx_out))) goto free_ctx; /* Meta data is allocated from the headroom */ headroom -= ctx->data; meta_sz = ctx->data; linear_sz = ctx->data_end; } max_linear_sz = PAGE_SIZE - headroom - tailroom; linear_sz = min_t(u32, linear_sz, max_linear_sz); /* disallow live data mode for jumbo frames */ if (do_live && kattr->test.data_size_in > linear_sz) goto free_ctx; if (kattr->test.data_size_in - meta_sz < ETH_HLEN) goto free_ctx; data = bpf_test_init(kattr, linear_sz, max_linear_sz, headroom, tailroom); if (IS_ERR(data)) { ret = PTR_ERR(data); goto free_ctx; } rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); rxqueue->xdp_rxq.frag_size = PAGE_SIZE; xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq); xdp_prepare_buff(&xdp, data, headroom, linear_sz, true); sinfo = xdp_get_shared_info_from_buff(&xdp); ret = xdp_convert_md_to_buff(ctx, &xdp); if (ret) goto free_data; size = linear_sz; if (unlikely(kattr->test.data_size_in > size)) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); while (size < kattr->test.data_size_in) { struct page *page; skb_frag_t *frag; u32 data_len; if (sinfo->nr_frags == MAX_SKB_FRAGS) { ret = -ENOMEM; goto out; } page = alloc_page(GFP_KERNEL); if (!page) { ret = -ENOMEM; goto out; } frag = &sinfo->frags[sinfo->nr_frags++]; data_len = min_t(u32, kattr->test.data_size_in - size, PAGE_SIZE); skb_frag_fill_page_desc(frag, page, 0, data_len); if (copy_from_user(page_address(page), data_in + size, data_len)) { ret = -EFAULT; goto out; } sinfo->xdp_frags_size += data_len; size += data_len; } xdp_buff_set_frags_flag(&xdp); } if (repeat > 1) bpf_prog_change_xdp(NULL, prog); if (do_live) ret = bpf_test_run_xdp_live(prog, &xdp, repeat, batch_size, &duration); else ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); /* We convert the xdp_buff back to an xdp_md before checking the return * code so the reference count of any held netdevice will be decremented * even if the test run failed. */ xdp_convert_buff_to_md(&xdp, ctx); if (ret) goto out; size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size; ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct xdp_md)); out: if (repeat > 1) bpf_prog_change_xdp(prog, NULL); free_data: for (i = 0; i < sinfo->nr_frags; i++) __free_page(skb_frag_page(&sinfo->frags[i])); kfree(data); free_ctx: kfree(ctx); return ret; } static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) { /* make sure the fields we don't use are zeroed */ if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags))) return -EINVAL; /* flags is allowed */ if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags), sizeof(struct bpf_flow_keys))) return -EINVAL; return 0; } int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct bpf_test_timer t = {}; u32 size = kattr->test.data_size_in; struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_flow_keys *user_ctx; struct bpf_flow_keys flow_keys; const struct ethhdr *eth; unsigned int flags = 0; u32 retval, duration; void *data; int ret; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (size < ETH_HLEN) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, size, 0, 0); if (IS_ERR(data)) return PTR_ERR(data); eth = (struct ethhdr *)data; if (!repeat) repeat = 1; user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys)); if (IS_ERR(user_ctx)) { kfree(data); return PTR_ERR(user_ctx); } if (user_ctx) { ret = verify_user_bpf_flow_keys(user_ctx); if (ret) goto out; flags = user_ctx->flags; } ctx.flow_keys = &flow_keys; ctx.data = data; ctx.data_end = (__u8 *)data + size; bpf_test_timer_enter(&t); do { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, size, flags); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); bpf_test_timer_leave(&t); if (ret < 0) goto out; ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL, sizeof(flow_keys), retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(struct bpf_flow_keys)); out: kfree(user_ctx); kfree(data); return ret; } int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct bpf_test_timer t = {}; struct bpf_prog_array *progs = NULL; struct bpf_sk_lookup_kern ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_sk_lookup *user_ctx; u32 retval, duration; int ret = -EINVAL; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || kattr->test.data_size_out) return -EINVAL; if (!repeat) repeat = 1; user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx)); if (IS_ERR(user_ctx)) return PTR_ERR(user_ctx); if (!user_ctx) return -EINVAL; if (user_ctx->sk) goto out; if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) goto out; if (user_ctx->local_port > U16_MAX) { ret = -ERANGE; goto out; } ctx.family = (u16)user_ctx->family; ctx.protocol = (u16)user_ctx->protocol; ctx.dport = (u16)user_ctx->local_port; ctx.sport = user_ctx->remote_port; switch (ctx.family) { case AF_INET: ctx.v4.daddr = (__force __be32)user_ctx->local_ip4; ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6; ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6; break; #endif default: ret = -EAFNOSUPPORT; goto out; } progs = bpf_prog_array_alloc(1, GFP_KERNEL); if (!progs) { ret = -ENOMEM; goto out; } progs->items[0].prog = prog; bpf_test_timer_enter(&t); do { ctx.selected_sk = NULL; retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); bpf_test_timer_leave(&t); if (ret < 0) goto out; user_ctx->cookie = 0; if (ctx.selected_sk) { if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) { ret = -EOPNOTSUPP; goto out; } user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); } ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); out: bpf_prog_array_free(progs); kfree(user_ctx); return ret; } int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); __u32 ctx_size_in = kattr->test.ctx_size_in; void *ctx = NULL; u32 retval; int err = 0; /* doesn't support data_in/out, ctx_out, duration, or repeat or flags */ if (kattr->test.data_in || kattr->test.data_out || kattr->test.ctx_out || kattr->test.duration || kattr->test.repeat || kattr->test.flags || kattr->test.batch_size) return -EINVAL; if (ctx_size_in < prog->aux->max_ctx_offset || ctx_size_in > U16_MAX) return -EINVAL; if (ctx_size_in) { ctx = memdup_user(ctx_in, ctx_size_in); if (IS_ERR(ctx)) return PTR_ERR(ctx); } rcu_read_lock_trace(); retval = bpf_prog_run_pin_on_cpu(prog, ctx); rcu_read_unlock_trace(); if (copy_to_user(&uattr->test.retval, &retval, sizeof(u32))) { err = -EFAULT; goto out; } if (ctx_size_in) if (copy_to_user(ctx_in, ctx, ctx_size_in)) err = -EFAULT; out: kfree(ctx); return err; } static int verify_and_copy_hook_state(struct nf_hook_state *state, const struct nf_hook_state *user, struct net_device *dev) { if (user->in || user->out) return -EINVAL; if (user->net || user->sk || user->okfn) return -EINVAL; switch (user->pf) { case NFPROTO_IPV4: case NFPROTO_IPV6: switch (state->hook) { case NF_INET_PRE_ROUTING: state->in = dev; break; case NF_INET_LOCAL_IN: state->in = dev; break; case NF_INET_FORWARD: state->in = dev; state->out = dev; break; case NF_INET_LOCAL_OUT: state->out = dev; break; case NF_INET_POST_ROUTING: state->out = dev; break; } break; default: return -EINVAL; } state->pf = user->pf; state->hook = user->hook; return 0; } static __be16 nfproto_eth(int nfproto) { switch (nfproto) { case NFPROTO_IPV4: return htons(ETH_P_IP); case NFPROTO_IPV6: break; } return htons(ETH_P_IPV6); } int bpf_prog_test_run_nf(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct net *net = current->nsproxy->net_ns; struct net_device *dev = net->loopback_dev; struct nf_hook_state *user_ctx, hook_state = { .pf = NFPROTO_IPV4, .hook = NF_INET_LOCAL_OUT, }; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct bpf_nf_ctx ctx = { .state = &hook_state, }; struct sk_buff *skb = NULL; u32 retval, duration; void *data; int ret; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (size < sizeof(struct iphdr)) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) return PTR_ERR(data); if (!repeat) repeat = 1; user_ctx = bpf_ctx_init(kattr, sizeof(struct nf_hook_state)); if (IS_ERR(user_ctx)) { kfree(data); return PTR_ERR(user_ctx); } if (user_ctx) { ret = verify_and_copy_hook_state(&hook_state, user_ctx, dev); if (ret) goto out; } skb = slab_build_skb(data); if (!skb) { ret = -ENOMEM; goto out; } data = NULL; /* data released via kfree_skb */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); ret = -EINVAL; if (hook_state.hook != NF_INET_LOCAL_OUT) { if (size < ETH_HLEN + sizeof(struct iphdr)) goto out; skb->protocol = eth_type_trans(skb, dev); switch (skb->protocol) { case htons(ETH_P_IP): if (hook_state.pf == NFPROTO_IPV4) break; goto out; case htons(ETH_P_IPV6): if (size < ETH_HLEN + sizeof(struct ipv6hdr)) goto out; if (hook_state.pf == NFPROTO_IPV6) break; goto out; default: ret = -EPROTO; goto out; } skb_reset_network_header(skb); } else { skb->protocol = nfproto_eth(hook_state.pf); } ctx.skb = skb; ret = bpf_test_run(prog, &ctx, repeat, &retval, &duration, false); if (ret) goto out; ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); out: kfree(user_ctx); kfree_skb(skb); kfree(data); return ret; } static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = { .owner = THIS_MODULE, .set = &test_sk_check_kfunc_ids, }; BTF_ID_LIST(bpf_prog_test_dtor_kfunc_ids) BTF_ID(struct, prog_test_ref_kfunc) BTF_ID(func, bpf_kfunc_call_test_release_dtor) BTF_ID(struct, prog_test_member) BTF_ID(func, bpf_kfunc_call_memb_release_dtor) static int __init bpf_prog_test_run_init(void) { const struct btf_id_dtor_kfunc bpf_prog_test_dtor_kfunc[] = { { .btf_id = bpf_prog_test_dtor_kfunc_ids[0], .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[1] }, { .btf_id = bpf_prog_test_dtor_kfunc_ids[2], .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[3], }, }; int ret; ret = register_btf_fmodret_id_set(&bpf_test_modify_return_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc, ARRAY_SIZE(bpf_prog_test_dtor_kfunc), THIS_MODULE); } late_initcall(bpf_prog_test_run_init);
710 790 710 1336 832 750 679 295 374 659 168 168 533 660 660 294 374 660 1465 1472 1185 126 126 126 125 125 126 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 // SPDX-License-Identifier: GPL-2.0-only /* * Network node table * * SELinux must keep a mapping of network nodes to labels/SIDs. This * mapping is maintained as part of the normal policy but a fast cache is * needed to reduce the lookup overhead since most of these queries happen on * a per-packet basis. * * Author: Paul Moore <paul@paul-moore.com> * * This code is heavily based on the "netif" concept originally developed by * James Morris <jmorris@redhat.com> * (see security/selinux/netif.c for more information) */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 */ #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/ip.h> #include <net/ipv6.h> #include "netnode.h" #include "objsec.h" #define SEL_NETNODE_HASH_SIZE 256 #define SEL_NETNODE_HASH_BKT_LIMIT 16 struct sel_netnode_bkt { unsigned int size; struct list_head list; }; struct sel_netnode { struct netnode_security_struct nsec; struct list_head list; struct rcu_head rcu; }; /* NOTE: we are using a combined hash table for both IPv4 and IPv6, the reason * for this is that I suspect most users will not make heavy use of both * address families at the same time so one table will usually end up wasted, * if this becomes a problem we can always add a hash table for each address * family later */ static DEFINE_SPINLOCK(sel_netnode_lock); static struct sel_netnode_bkt sel_netnode_hash[SEL_NETNODE_HASH_SIZE]; /** * sel_netnode_hashfn_ipv4 - IPv4 hashing function for the node table * @addr: IPv4 address * * Description: * This is the IPv4 hashing function for the node interface table, it returns * the bucket number for the given IP address. * */ static unsigned int sel_netnode_hashfn_ipv4(__be32 addr) { /* at some point we should determine if the mismatch in byte order * affects the hash function dramatically */ return (addr & (SEL_NETNODE_HASH_SIZE - 1)); } /** * sel_netnode_hashfn_ipv6 - IPv6 hashing function for the node table * @addr: IPv6 address * * Description: * This is the IPv6 hashing function for the node interface table, it returns * the bucket number for the given IP address. * */ static unsigned int sel_netnode_hashfn_ipv6(const struct in6_addr *addr) { /* just hash the least significant 32 bits to keep things fast (they * are the most likely to be different anyway), we can revisit this * later if needed */ return (addr->s6_addr32[3] & (SEL_NETNODE_HASH_SIZE - 1)); } /** * sel_netnode_find - Search for a node record * @addr: IP address * @family: address family * * Description: * Search the network node table and return the record matching @addr. If an * entry can not be found in the table return NULL. * */ static struct sel_netnode *sel_netnode_find(const void *addr, u16 family) { unsigned int idx; struct sel_netnode *node; switch (family) { case PF_INET: idx = sel_netnode_hashfn_ipv4(*(const __be32 *)addr); break; case PF_INET6: idx = sel_netnode_hashfn_ipv6(addr); break; default: BUG(); return NULL; } list_for_each_entry_rcu(node, &sel_netnode_hash[idx].list, list) if (node->nsec.family == family) switch (family) { case PF_INET: if (node->nsec.addr.ipv4 == *(const __be32 *)addr) return node; break; case PF_INET6: if (ipv6_addr_equal(&node->nsec.addr.ipv6, addr)) return node; break; } return NULL; } /** * sel_netnode_insert - Insert a new node into the table * @node: the new node record * * Description: * Add a new node record to the network address hash table. * */ static void sel_netnode_insert(struct sel_netnode *node) { unsigned int idx; switch (node->nsec.family) { case PF_INET: idx = sel_netnode_hashfn_ipv4(node->nsec.addr.ipv4); break; case PF_INET6: idx = sel_netnode_hashfn_ipv6(&node->nsec.addr.ipv6); break; default: BUG(); return; } /* we need to impose a limit on the growth of the hash table so check * this bucket to make sure it is within the specified bounds */ list_add_rcu(&node->list, &sel_netnode_hash[idx].list); if (sel_netnode_hash[idx].size == SEL_NETNODE_HASH_BKT_LIMIT) { struct sel_netnode *tail; tail = list_entry( rcu_dereference_protected( list_tail_rcu(&sel_netnode_hash[idx].list), lockdep_is_held(&sel_netnode_lock)), struct sel_netnode, list); list_del_rcu(&tail->list); kfree_rcu(tail, rcu); } else sel_netnode_hash[idx].size++; } /** * sel_netnode_sid_slow - Lookup the SID of a network address using the policy * @addr: the IP address * @family: the address family * @sid: node SID * * Description: * This function determines the SID of a network address by querying the * security policy. The result is added to the network address table to * speedup future queries. Returns zero on success, negative values on * failure. * */ static int sel_netnode_sid_slow(const void *addr, u16 family, u32 *sid) { int ret; struct sel_netnode *node; struct sel_netnode *new; spin_lock_bh(&sel_netnode_lock); node = sel_netnode_find(addr, family); if (node != NULL) { *sid = node->nsec.sid; spin_unlock_bh(&sel_netnode_lock); return 0; } /* If this memory allocation fails still return 0. The SID * is valid, it just won't be added to the cache. */ new = kmalloc(sizeof(*new), GFP_ATOMIC); switch (family) { case PF_INET: ret = security_node_sid(PF_INET, addr, sizeof(struct in_addr), sid); if (new) new->nsec.addr.ipv4 = *(const __be32 *)addr; break; case PF_INET6: ret = security_node_sid(PF_INET6, addr, sizeof(struct in6_addr), sid); if (new) new->nsec.addr.ipv6 = *(const struct in6_addr *)addr; break; default: BUG(); ret = -EINVAL; } if (ret == 0 && new) { new->nsec.family = family; new->nsec.sid = *sid; sel_netnode_insert(new); } else kfree(new); spin_unlock_bh(&sel_netnode_lock); if (unlikely(ret)) pr_warn("SELinux: failure in %s(), unable to determine network node label\n", __func__); return ret; } /** * sel_netnode_sid - Lookup the SID of a network address * @addr: the IP address * @family: the address family * @sid: node SID * * Description: * This function determines the SID of a network address using the fastest * method possible. First the address table is queried, but if an entry * can't be found then the policy is queried and the result is added to the * table to speedup future queries. Returns zero on success, negative values * on failure. * */ int sel_netnode_sid(const void *addr, u16 family, u32 *sid) { struct sel_netnode *node; rcu_read_lock(); node = sel_netnode_find(addr, family); if (likely(node != NULL)) { *sid = node->nsec.sid; rcu_read_unlock(); return 0; } rcu_read_unlock(); return sel_netnode_sid_slow(addr, family, sid); } /** * sel_netnode_flush - Flush the entire network address table * * Description: * Remove all entries from the network address table. * */ void sel_netnode_flush(void) { unsigned int idx; struct sel_netnode *node, *node_tmp; spin_lock_bh(&sel_netnode_lock); for (idx = 0; idx < SEL_NETNODE_HASH_SIZE; idx++) { list_for_each_entry_safe(node, node_tmp, &sel_netnode_hash[idx].list, list) { list_del_rcu(&node->list); kfree_rcu(node, rcu); } sel_netnode_hash[idx].size = 0; } spin_unlock_bh(&sel_netnode_lock); } static __init int sel_netnode_init(void) { int iter; if (!selinux_enabled_boot) return 0; for (iter = 0; iter < SEL_NETNODE_HASH_SIZE; iter++) { INIT_LIST_HEAD(&sel_netnode_hash[iter].list); sel_netnode_hash[iter].size = 0; } return 0; } __initcall(sel_netnode_init);
7 5 5 5 3 1 2 6 1 5 5 5 5 5 44 44 7 2 36 42 4 4 4 1 1 2 2 44 1 1 42 30 42 42 42 42 40 42 8 1 1 1 2 1 2 2 5 1 4 1 2 1 3 6 1 5 5 7 7 7 5 5 5 5 5 13 13 5 2 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/filter.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/wait.h> #include <linux/poll.h> #include <linux/tcp.h> #include <linux/uaccess.h> #include <linux/debugfs.h> #include <linux/caif/caif_socket.h> #include <linux/pkt_sched.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/caif/caif_layer.h> #include <net/caif/caif_dev.h> #include <net/caif/cfpkt.h> MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol socket support (AF_CAIF)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(AF_CAIF); /* * CAIF state is re-using the TCP socket states. * caif_states stored in sk_state reflect the state as reported by * the CAIF stack, while sk_socket->state is the state of the socket. */ enum caif_states { CAIF_CONNECTED = TCP_ESTABLISHED, CAIF_CONNECTING = TCP_SYN_SENT, CAIF_DISCONNECTED = TCP_CLOSE }; #define TX_FLOW_ON_BIT 1 #define RX_FLOW_ON_BIT 2 struct caifsock { struct sock sk; /* must be first member */ struct cflayer layer; unsigned long flow_state; struct caif_connect_request conn_req; struct mutex readlock; struct dentry *debugfs_socket_dir; int headroom, tailroom, maxframe; }; static int rx_flow_is_on(struct caifsock *cf_sk) { return test_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state); } static int tx_flow_is_on(struct caifsock *cf_sk) { return test_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state); } static void set_rx_flow_off(struct caifsock *cf_sk) { clear_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state); } static void set_rx_flow_on(struct caifsock *cf_sk) { set_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state); } static void set_tx_flow_off(struct caifsock *cf_sk) { clear_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state); } static void set_tx_flow_on(struct caifsock *cf_sk) { set_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state); } static void caif_read_lock(struct sock *sk) { struct caifsock *cf_sk; cf_sk = container_of(sk, struct caifsock, sk); mutex_lock(&cf_sk->readlock); } static void caif_read_unlock(struct sock *sk) { struct caifsock *cf_sk; cf_sk = container_of(sk, struct caifsock, sk); mutex_unlock(&cf_sk->readlock); } static int sk_rcvbuf_lowwater(struct caifsock *cf_sk) { /* A quarter of full buffer is used a low water mark */ return cf_sk->sk.sk_rcvbuf / 4; } static void caif_flow_ctrl(struct sock *sk, int mode) { struct caifsock *cf_sk; cf_sk = container_of(sk, struct caifsock, sk); if (cf_sk->layer.dn && cf_sk->layer.dn->modemcmd) cf_sk->layer.dn->modemcmd(cf_sk->layer.dn, mode); } /* * Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are * not dropped, but CAIF is sending flow off instead. */ static void caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); bool queued = false; if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) { net_dbg_ratelimited("sending flow OFF (queue len = %d %d)\n", atomic_read(&cf_sk->sk.sk_rmem_alloc), sk_rcvbuf_lowwater(cf_sk)); set_rx_flow_off(cf_sk); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); } err = sk_filter(sk, skb); if (err) goto out; if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) { set_rx_flow_off(cf_sk); net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n"); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); } skb->dev = NULL; skb_set_owner_r(skb, sk); spin_lock_irqsave(&list->lock, flags); queued = !sock_flag(sk, SOCK_DEAD); if (queued) __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); out: if (queued) sk->sk_data_ready(sk); else kfree_skb(skb); } /* Packet Receive Callback function called from CAIF Stack */ static int caif_sktrecv_cb(struct cflayer *layr, struct cfpkt *pkt) { struct caifsock *cf_sk; struct sk_buff *skb; cf_sk = container_of(layr, struct caifsock, layer); skb = cfpkt_tonative(pkt); if (unlikely(cf_sk->sk.sk_state != CAIF_CONNECTED)) { kfree_skb(skb); return 0; } caif_queue_rcv_skb(&cf_sk->sk, skb); return 0; } static void cfsk_hold(struct cflayer *layr) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); sock_hold(&cf_sk->sk); } static void cfsk_put(struct cflayer *layr) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); sock_put(&cf_sk->sk); } /* Packet Control Callback function called from CAIF */ static void caif_ctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, int phyid) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); switch (flow) { case CAIF_CTRLCMD_FLOW_ON_IND: /* OK from modem to start sending again */ set_tx_flow_on(cf_sk); cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_FLOW_OFF_IND: /* Modem asks us to shut up */ set_tx_flow_off(cf_sk); cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_INIT_RSP: /* We're now connected */ caif_client_register_refcnt(&cf_sk->layer, cfsk_hold, cfsk_put); cf_sk->sk.sk_state = CAIF_CONNECTED; set_tx_flow_on(cf_sk); cf_sk->sk.sk_shutdown = 0; cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_DEINIT_RSP: /* We're now disconnected */ cf_sk->sk.sk_state = CAIF_DISCONNECTED; cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_INIT_FAIL_RSP: /* Connect request failed */ cf_sk->sk.sk_err = ECONNREFUSED; cf_sk->sk.sk_state = CAIF_DISCONNECTED; cf_sk->sk.sk_shutdown = SHUTDOWN_MASK; /* * Socket "standards" seems to require POLLOUT to * be set at connect failure. */ set_tx_flow_on(cf_sk); cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: /* Modem has closed this connection, or device is down. */ cf_sk->sk.sk_shutdown = SHUTDOWN_MASK; cf_sk->sk.sk_err = ECONNRESET; set_rx_flow_on(cf_sk); sk_error_report(&cf_sk->sk); break; default: pr_debug("Unexpected flow command %d\n", flow); } } static void caif_check_flow_release(struct sock *sk) { struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); if (rx_flow_is_on(cf_sk)) return; if (atomic_read(&sk->sk_rmem_alloc) <= sk_rcvbuf_lowwater(cf_sk)) { set_rx_flow_on(cf_sk); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_ON_REQ); } } /* * Copied from unix_dgram_recvmsg, but removed credit checks, * changed locking, address handling and added MSG_TRUNC. */ static int caif_seqpkt_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int ret; int copylen; ret = -EOPNOTSUPP; if (flags & MSG_OOB) goto read_error; skb = skb_recv_datagram(sk, flags, &ret); if (!skb) goto read_error; copylen = skb->len; if (len < copylen) { m->msg_flags |= MSG_TRUNC; copylen = len; } ret = skb_copy_datagram_msg(skb, 0, m, copylen); if (ret) goto out_free; ret = (flags & MSG_TRUNC) ? skb->len : copylen; out_free: skb_free_datagram(sk, skb); caif_check_flow_release(sk); return ret; read_error: return ret; } /* Copied from unix_stream_wait_data, identical except for lock call. */ static long caif_stream_data_wait(struct sock *sk, long timeo) { DEFINE_WAIT(wait); lock_sock(sk); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (!skb_queue_empty(&sk->sk_receive_queue) || sk->sk_err || sk->sk_state != CAIF_CONNECTED || sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || !timeo) break; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) break; sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); release_sock(sk); return timeo; } /* * Copied from unix_stream_recvmsg, but removed credit checks, * changed locking calls, changed address handling. */ static int caif_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int copied = 0; int target; int err = 0; long timeo; err = -EOPNOTSUPP; if (flags&MSG_OOB) goto out; /* * Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ err = -EAGAIN; if (sk->sk_state == CAIF_CONNECTING) goto out; caif_read_lock(sk); target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT); do { int chunk; struct sk_buff *skb; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; goto unlock; } skb = skb_dequeue(&sk->sk_receive_queue); caif_check_flow_release(sk); if (skb == NULL) { if (copied >= target) goto unlock; /* * POSIX 1003.1g mandates this order. */ err = sock_error(sk); if (err) goto unlock; err = -ECONNRESET; if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; err = -EPIPE; if (sk->sk_state != CAIF_CONNECTED) goto unlock; if (sock_flag(sk, SOCK_DEAD)) goto unlock; release_sock(sk); err = -EAGAIN; if (!timeo) break; caif_read_unlock(sk); timeo = caif_stream_data_wait(sk, timeo); if (signal_pending(current)) { err = sock_intr_errno(timeo); goto out; } caif_read_lock(sk); continue; unlock: release_sock(sk); break; } release_sock(sk); chunk = min_t(unsigned int, skb->len, size); if (memcpy_to_msg(msg, skb->data, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (copied == 0) copied = -EFAULT; break; } copied += chunk; size -= chunk; /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { skb_pull(skb, chunk); /* put the skb back if we didn't use it up. */ if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); break; } kfree_skb(skb); } else { /* * It is questionable, see note in unix_dgram_recvmsg. */ /* put message back and return */ skb_queue_head(&sk->sk_receive_queue, skb); break; } } while (size); caif_read_unlock(sk); out: return copied ? : err; } /* * Copied from sock.c:sock_wait_for_wmem, but change to wait for * CAIF flow-on and sock_writable. */ static long caif_wait_for_flow_on(struct caifsock *cf_sk, int wait_writeable, long timeo, int *err) { struct sock *sk = &cf_sk->sk; DEFINE_WAIT(wait); for (;;) { *err = 0; if (tx_flow_is_on(cf_sk) && (!wait_writeable || sock_writeable(&cf_sk->sk))) break; *err = -ETIMEDOUT; if (!timeo) break; *err = -ERESTARTSYS; if (signal_pending(current)) break; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); *err = -ECONNRESET; if (sk->sk_shutdown & SHUTDOWN_MASK) break; *err = -sk->sk_err; if (sk->sk_err) break; *err = -EPIPE; if (cf_sk->sk.sk_state != CAIF_CONNECTED) break; timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(sk), &wait); return timeo; } /* * Transmit a SKB. The device may temporarily request re-transmission * by returning EAGAIN. */ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk, int noblock, long timeo) { struct cfpkt *pkt; pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb); memset(skb->cb, 0, sizeof(struct caif_payload_info)); cfpkt_set_prio(pkt, cf_sk->sk.sk_priority); if (cf_sk->layer.dn == NULL) { kfree_skb(skb); return -EINVAL; } return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt); } /* Copied from af_unix:unix_dgram_sendmsg, and adapted to CAIF */ static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); int buffer_size; int ret = 0; struct sk_buff *skb = NULL; int noblock; long timeo; caif_assert(cf_sk); ret = sock_error(sk); if (ret) goto err; ret = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto err; ret = -EOPNOTSUPP; if (msg->msg_namelen) goto err; noblock = msg->msg_flags & MSG_DONTWAIT; timeo = sock_sndtimeo(sk, noblock); timeo = caif_wait_for_flow_on(container_of(sk, struct caifsock, sk), 1, timeo, &ret); if (ret) goto err; ret = -EPIPE; if (cf_sk->sk.sk_state != CAIF_CONNECTED || sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) goto err; /* Error if trying to write more than maximum frame size. */ ret = -EMSGSIZE; if (len > cf_sk->maxframe && cf_sk->sk.sk_protocol != CAIFPROTO_RFM) goto err; buffer_size = len + cf_sk->headroom + cf_sk->tailroom; ret = -ENOMEM; skb = sock_alloc_send_skb(sk, buffer_size, noblock, &ret); if (!skb || skb_tailroom(skb) < buffer_size) goto err; skb_reserve(skb, cf_sk->headroom); ret = memcpy_from_msg(skb_put(skb, len), msg, len); if (ret) goto err; ret = transmit_skb(skb, cf_sk, noblock, timeo); if (ret < 0) /* skb is already freed */ return ret; return len; err: kfree_skb(skb); return ret; } /* * Copied from unix_stream_sendmsg and adapted to CAIF: * Changed removed permission handling and added waiting for flow on * and other minor adaptations. */ static int caif_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); int err, size; struct sk_buff *skb; int sent = 0; long timeo; err = -EOPNOTSUPP; if (unlikely(msg->msg_flags&MSG_OOB)) goto out_err; if (unlikely(msg->msg_namelen)) goto out_err; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); timeo = caif_wait_for_flow_on(cf_sk, 1, timeo, &err); if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) goto pipe_err; while (sent < len) { size = len-sent; if (size > cf_sk->maxframe) size = cf_sk->maxframe; /* If size is more than half of sndbuf, chop up message */ if (size > ((sk->sk_sndbuf >> 1) - 64)) size = (sk->sk_sndbuf >> 1) - 64; if (size > SKB_MAX_ALLOC) size = SKB_MAX_ALLOC; skb = sock_alloc_send_skb(sk, size + cf_sk->headroom + cf_sk->tailroom, msg->msg_flags&MSG_DONTWAIT, &err); if (skb == NULL) goto out_err; skb_reserve(skb, cf_sk->headroom); /* * If you pass two values to the sock_alloc_send_skb * it tries to grab the large buffer with GFP_NOFS * (which can fail easily), and if it fails grab the * fallback size buffer which is under a page and will * succeed. [Alan] */ size = min_t(int, size, skb_tailroom(skb)); err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err) { kfree_skb(skb); goto out_err; } err = transmit_skb(skb, cf_sk, msg->msg_flags&MSG_DONTWAIT, timeo); if (err < 0) /* skb is already freed */ goto pipe_err; sent += size; } return sent; pipe_err: if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; out_err: return sent ? : err; } static int setsockopt(struct socket *sock, int lvl, int opt, sockptr_t ov, unsigned int ol) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); int linksel; if (cf_sk->sk.sk_socket->state != SS_UNCONNECTED) return -ENOPROTOOPT; switch (opt) { case CAIFSO_LINK_SELECT: if (ol < sizeof(int)) return -EINVAL; if (lvl != SOL_CAIF) goto bad_sol; if (copy_from_sockptr(&linksel, ov, sizeof(int))) return -EINVAL; lock_sock(&(cf_sk->sk)); cf_sk->conn_req.link_selector = linksel; release_sock(&cf_sk->sk); return 0; case CAIFSO_REQ_PARAM: if (lvl != SOL_CAIF) goto bad_sol; if (cf_sk->sk.sk_protocol != CAIFPROTO_UTIL) return -ENOPROTOOPT; lock_sock(&(cf_sk->sk)); if (ol > sizeof(cf_sk->conn_req.param.data) || copy_from_sockptr(&cf_sk->conn_req.param.data, ov, ol)) { release_sock(&cf_sk->sk); return -EINVAL; } cf_sk->conn_req.param.size = ol; release_sock(&cf_sk->sk); return 0; default: return -ENOPROTOOPT; } return 0; bad_sol: return -ENOPROTOOPT; } /* * caif_connect() - Connect a CAIF Socket * Copied and modified af_irda.c:irda_connect(). * * Note : by consulting "errno", the user space caller may learn the cause * of the failure. Most of them are visible in the function, others may come * from subroutines called and are listed here : * o -EAFNOSUPPORT: bad socket family or type. * o -ESOCKTNOSUPPORT: bad socket type or protocol * o -EINVAL: bad socket address, or CAIF link type * o -ECONNREFUSED: remote end refused the connection. * o -EINPROGRESS: connect request sent but timed out (or non-blocking) * o -EISCONN: already connected. * o -ETIMEDOUT: Connection timed out (send timeout) * o -ENODEV: No link layer to send request * o -ECONNRESET: Received Shutdown indication or lost link layer * o -ENOMEM: Out of memory * * State Strategy: * o sk_state: holds the CAIF_* protocol state, it's updated by * caif_ctrl_cb. * o sock->state: holds the SS_* socket state and is updated by connect and * disconnect. */ static int caif_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); long timeo; int err; int ifindex, headroom, tailroom; unsigned int mtu; struct net_device *dev; lock_sock(sk); err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) goto out; err = -EAFNOSUPPORT; if (uaddr->sa_family != AF_CAIF) goto out; switch (sock->state) { case SS_UNCONNECTED: /* Normal case, a fresh connect */ caif_assert(sk->sk_state == CAIF_DISCONNECTED); break; case SS_CONNECTING: switch (sk->sk_state) { case CAIF_CONNECTED: sock->state = SS_CONNECTED; err = -EISCONN; goto out; case CAIF_DISCONNECTED: /* Reconnect allowed */ break; case CAIF_CONNECTING: err = -EALREADY; if (flags & O_NONBLOCK) goto out; goto wait_connect; } break; case SS_CONNECTED: caif_assert(sk->sk_state == CAIF_CONNECTED || sk->sk_state == CAIF_DISCONNECTED); if (sk->sk_shutdown & SHUTDOWN_MASK) { /* Allow re-connect after SHUTDOWN_IND */ caif_disconnect_client(sock_net(sk), &cf_sk->layer); caif_free_client(&cf_sk->layer); break; } /* No reconnect on a seqpacket socket */ err = -EISCONN; goto out; case SS_DISCONNECTING: case SS_FREE: caif_assert(1); /*Should never happen */ break; } sk->sk_state = CAIF_DISCONNECTED; sock->state = SS_UNCONNECTED; sk_stream_kill_queues(&cf_sk->sk); err = -EINVAL; if (addr_len != sizeof(struct sockaddr_caif)) goto out; memcpy(&cf_sk->conn_req.sockaddr, uaddr, sizeof(struct sockaddr_caif)); /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = CAIF_CONNECTING; /* Check priority value comming from socket */ /* if priority value is out of range it will be ajusted */ if (cf_sk->sk.sk_priority > CAIF_PRIO_MAX) cf_sk->conn_req.priority = CAIF_PRIO_MAX; else if (cf_sk->sk.sk_priority < CAIF_PRIO_MIN) cf_sk->conn_req.priority = CAIF_PRIO_MIN; else cf_sk->conn_req.priority = cf_sk->sk.sk_priority; /*ifindex = id of the interface.*/ cf_sk->conn_req.ifindex = cf_sk->sk.sk_bound_dev_if; cf_sk->layer.receive = caif_sktrecv_cb; err = caif_connect_client(sock_net(sk), &cf_sk->conn_req, &cf_sk->layer, &ifindex, &headroom, &tailroom); if (err < 0) { cf_sk->sk.sk_socket->state = SS_UNCONNECTED; cf_sk->sk.sk_state = CAIF_DISCONNECTED; goto out; } err = -ENODEV; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (!dev) { rcu_read_unlock(); goto out; } cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom); mtu = dev->mtu; rcu_read_unlock(); cf_sk->tailroom = tailroom; cf_sk->maxframe = mtu - (headroom + tailroom); if (cf_sk->maxframe < 1) { pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu); err = -ENODEV; goto out; } err = -EINPROGRESS; wait_connect: if (sk->sk_state != CAIF_CONNECTED && (flags & O_NONBLOCK)) goto out; timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); release_sock(sk); err = -ERESTARTSYS; timeo = wait_event_interruptible_timeout(*sk_sleep(sk), sk->sk_state != CAIF_CONNECTING, timeo); lock_sock(sk); if (timeo < 0) goto out; /* -ERESTARTSYS */ err = -ETIMEDOUT; if (timeo == 0 && sk->sk_state != CAIF_CONNECTED) goto out; if (sk->sk_state != CAIF_CONNECTED) { sock->state = SS_UNCONNECTED; err = sock_error(sk); if (!err) err = -ECONNREFUSED; goto out; } sock->state = SS_CONNECTED; err = 0; out: release_sock(sk); return err; } /* * caif_release() - Disconnect a CAIF Socket * Copied and modified af_irda.c:irda_release(). */ static int caif_release(struct socket *sock) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); if (!sk) return 0; set_tx_flow_off(cf_sk); /* * Ensure that packets are not queued after this point in time. * caif_queue_rcv_skb checks SOCK_DEAD holding the queue lock, * this ensures no packets when sock is dead. */ spin_lock_bh(&sk->sk_receive_queue.lock); sock_set_flag(sk, SOCK_DEAD); spin_unlock_bh(&sk->sk_receive_queue.lock); sock->sk = NULL; WARN_ON(IS_ERR(cf_sk->debugfs_socket_dir)); debugfs_remove_recursive(cf_sk->debugfs_socket_dir); lock_sock(&(cf_sk->sk)); sk->sk_state = CAIF_DISCONNECTED; sk->sk_shutdown = SHUTDOWN_MASK; caif_disconnect_client(sock_net(sk), &cf_sk->layer); cf_sk->sk.sk_socket->state = SS_DISCONNECTING; wake_up_interruptible_poll(sk_sleep(sk), EPOLLERR|EPOLLHUP); sock_orphan(sk); sk_stream_kill_queues(&cf_sk->sk); release_sock(sk); sock_put(sk); return 0; } /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ static __poll_t caif_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ if (sk->sk_err) mask |= EPOLLERR; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) mask |= EPOLLIN | EPOLLRDNORM; /* * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. */ if (sock_writeable(sk) && tx_flow_is_on(cf_sk)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; } static const struct proto_ops caif_seqpacket_ops = { .family = PF_CAIF, .owner = THIS_MODULE, .release = caif_release, .bind = sock_no_bind, .connect = caif_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = setsockopt, .sendmsg = caif_seqpkt_sendmsg, .recvmsg = caif_seqpkt_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops caif_stream_ops = { .family = PF_CAIF, .owner = THIS_MODULE, .release = caif_release, .bind = sock_no_bind, .connect = caif_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = setsockopt, .sendmsg = caif_stream_sendmsg, .recvmsg = caif_stream_recvmsg, .mmap = sock_no_mmap, }; /* This function is called when a socket is finally destroyed. */ static void caif_sock_destructor(struct sock *sk) { struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); caif_assert(!refcount_read(&sk->sk_wmem_alloc)); caif_assert(sk_unhashed(sk)); caif_assert(!sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { pr_debug("Attempt to release alive CAIF socket: %p\n", sk); return; } sk_stream_kill_queues(&cf_sk->sk); WARN_ON_ONCE(sk->sk_forward_alloc); caif_free_client(&cf_sk->layer); } static int caif_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk = NULL; struct caifsock *cf_sk = NULL; static struct proto prot = {.name = "PF_CAIF", .owner = THIS_MODULE, .obj_size = sizeof(struct caifsock), .useroffset = offsetof(struct caifsock, conn_req.param), .usersize = sizeof_field(struct caifsock, conn_req.param) }; if (!capable(CAP_SYS_ADMIN) && !capable(CAP_NET_ADMIN)) return -EPERM; /* * The sock->type specifies the socket type to use. * The CAIF socket is a packet stream in the sense * that it is packet based. CAIF trusts the reliability * of the link, no resending is implemented. */ if (sock->type == SOCK_SEQPACKET) sock->ops = &caif_seqpacket_ops; else if (sock->type == SOCK_STREAM) sock->ops = &caif_stream_ops; else return -ESOCKTNOSUPPORT; if (protocol < 0 || protocol >= CAIFPROTO_MAX) return -EPROTONOSUPPORT; /* * Set the socket state to unconnected. The socket state * is really not used at all in the net/core or socket.c but the * initialization makes sure that sock->state is not uninitialized. */ sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern); if (!sk) return -ENOMEM; cf_sk = container_of(sk, struct caifsock, sk); /* Store the protocol */ sk->sk_protocol = (unsigned char) protocol; /* Initialize default priority for well-known cases */ switch (protocol) { case CAIFPROTO_AT: sk->sk_priority = TC_PRIO_CONTROL; break; case CAIFPROTO_RFM: sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; break; default: sk->sk_priority = TC_PRIO_BESTEFFORT; } /* * Lock in order to try to stop someone from opening the socket * too early. */ lock_sock(&(cf_sk->sk)); /* Initialize the nozero default sock structure data. */ sock_init_data(sock, sk); sk->sk_destruct = caif_sock_destructor; mutex_init(&cf_sk->readlock); /* single task reading lock */ cf_sk->layer.ctrlcmd = caif_ctrl_cb; cf_sk->sk.sk_socket->state = SS_UNCONNECTED; cf_sk->sk.sk_state = CAIF_DISCONNECTED; set_tx_flow_off(cf_sk); set_rx_flow_on(cf_sk); /* Set default options on configuration */ cf_sk->conn_req.link_selector = CAIF_LINK_LOW_LATENCY; cf_sk->conn_req.protocol = protocol; release_sock(&cf_sk->sk); return 0; } static const struct net_proto_family caif_family_ops = { .family = PF_CAIF, .create = caif_create, .owner = THIS_MODULE, }; static int __init caif_sktinit_module(void) { return sock_register(&caif_family_ops); } static void __exit caif_sktexit_module(void) { sock_unregister(PF_CAIF); } module_init(caif_sktinit_module); module_exit(caif_sktexit_module);
95 32 100 100 100 100 100 100 100 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 // SPDX-License-Identifier: GPL-2.0 /* * module.c - module sysfs fun for drivers */ #include <linux/device.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include "base.h" static char *make_driver_name(const struct device_driver *drv) { char *driver_name; driver_name = kasprintf(GFP_KERNEL, "%s:%s", drv->bus->name, drv->name); if (!driver_name) return NULL; return driver_name; } static void module_create_drivers_dir(struct module_kobject *mk) { static DEFINE_MUTEX(drivers_dir_mutex); mutex_lock(&drivers_dir_mutex); if (mk && !mk->drivers_dir) mk->drivers_dir = kobject_create_and_add("drivers", &mk->kobj); mutex_unlock(&drivers_dir_mutex); } int module_add_driver(struct module *mod, const struct device_driver *drv) { char *driver_name; struct module_kobject *mk = NULL; int ret; if (!drv) return 0; if (mod) mk = &mod->mkobj; else if (drv->mod_name) { /* Lookup or create built-in module entry in /sys/modules */ mk = lookup_or_create_module_kobject(drv->mod_name); if (mk) { /* remember our module structure */ drv->p->mkobj = mk; /* lookup_or_create_module_kobject took a reference */ kobject_put(&mk->kobj); } } if (!mk) return 0; ret = sysfs_create_link(&drv->p->kobj, &mk->kobj, "module"); if (ret) return ret; driver_name = make_driver_name(drv); if (!driver_name) { ret = -ENOMEM; goto out_remove_kobj; } module_create_drivers_dir(mk); if (!mk->drivers_dir) { ret = -EINVAL; goto out_free_driver_name; } ret = sysfs_create_link(mk->drivers_dir, &drv->p->kobj, driver_name); if (ret) goto out_remove_drivers_dir; kfree(driver_name); return 0; out_remove_drivers_dir: sysfs_remove_link(mk->drivers_dir, driver_name); out_free_driver_name: kfree(driver_name); out_remove_kobj: sysfs_remove_link(&drv->p->kobj, "module"); return ret; } void module_remove_driver(const struct device_driver *drv) { struct module_kobject *mk = NULL; char *driver_name; if (!drv) return; sysfs_remove_link(&drv->p->kobj, "module"); if (drv->owner) mk = &drv->owner->mkobj; else if (drv->p->mkobj) mk = drv->p->mkobj; if (mk && mk->drivers_dir) { driver_name = make_driver_name(drv); if (driver_name) { sysfs_remove_link(mk->drivers_dir, driver_name); kfree(driver_name); } } }
30 1 1 2 23 2 2 7 4 2 7 4 1 1 1 1 14 1 4 2 2 13 13 12 4 3 2 1 9 4 3 2 9 9 9 12 13 2 3 6 2 2 3 6 2 535 535 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us> */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <linux/tc_act/tc_vlan.h> #include <net/tc_act/tc_vlan.h> static struct tc_action_ops act_vlan_ops; TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; int err; u16 tci; tcf_lastuse_update(&v->tcf_tm); tcf_action_update_bstats(&v->common, skb); /* Ensure 'data' points at mac_header prior calling vlan manipulating * functions. */ if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); p = rcu_dereference_bh(v->vlan_p); switch (p->tcfv_action) { case TCA_VLAN_ACT_POP: err = skb_vlan_pop(skb); if (err) goto drop; break; case TCA_VLAN_ACT_PUSH: err = skb_vlan_push(skb, p->tcfv_push_proto, p->tcfv_push_vid | (p->tcfv_push_prio << VLAN_PRIO_SHIFT)); if (err) goto drop; break; case TCA_VLAN_ACT_MODIFY: /* No-op if no vlan tag (either hw-accel or in-payload) */ if (!skb_vlan_tagged(skb)) goto out; /* extract existing tag (and guarantee no hw-accel tag) */ if (skb_vlan_tag_present(skb)) { tci = skb_vlan_tag_get(skb); __vlan_hwaccel_clear_tag(skb); } else { /* in-payload vlan tag, pop it */ err = __skb_vlan_pop(skb, &tci); if (err) goto drop; } /* replace the vid */ tci = (tci & ~VLAN_VID_MASK) | p->tcfv_push_vid; /* replace prio bits, if tcfv_push_prio specified */ if (p->tcfv_push_prio_exists) { tci &= ~VLAN_PRIO_MASK; tci |= p->tcfv_push_prio << VLAN_PRIO_SHIFT; } /* put updated tci as hwaccel tag */ __vlan_hwaccel_put_tag(skb, p->tcfv_push_proto, tci); break; case TCA_VLAN_ACT_POP_ETH: err = skb_eth_pop(skb); if (err) goto drop; break; case TCA_VLAN_ACT_PUSH_ETH: err = skb_eth_push(skb, p->tcfv_push_dst, p->tcfv_push_src); if (err) goto drop; break; default: BUG(); } out: if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); skb_reset_mac_len(skb); return p->action; drop: tcf_action_inc_drop_qstats(&v->common); return TC_ACT_SHOT; } static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { [TCA_VLAN_UNSPEC] = { .strict_start_type = TCA_VLAN_PUSH_ETH_DST }, [TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) }, [TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 }, [TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 }, [TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 }, [TCA_VLAN_PUSH_ETH_DST] = NLA_POLICY_ETH_ADDR, [TCA_VLAN_PUSH_ETH_SRC] = NLA_POLICY_ETH_ADDR, }; static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_vlan_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_VLAN_MAX + 1]; struct tcf_chain *goto_ch = NULL; bool push_prio_exists = false; struct tcf_vlan_params *p; struct tc_vlan *parm; struct tcf_vlan *v; int action; u16 push_vid = 0; __be16 push_proto = 0; u8 push_prio = 0; bool exists = false; int ret = 0, err; u32 index; if (!nla) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_VLAN_MAX, nla, vlan_policy, NULL); if (err < 0) return err; if (!tb[TCA_VLAN_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_VLAN_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; switch (parm->v_action) { case TCA_VLAN_ACT_POP: break; case TCA_VLAN_ACT_PUSH: case TCA_VLAN_ACT_MODIFY: if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); if (push_vid >= VLAN_VID_MASK) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -ERANGE; } if (tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]) { push_proto = nla_get_be16(tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]); switch (push_proto) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): break; default: if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EPROTONOSUPPORT; } } else { push_proto = htons(ETH_P_8021Q); } push_prio_exists = !!tb[TCA_VLAN_PUSH_VLAN_PRIORITY]; if (push_prio_exists) push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]); break; case TCA_VLAN_ACT_POP_ETH: break; case TCA_VLAN_ACT_PUSH_ETH: if (!tb[TCA_VLAN_PUSH_ETH_DST] || !tb[TCA_VLAN_PUSH_ETH_SRC]) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } break; default: if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } action = parm->v_action; if (!exists) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_vlan_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; v = to_vlan(*a); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { err = -ENOMEM; goto put_chain; } p->tcfv_action = action; p->tcfv_push_vid = push_vid; p->tcfv_push_prio = push_prio; p->tcfv_push_prio_exists = push_prio_exists || action == TCA_VLAN_ACT_PUSH; p->tcfv_push_proto = push_proto; if (action == TCA_VLAN_ACT_PUSH_ETH) { nla_memcpy(&p->tcfv_push_dst, tb[TCA_VLAN_PUSH_ETH_DST], ETH_ALEN); nla_memcpy(&p->tcfv_push_src, tb[TCA_VLAN_PUSH_ETH_SRC], ETH_ALEN); } p->action = parm->action; spin_lock_bh(&v->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); spin_unlock_bh(&v->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (p) kfree_rcu(p, rcu); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static void tcf_vlan_cleanup(struct tc_action *a) { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; p = rcu_dereference_protected(v->vlan_p, 1); if (p) kfree_rcu(p, rcu); } static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; struct tc_vlan opt = { .index = v->tcf_index, .refcnt = refcount_read(&v->tcf_refcnt) - ref, .bindcnt = atomic_read(&v->tcf_bindcnt) - bind, }; struct tcf_t t; rcu_read_lock(); p = rcu_dereference(v->vlan_p); opt.action = p->action; opt.v_action = p->tcfv_action; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if ((p->tcfv_action == TCA_VLAN_ACT_PUSH || p->tcfv_action == TCA_VLAN_ACT_MODIFY) && (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, p->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, p->tcfv_push_proto) || (p->tcfv_push_prio_exists && nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, p->tcfv_push_prio)))) goto nla_put_failure; if (p->tcfv_action == TCA_VLAN_ACT_PUSH_ETH) { if (nla_put(skb, TCA_VLAN_PUSH_ETH_DST, ETH_ALEN, p->tcfv_push_dst)) goto nla_put_failure; if (nla_put(skb, TCA_VLAN_PUSH_ETH_SRC, ETH_ALEN, p->tcfv_push_src)) goto nla_put_failure; } tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; rcu_read_unlock(); return skb->len; nla_put_failure: rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_vlan *v = to_vlan(a); struct tcf_t *tm = &v->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static size_t tcf_vlan_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_vlan)) + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_ID */ + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_PROTOCOL */ + nla_total_size(sizeof(u8)); /* TCA_VLAN_PUSH_VLAN_PRIORITY */ } static int tcf_vlan_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; switch (tcf_vlan_action(act)) { case TCA_VLAN_ACT_PUSH: entry->id = FLOW_ACTION_VLAN_PUSH; entry->vlan.vid = tcf_vlan_push_vid(act); entry->vlan.proto = tcf_vlan_push_proto(act); entry->vlan.prio = tcf_vlan_push_prio(act); break; case TCA_VLAN_ACT_POP: entry->id = FLOW_ACTION_VLAN_POP; break; case TCA_VLAN_ACT_MODIFY: entry->id = FLOW_ACTION_VLAN_MANGLE; entry->vlan.vid = tcf_vlan_push_vid(act); entry->vlan.proto = tcf_vlan_push_proto(act); entry->vlan.prio = tcf_vlan_push_prio(act); break; case TCA_VLAN_ACT_POP_ETH: entry->id = FLOW_ACTION_VLAN_POP_ETH; break; case TCA_VLAN_ACT_PUSH_ETH: entry->id = FLOW_ACTION_VLAN_PUSH_ETH; tcf_vlan_push_eth(entry->vlan_push_eth.src, entry->vlan_push_eth.dst, act); break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported vlan action mode offload"); return -EOPNOTSUPP; } *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; switch (tcf_vlan_action(act)) { case TCA_VLAN_ACT_PUSH: fl_action->id = FLOW_ACTION_VLAN_PUSH; break; case TCA_VLAN_ACT_POP: fl_action->id = FLOW_ACTION_VLAN_POP; break; case TCA_VLAN_ACT_MODIFY: fl_action->id = FLOW_ACTION_VLAN_MANGLE; break; case TCA_VLAN_ACT_POP_ETH: fl_action->id = FLOW_ACTION_VLAN_POP_ETH; break; case TCA_VLAN_ACT_PUSH_ETH: fl_action->id = FLOW_ACTION_VLAN_PUSH_ETH; break; default: return -EOPNOTSUPP; } } return 0; } static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .id = TCA_ID_VLAN, .owner = THIS_MODULE, .act = tcf_vlan_act, .dump = tcf_vlan_dump, .init = tcf_vlan_init, .cleanup = tcf_vlan_cleanup, .stats_update = tcf_vlan_stats_update, .get_fill_size = tcf_vlan_get_fill_size, .offload_act_setup = tcf_vlan_offload_act_setup, .size = sizeof(struct tcf_vlan), }; MODULE_ALIAS_NET_ACT("vlan"); static __net_init int vlan_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_vlan_ops.net_id); return tc_action_net_init(net, tn, &act_vlan_ops); } static void __net_exit vlan_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_vlan_ops.net_id); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit_batch = vlan_exit_net, .id = &act_vlan_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init vlan_init_module(void) { return tcf_register_action(&act_vlan_ops, &vlan_net_ops); } static void __exit vlan_cleanup_module(void) { tcf_unregister_action(&act_vlan_ops, &vlan_net_ops); } module_init(vlan_init_module); module_exit(vlan_cleanup_module); MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>"); MODULE_DESCRIPTION("vlan manipulation actions"); MODULE_LICENSE("GPL v2");
5 9 7 2 2 1 2 5 4 10 1 1 1 3 5 7 1 3 1 3 4 4 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 // SPDX-License-Identifier: GPL-2.0-only /* * File: datagram.c * * Datagram (ISI) Phonet sockets * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Rémi Denis-Courmont */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/socket.h> #include <asm/ioctls.h> #include <net/sock.h> #include <linux/phonet.h> #include <linux/export.h> #include <net/phonet/phonet.h> static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb); /* associated socket ceases to exist */ static void pn_sock_close(struct sock *sk, long timeout) { sk_common_release(sk); } static int pn_ioctl(struct sock *sk, int cmd, int *karg) { struct sk_buff *skb; switch (cmd) { case SIOCINQ: spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); *karg = skb ? skb->len : 0; spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; case SIOCPNADDRESOURCE: case SIOCPNDELRESOURCE: { u32 res = *karg; if (res >= 256) return -EINVAL; if (cmd == SIOCPNADDRESOURCE) return pn_sock_bind_res(sk, res); else return pn_sock_unbind_res(sk, res); } } return -ENOIOCTLCMD; } /* Destroy socket. All references are gone. */ static void pn_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); } static int pn_init(struct sock *sk) { sk->sk_destruct = pn_destruct; return 0; } static int pn_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_pn *, target, msg->msg_name); struct sk_buff *skb; int err; if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL| MSG_CMSG_COMPAT)) return -EOPNOTSUPP; if (target == NULL) return -EDESTADDRREQ; if (msg->msg_namelen < sizeof(struct sockaddr_pn)) return -EINVAL; if (target->spn_family != AF_PHONET) return -EAFNOSUPPORT; skb = sock_alloc_send_skb(sk, MAX_PHONET_HEADER + len, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) return err; skb_reserve(skb, MAX_PHONET_HEADER); err = memcpy_from_msg((void *)skb_put(skb, len), msg, len); if (err < 0) { kfree_skb(skb); return err; } /* * Fill in the Phonet header and * finally pass the packet forwards. */ err = pn_skb_send(sk, skb, target); /* If ok, return len. */ return (err >= 0) ? len : err; } static int pn_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_buff *skb = NULL; struct sockaddr_pn sa; int rval = -EOPNOTSUPP; int copylen; if (flags & ~(MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_NOSIGNAL| MSG_CMSG_COMPAT)) goto out_nofree; skb = skb_recv_datagram(sk, flags, &rval); if (skb == NULL) goto out_nofree; pn_skb_get_src_sockaddr(skb, &sa); copylen = skb->len; if (len < copylen) { msg->msg_flags |= MSG_TRUNC; copylen = len; } rval = skb_copy_datagram_msg(skb, 0, msg, copylen); if (rval) { rval = -EFAULT; goto out; } rval = (flags & MSG_TRUNC) ? skb->len : copylen; if (msg->msg_name != NULL) { __sockaddr_check_size(sizeof(sa)); memcpy(msg->msg_name, &sa, sizeof(sa)); *addr_len = sizeof(sa); } out: skb_free_datagram(sk, skb); out_nofree: return rval; } /* Queue an skb for a sock. */ static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); return err ? NET_RX_DROP : NET_RX_SUCCESS; } /* Module registration */ static struct proto pn_proto = { .close = pn_sock_close, .ioctl = pn_ioctl, .init = pn_init, .sendmsg = pn_sendmsg, .recvmsg = pn_recvmsg, .backlog_rcv = pn_backlog_rcv, .hash = pn_sock_hash, .unhash = pn_sock_unhash, .get_port = pn_sock_get_port, .obj_size = sizeof(struct pn_sock), .owner = THIS_MODULE, .name = "PHONET", }; static const struct phonet_protocol pn_dgram_proto = { .ops = &phonet_dgram_ops, .prot = &pn_proto, .sock_type = SOCK_DGRAM, }; int __init isi_register(void) { return phonet_proto_register(PN_PROTO_PHONET, &pn_dgram_proto); } void __exit isi_unregister(void) { phonet_proto_unregister(PN_PROTO_PHONET, &pn_dgram_proto); }
1 1 2 4 11 11 9 24 3 7 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 /* SPDX-License-Identifier: GPL-2.0-only */ /* * An interface between IEEE802.15.4 device and rest of the kernel. * * Copyright (C) 2007-2012 Siemens AG * * Written by: * Pavel Smolenskiy <pavel.smolenskiy@gmail.com> * Maxim Gorbachyov <maxim.gorbachev@siemens.com> * Maxim Osipov <maxim.osipov@siemens.com> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #ifndef IEEE802154_NETDEVICE_H #define IEEE802154_NETDEVICE_H #define IEEE802154_REQUIRED_SIZE(struct_type, member) \ (offsetof(typeof(struct_type), member) + \ sizeof(((typeof(struct_type) *)(NULL))->member)) #define IEEE802154_ADDR_OFFSET \ offsetof(typeof(struct sockaddr_ieee802154), addr) #define IEEE802154_MIN_NAMELEN (IEEE802154_ADDR_OFFSET + \ IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, addr_type)) #define IEEE802154_NAMELEN_SHORT (IEEE802154_ADDR_OFFSET + \ IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, short_addr)) #define IEEE802154_NAMELEN_LONG (IEEE802154_ADDR_OFFSET + \ IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, hwaddr)) #include <net/af_ieee802154.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/ieee802154.h> #include <net/cfg802154.h> struct ieee802154_beacon_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) u16 beacon_order:4, superframe_order:4, final_cap_slot:4, battery_life_ext:1, reserved0:1, pan_coordinator:1, assoc_permit:1; u8 gts_count:3, gts_reserved:4, gts_permit:1; u8 pend_short_addr_count:3, reserved1:1, pend_ext_addr_count:3, reserved2:1; #elif defined(__BIG_ENDIAN_BITFIELD) u16 assoc_permit:1, pan_coordinator:1, reserved0:1, battery_life_ext:1, final_cap_slot:4, superframe_order:4, beacon_order:4; u8 gts_permit:1, gts_reserved:4, gts_count:3; u8 reserved2:1, pend_ext_addr_count:3, reserved1:1, pend_short_addr_count:3; #else #error "Please fix <asm/byteorder.h>" #endif } __packed; struct ieee802154_mac_cmd_pl { u8 cmd_id; } __packed; struct ieee802154_sechdr { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 level:3, key_id_mode:2, reserved:3; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved:3, key_id_mode:2, level:3; #else #error "Please fix <asm/byteorder.h>" #endif u8 key_id; __le32 frame_counter; union { __le32 short_src; __le64 extended_src; }; }; struct ieee802154_hdr_fc { #if defined(__LITTLE_ENDIAN_BITFIELD) u16 type:3, security_enabled:1, frame_pending:1, ack_request:1, intra_pan:1, reserved:3, dest_addr_mode:2, version:2, source_addr_mode:2; #elif defined(__BIG_ENDIAN_BITFIELD) u16 reserved:1, intra_pan:1, ack_request:1, frame_pending:1, security_enabled:1, type:3, source_addr_mode:2, version:2, dest_addr_mode:2, reserved2:2; #else #error "Please fix <asm/byteorder.h>" #endif }; struct ieee802154_assoc_req_pl { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 reserved1:1, device_type:1, power_source:1, rx_on_when_idle:1, assoc_type:1, reserved2:1, security_cap:1, alloc_addr:1; #elif defined(__BIG_ENDIAN_BITFIELD) u8 alloc_addr:1, security_cap:1, reserved2:1, assoc_type:1, rx_on_when_idle:1, power_source:1, device_type:1, reserved1:1; #else #error "Please fix <asm/byteorder.h>" #endif } __packed; struct ieee802154_assoc_resp_pl { __le16 short_addr; u8 status; } __packed; enum ieee802154_frame_version { IEEE802154_2003_STD, IEEE802154_2006_STD, IEEE802154_STD, IEEE802154_RESERVED_STD, IEEE802154_MULTIPURPOSE_STD = IEEE802154_2003_STD, }; enum ieee802154_addressing_mode { IEEE802154_NO_ADDRESSING, IEEE802154_RESERVED, IEEE802154_SHORT_ADDRESSING, IEEE802154_EXTENDED_ADDRESSING, }; enum ieee802154_association_status { IEEE802154_ASSOCIATION_SUCCESSFUL = 0x00, IEEE802154_PAN_AT_CAPACITY = 0x01, IEEE802154_PAN_ACCESS_DENIED = 0x02, IEEE802154_HOPPING_SEQUENCE_OFFSET_DUP = 0x03, IEEE802154_FAST_ASSOCIATION_SUCCESSFUL = 0x80, }; enum ieee802154_disassociation_reason { IEEE802154_COORD_WISHES_DEVICE_TO_LEAVE = 0x1, IEEE802154_DEVICE_WISHES_TO_LEAVE = 0x2, }; struct ieee802154_hdr { struct ieee802154_hdr_fc fc; u8 seq; struct ieee802154_addr source; struct ieee802154_addr dest; struct ieee802154_sechdr sec; }; struct ieee802154_beacon_frame { struct ieee802154_hdr mhr; struct ieee802154_beacon_hdr mac_pl; }; struct ieee802154_mac_cmd_frame { struct ieee802154_hdr mhr; struct ieee802154_mac_cmd_pl mac_pl; }; struct ieee802154_beacon_req_frame { struct ieee802154_hdr mhr; struct ieee802154_mac_cmd_pl mac_pl; }; struct ieee802154_association_req_frame { struct ieee802154_hdr mhr; struct ieee802154_mac_cmd_pl mac_pl; struct ieee802154_assoc_req_pl assoc_req_pl; }; struct ieee802154_association_resp_frame { struct ieee802154_hdr mhr; struct ieee802154_mac_cmd_pl mac_pl; struct ieee802154_assoc_resp_pl assoc_resp_pl; }; struct ieee802154_disassociation_notif_frame { struct ieee802154_hdr mhr; struct ieee802154_mac_cmd_pl mac_pl; u8 disassoc_pl; }; /* pushes hdr onto the skb. fields of hdr->fc that can be calculated from * the contents of hdr will be, and the actual value of those bits in * hdr->fc will be ignored. this includes the INTRA_PAN bit and the frame * version, if SECEN is set. */ int ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr); /* pulls the entire 802.15.4 header off of the skb, including the security * header, and performs pan id decompression */ int ieee802154_hdr_pull(struct sk_buff *skb, struct ieee802154_hdr *hdr); /* parses the frame control, sequence number of address fields in a given skb * and stores them into hdr, performing pan id decompression and length checks * to be suitable for use in header_ops.parse */ int ieee802154_hdr_peek_addrs(const struct sk_buff *skb, struct ieee802154_hdr *hdr); /* parses the full 802.15.4 header a given skb and stores them into hdr, * performing pan id decompression and length checks to be suitable for use in * header_ops.parse */ int ieee802154_hdr_peek(const struct sk_buff *skb, struct ieee802154_hdr *hdr); /* pushes/pulls various frame types into/from an skb */ int ieee802154_beacon_push(struct sk_buff *skb, struct ieee802154_beacon_frame *beacon); int ieee802154_mac_cmd_push(struct sk_buff *skb, void *frame, const void *pl, unsigned int pl_len); int ieee802154_mac_cmd_pl_pull(struct sk_buff *skb, struct ieee802154_mac_cmd_pl *mac_pl); int ieee802154_max_payload(const struct ieee802154_hdr *hdr); static inline int ieee802154_sechdr_authtag_len(const struct ieee802154_sechdr *sec) { switch (sec->level) { case IEEE802154_SCF_SECLEVEL_MIC32: case IEEE802154_SCF_SECLEVEL_ENC_MIC32: return 4; case IEEE802154_SCF_SECLEVEL_MIC64: case IEEE802154_SCF_SECLEVEL_ENC_MIC64: return 8; case IEEE802154_SCF_SECLEVEL_MIC128: case IEEE802154_SCF_SECLEVEL_ENC_MIC128: return 16; case IEEE802154_SCF_SECLEVEL_NONE: case IEEE802154_SCF_SECLEVEL_ENC: default: return 0; } } static inline int ieee802154_hdr_length(struct sk_buff *skb) { struct ieee802154_hdr hdr; int len = ieee802154_hdr_pull(skb, &hdr); if (len > 0) skb_push(skb, len); return len; } static inline bool ieee802154_addr_equal(const struct ieee802154_addr *a1, const struct ieee802154_addr *a2) { if (a1->pan_id != a2->pan_id || a1->mode != a2->mode) return false; if ((a1->mode == IEEE802154_ADDR_LONG && a1->extended_addr != a2->extended_addr) || (a1->mode == IEEE802154_ADDR_SHORT && a1->short_addr != a2->short_addr)) return false; return true; } static inline __le64 ieee802154_devaddr_from_raw(const void *raw) { u64 temp; memcpy(&temp, raw, IEEE802154_ADDR_LEN); return (__force __le64)swab64(temp); } static inline void ieee802154_devaddr_to_raw(void *raw, __le64 addr) { u64 temp = swab64((__force u64)addr); memcpy(raw, &temp, IEEE802154_ADDR_LEN); } static inline int ieee802154_sockaddr_check_size(struct sockaddr_ieee802154 *daddr, int len) { struct ieee802154_addr_sa *sa; int ret = 0; sa = &daddr->addr; if (len < IEEE802154_MIN_NAMELEN) return -EINVAL; switch (sa->addr_type) { case IEEE802154_ADDR_NONE: break; case IEEE802154_ADDR_SHORT: if (len < IEEE802154_NAMELEN_SHORT) ret = -EINVAL; break; case IEEE802154_ADDR_LONG: if (len < IEEE802154_NAMELEN_LONG) ret = -EINVAL; break; default: ret = -EINVAL; break; } return ret; } static inline void ieee802154_addr_from_sa(struct ieee802154_addr *a, const struct ieee802154_addr_sa *sa) { a->mode = sa->addr_type; a->pan_id = cpu_to_le16(sa->pan_id); switch (a->mode) { case IEEE802154_ADDR_SHORT: a->short_addr = cpu_to_le16(sa->short_addr); break; case IEEE802154_ADDR_LONG: a->extended_addr = ieee802154_devaddr_from_raw(sa->hwaddr); break; } } static inline void ieee802154_addr_to_sa(struct ieee802154_addr_sa *sa, const struct ieee802154_addr *a) { sa->addr_type = a->mode; sa->pan_id = le16_to_cpu(a->pan_id); switch (a->mode) { case IEEE802154_ADDR_SHORT: sa->short_addr = le16_to_cpu(a->short_addr); break; case IEEE802154_ADDR_LONG: ieee802154_devaddr_to_raw(sa->hwaddr, a->extended_addr); break; } } /* * A control block of skb passed between the ARPHRD_IEEE802154 device * and other stack parts. */ struct ieee802154_mac_cb { u8 lqi; u8 type; bool ackreq; bool secen; bool secen_override; u8 seclevel; bool seclevel_override; struct ieee802154_addr source; struct ieee802154_addr dest; }; static inline struct ieee802154_mac_cb *mac_cb(struct sk_buff *skb) { return (struct ieee802154_mac_cb *)skb->cb; } static inline struct ieee802154_mac_cb *mac_cb_init(struct sk_buff *skb) { BUILD_BUG_ON(sizeof(struct ieee802154_mac_cb) > sizeof(skb->cb)); memset(skb->cb, 0, sizeof(struct ieee802154_mac_cb)); return mac_cb(skb); } enum { IEEE802154_LLSEC_DEVKEY_IGNORE, IEEE802154_LLSEC_DEVKEY_RESTRICT, IEEE802154_LLSEC_DEVKEY_RECORD, __IEEE802154_LLSEC_DEVKEY_MAX, }; #define IEEE802154_MAC_SCAN_ED 0 #define IEEE802154_MAC_SCAN_ACTIVE 1 #define IEEE802154_MAC_SCAN_PASSIVE 2 #define IEEE802154_MAC_SCAN_ORPHAN 3 struct ieee802154_mac_params { s8 transmit_power; u8 min_be; u8 max_be; u8 csma_retries; s8 frame_retries; bool lbt; struct wpan_phy_cca cca; s32 cca_ed_level; }; struct wpan_phy; enum { IEEE802154_LLSEC_PARAM_ENABLED = BIT(0), IEEE802154_LLSEC_PARAM_FRAME_COUNTER = BIT(1), IEEE802154_LLSEC_PARAM_OUT_LEVEL = BIT(2), IEEE802154_LLSEC_PARAM_OUT_KEY = BIT(3), IEEE802154_LLSEC_PARAM_KEY_SOURCE = BIT(4), IEEE802154_LLSEC_PARAM_PAN_ID = BIT(5), IEEE802154_LLSEC_PARAM_HWADDR = BIT(6), IEEE802154_LLSEC_PARAM_COORD_HWADDR = BIT(7), IEEE802154_LLSEC_PARAM_COORD_SHORTADDR = BIT(8), }; struct ieee802154_llsec_ops { int (*get_params)(struct net_device *dev, struct ieee802154_llsec_params *params); int (*set_params)(struct net_device *dev, const struct ieee802154_llsec_params *params, int changed); int (*add_key)(struct net_device *dev, const struct ieee802154_llsec_key_id *id, const struct ieee802154_llsec_key *key); int (*del_key)(struct net_device *dev, const struct ieee802154_llsec_key_id *id); int (*add_dev)(struct net_device *dev, const struct ieee802154_llsec_device *llsec_dev); int (*del_dev)(struct net_device *dev, __le64 dev_addr); int (*add_devkey)(struct net_device *dev, __le64 device_addr, const struct ieee802154_llsec_device_key *key); int (*del_devkey)(struct net_device *dev, __le64 device_addr, const struct ieee802154_llsec_device_key *key); int (*add_seclevel)(struct net_device *dev, const struct ieee802154_llsec_seclevel *sl); int (*del_seclevel)(struct net_device *dev, const struct ieee802154_llsec_seclevel *sl); void (*lock_table)(struct net_device *dev); void (*get_table)(struct net_device *dev, struct ieee802154_llsec_table **t); void (*unlock_table)(struct net_device *dev); }; /* * This should be located at net_device->ml_priv * * get_phy should increment the reference counting on returned phy. * Use wpan_wpy_put to put that reference. */ struct ieee802154_mlme_ops { /* The following fields are optional (can be NULL). */ int (*assoc_req)(struct net_device *dev, struct ieee802154_addr *addr, u8 channel, u8 page, u8 cap); int (*assoc_resp)(struct net_device *dev, struct ieee802154_addr *addr, __le16 short_addr, u8 status); int (*disassoc_req)(struct net_device *dev, struct ieee802154_addr *addr, u8 reason); int (*start_req)(struct net_device *dev, struct ieee802154_addr *addr, u8 channel, u8 page, u8 bcn_ord, u8 sf_ord, u8 pan_coord, u8 blx, u8 coord_realign); int (*scan_req)(struct net_device *dev, u8 type, u32 channels, u8 page, u8 duration); int (*set_mac_params)(struct net_device *dev, const struct ieee802154_mac_params *params); void (*get_mac_params)(struct net_device *dev, struct ieee802154_mac_params *params); const struct ieee802154_llsec_ops *llsec; }; static inline struct ieee802154_mlme_ops * ieee802154_mlme_ops(const struct net_device *dev) { return dev->ml_priv; } #endif
11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM oom #if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_OOM_H #include <linux/tracepoint.h> #include <trace/events/mmflags.h> #define PG_COUNT_TO_KB(x) ((x) << (PAGE_SHIFT - 10)) TRACE_EVENT(oom_score_adj_update, TP_PROTO(struct task_struct *task), TP_ARGS(task), TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN ) __field( short, oom_score_adj) ), TP_fast_assign( __entry->pid = task->pid; memcpy(__entry->comm, task->comm, TASK_COMM_LEN); __entry->oom_score_adj = task->signal->oom_score_adj; ), TP_printk("pid=%d comm=%s oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->oom_score_adj) ); TRACE_EVENT(reclaim_retry_zone, TP_PROTO(struct zoneref *zoneref, int order, unsigned long reclaimable, unsigned long available, unsigned long min_wmark, int no_progress_loops, bool wmark_check), TP_ARGS(zoneref, order, reclaimable, available, min_wmark, no_progress_loops, wmark_check), TP_STRUCT__entry( __field( int, node) __field( int, zone_idx) __field( int, order) __field( unsigned long, reclaimable) __field( unsigned long, available) __field( unsigned long, min_wmark) __field( int, no_progress_loops) __field( bool, wmark_check) ), TP_fast_assign( __entry->node = zonelist_node_idx(zoneref); __entry->zone_idx = zonelist_zone_idx(zoneref); __entry->order = order; __entry->reclaimable = reclaimable; __entry->available = available; __entry->min_wmark = min_wmark; __entry->no_progress_loops = no_progress_loops; __entry->wmark_check = wmark_check; ), TP_printk("node=%d zone=%-8s order=%d reclaimable=%lu available=%lu min_wmark=%lu no_progress_loops=%d wmark_check=%d", __entry->node, __print_symbolic(__entry->zone_idx, ZONE_TYPE), __entry->order, __entry->reclaimable, __entry->available, __entry->min_wmark, __entry->no_progress_loops, __entry->wmark_check) ); TRACE_EVENT(mark_victim, TP_PROTO(struct task_struct *task, uid_t uid), TP_ARGS(task, uid), TP_STRUCT__entry( __field(int, pid) __string(comm, task->comm) __field(unsigned long, total_vm) __field(unsigned long, anon_rss) __field(unsigned long, file_rss) __field(unsigned long, shmem_rss) __field(uid_t, uid) __field(unsigned long, pgtables) __field(short, oom_score_adj) ), TP_fast_assign( __entry->pid = task->pid; __assign_str(comm); __entry->total_vm = PG_COUNT_TO_KB(task->mm->total_vm); __entry->anon_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_ANONPAGES)); __entry->file_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_FILEPAGES)); __entry->shmem_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_SHMEMPAGES)); __entry->uid = uid; __entry->pgtables = mm_pgtables_bytes(task->mm) >> 10; __entry->oom_score_adj = task->signal->oom_score_adj; ), TP_printk("pid=%d comm=%s total-vm=%lukB anon-rss=%lukB file-rss:%lukB shmem-rss:%lukB uid=%u pgtables=%lukB oom_score_adj=%hd", __entry->pid, __get_str(comm), __entry->total_vm, __entry->anon_rss, __entry->file_rss, __entry->shmem_rss, __entry->uid, __entry->pgtables, __entry->oom_score_adj ) ); TRACE_EVENT(wake_reaper, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(start_task_reaping, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(finish_task_reaping, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(skip_task_reaping, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); #ifdef CONFIG_COMPACTION TRACE_EVENT(compact_retry, TP_PROTO(int order, enum compact_priority priority, enum compact_result result, int retries, int max_retries, bool ret), TP_ARGS(order, priority, result, retries, max_retries, ret), TP_STRUCT__entry( __field( int, order) __field( int, priority) __field( int, result) __field( int, retries) __field( int, max_retries) __field( bool, ret) ), TP_fast_assign( __entry->order = order; __entry->priority = priority; __entry->result = compact_result_to_feedback(result); __entry->retries = retries; __entry->max_retries = max_retries; __entry->ret = ret; ), TP_printk("order=%d priority=%s compaction_result=%s retries=%d max_retries=%d should_retry=%d", __entry->order, __print_symbolic(__entry->priority, COMPACTION_PRIORITY), __print_symbolic(__entry->result, COMPACTION_FEEDBACK), __entry->retries, __entry->max_retries, __entry->ret) ); #endif /* CONFIG_COMPACTION */ #endif /* This part must be outside protection */ #include <trace/define_trace.h>
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /** * file phonet.h * * Phonet sockets kernel interface * * Copyright (C) 2008 Nokia Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * version 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA */ #ifndef _UAPILINUX_PHONET_H #define _UAPILINUX_PHONET_H #include <linux/types.h> #include <linux/socket.h> /* Automatic protocol selection */ #define PN_PROTO_TRANSPORT 0 /* Phonet datagram socket */ #define PN_PROTO_PHONET 1 /* Phonet pipe */ #define PN_PROTO_PIPE 2 #define PHONET_NPROTO 3 /* Socket options for SOL_PNPIPE level */ #define PNPIPE_ENCAP 1 #define PNPIPE_IFINDEX 2 #define PNPIPE_HANDLE 3 #define PNPIPE_INITSTATE 4 #define PNADDR_ANY 0 #define PNADDR_BROADCAST 0xFC #define PNPORT_RESOURCE_ROUTING 0 /* Values for PNPIPE_ENCAP option */ #define PNPIPE_ENCAP_NONE 0 #define PNPIPE_ENCAP_IP 1 /* ioctls */ #define SIOCPNGETOBJECT (SIOCPROTOPRIVATE + 0) #define SIOCPNENABLEPIPE (SIOCPROTOPRIVATE + 13) #define SIOCPNADDRESOURCE (SIOCPROTOPRIVATE + 14) #define SIOCPNDELRESOURCE (SIOCPROTOPRIVATE + 15) /* Phonet protocol header */ struct phonethdr { __u8 pn_rdev; __u8 pn_sdev; __u8 pn_res; __be16 pn_length; __u8 pn_robj; __u8 pn_sobj; } __attribute__((packed)); /* Common Phonet payload header */ struct phonetmsg { __u8 pn_trans_id; /* transaction ID */ __u8 pn_msg_id; /* message type */ union { struct { __u8 pn_submsg_id; /* message subtype */ __u8 pn_data[5]; } base; struct { __u16 pn_e_res_id; /* extended resource ID */ __u8 pn_e_submsg_id; /* message subtype */ __u8 pn_e_data[3]; } ext; } pn_msg_u; }; #define PN_COMMON_MESSAGE 0xF0 #define PN_COMMGR 0x10 #define PN_PREFIX 0xE0 /* resource for extended messages */ #define pn_submsg_id pn_msg_u.base.pn_submsg_id #define pn_e_submsg_id pn_msg_u.ext.pn_e_submsg_id #define pn_e_res_id pn_msg_u.ext.pn_e_res_id #define pn_data pn_msg_u.base.pn_data #define pn_e_data pn_msg_u.ext.pn_e_data /* data for unreachable errors */ #define PN_COMM_SERVICE_NOT_IDENTIFIED_RESP 0x01 #define PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP 0x14 #define pn_orig_msg_id pn_data[0] #define pn_status pn_data[1] #define pn_e_orig_msg_id pn_e_data[0] #define pn_e_status pn_e_data[1] /* Phonet socket address structure */ struct sockaddr_pn { __kernel_sa_family_t spn_family; __u8 spn_obj; __u8 spn_dev; __u8 spn_resource; __u8 spn_zero[sizeof(struct sockaddr) - sizeof(__kernel_sa_family_t) - 3]; } __attribute__((packed)); /* Well known address */ #define PN_DEV_PC 0x10 static inline __u16 pn_object(__u8 addr, __u16 port) { return (addr << 8) | (port & 0x3ff); } static inline __u8 pn_obj(__u16 handle) { return handle & 0xff; } static inline __u8 pn_dev(__u16 handle) { return handle >> 8; } static inline __u16 pn_port(__u16 handle) { return handle & 0x3ff; } static inline __u8 pn_addr(__u16 handle) { return (handle >> 8) & 0xfc; } static inline void pn_sockaddr_set_addr(struct sockaddr_pn *spn, __u8 addr) { spn->spn_dev &= 0x03; spn->spn_dev |= addr & 0xfc; } static inline void pn_sockaddr_set_port(struct sockaddr_pn *spn, __u16 port) { spn->spn_dev &= 0xfc; spn->spn_dev |= (port >> 8) & 0x03; spn->spn_obj = port & 0xff; } static inline void pn_sockaddr_set_object(struct sockaddr_pn *spn, __u16 handle) { spn->spn_dev = pn_dev(handle); spn->spn_obj = pn_obj(handle); } static inline void pn_sockaddr_set_resource(struct sockaddr_pn *spn, __u8 resource) { spn->spn_resource = resource; } static inline __u8 pn_sockaddr_get_addr(const struct sockaddr_pn *spn) { return spn->spn_dev & 0xfc; } static inline __u16 pn_sockaddr_get_port(const struct sockaddr_pn *spn) { return ((spn->spn_dev & 0x03) << 8) | spn->spn_obj; } static inline __u16 pn_sockaddr_get_object(const struct sockaddr_pn *spn) { return pn_object(spn->spn_dev, spn->spn_obj); } static inline __u8 pn_sockaddr_get_resource(const struct sockaddr_pn *spn) { return spn->spn_resource; } /* Phonet device ioctl requests */ #endif /* _UAPILINUX_PHONET_H */
7 8 8 44 1 1 54 2 52 23 52 44 23 44 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/netfilter/xt_IDLETIMER.c * * Netfilter module to trigger a timer when packet matches. * After timer expires a kevent will be sent. * * Copyright (C) 2004, 2010 Nokia Corporation * Written by Timo Teras <ext-timo.teras@nokia.com> * * Converted to x_tables and reworked for upstream inclusion * by Luciano Coelho <luciano.coelho@nokia.com> * * Contact: Luciano Coelho <luciano.coelho@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/timer.h> #include <linux/alarmtimer.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_IDLETIMER.h> #include <linux/kdev_t.h> #include <linux/kobject.h> #include <linux/workqueue.h> #include <linux/sysfs.h> struct idletimer_tg { struct list_head entry; struct alarm alarm; struct timer_list timer; struct work_struct work; struct kobject *kobj; struct device_attribute attr; unsigned int refcnt; u8 timer_type; }; static LIST_HEAD(idletimer_tg_list); static DEFINE_MUTEX(list_mutex); static struct kobject *idletimer_tg_kobj; static struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) { struct idletimer_tg *entry; list_for_each_entry(entry, &idletimer_tg_list, entry) { if (!strcmp(label, entry->attr.attr.name)) return entry; } return NULL; } static ssize_t idletimer_tg_show(struct device *dev, struct device_attribute *attr, char *buf) { struct idletimer_tg *timer; unsigned long expires = 0; struct timespec64 ktimespec = {}; long time_diff = 0; mutex_lock(&list_mutex); timer = __idletimer_tg_find_by_label(attr->attr.name); if (timer) { if (timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t expires_alarm = alarm_expires_remaining(&timer->alarm); ktimespec = ktime_to_timespec64(expires_alarm); time_diff = ktimespec.tv_sec; } else { expires = timer->timer.expires; time_diff = jiffies_to_msecs(expires - jiffies) / 1000; } } mutex_unlock(&list_mutex); if (time_after(expires, jiffies) || ktimespec.tv_sec > 0) return sysfs_emit(buf, "%ld\n", time_diff); return sysfs_emit(buf, "0\n"); } static void idletimer_tg_work(struct work_struct *work) { struct idletimer_tg *timer = container_of(work, struct idletimer_tg, work); sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name); } static void idletimer_tg_expired(struct timer_list *t) { struct idletimer_tg *timer = timer_container_of(timer, t, timer); pr_debug("timer %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now) { struct idletimer_tg *timer = alarm->data; pr_debug("alarm %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } static int idletimer_check_sysfs_name(const char *name, unsigned int size) { int ret; ret = xt_check_proc_name(name, size); if (ret < 0) return ret; if (!strcmp(name, "power") || !strcmp(name, "subsystem") || !strcmp(name, "uevent")) return -EINVAL; return 0; } static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; } ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); if (ret < 0) goto out_free_timer; sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; goto out_free_timer; } info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { pr_debug("couldn't add file to sysfs"); goto out_free_attr; } list_add(&info->timer->entry, &idletimer_tg_list); timer_setup(&info->timer->timer, idletimer_tg_expired, 0); info->timer->refcnt = 1; INIT_WORK(&info->timer->work, idletimer_tg_work); mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); return 0; out_free_attr: kfree(info->timer->attr.attr.name); out_free_timer: kfree(info->timer); out: return ret; } static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) { int ret; info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; } ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); if (ret < 0) goto out_free_timer; sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; goto out_free_timer; } info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { pr_debug("couldn't add file to sysfs"); goto out_free_attr; } /* notify userspace */ kobject_uevent(idletimer_tg_kobj,KOBJ_ADD); list_add(&info->timer->entry, &idletimer_tg_list); pr_debug("timer type value is %u", info->timer_type); info->timer->timer_type = info->timer_type; info->timer->refcnt = 1; INIT_WORK(&info->timer->work, idletimer_tg_work); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout; alarm_init(&info->timer->alarm, ALARM_BOOTTIME, idletimer_tg_alarmproc); info->timer->alarm.data = info->timer; tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); } return 0; out_free_attr: kfree(info->timer->attr.attr.name); out_free_timer: kfree(info->timer); out: return ret; } /* * The actual xt_tables plugin. */ static unsigned int idletimer_tg_target(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info *info = par->targinfo; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); return XT_CONTINUE; } /* * The actual xt_tables plugin. */ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); } return XT_CONTINUE; } static int idletimer_tg_helper(struct idletimer_tg_info *info) { if (info->timeout == 0) { pr_debug("timeout value is zero\n"); return -EINVAL; } if (info->timeout >= INT_MAX / 1000) { pr_debug("timeout value is too big\n"); return -EINVAL; } if (info->label[0] == '\0' || strnlen(info->label, MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) { pr_debug("label is empty or not nul-terminated\n"); return -EINVAL; } return 0; } static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) { struct idletimer_tg_info *info = par->targinfo; int ret; pr_debug("checkentry targinfo%s\n", info->label); ret = idletimer_tg_helper(info); if(ret < 0) { pr_debug("checkentry helper return invalid\n"); return -EINVAL; } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { info->timer->refcnt++; mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { ret = idletimer_tg_create(info); if (ret < 0) { pr_debug("failed to create timer\n"); mutex_unlock(&list_mutex); return ret; } } mutex_unlock(&list_mutex); return 0; } static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) { struct idletimer_tg_info_v1 *info = par->targinfo; int ret; pr_debug("checkentry targinfo%s\n", info->label); if (info->send_nl_msg) return -EOPNOTSUPP; ret = idletimer_tg_helper((struct idletimer_tg_info *)info); if(ret < 0) { pr_debug("checkentry helper return invalid\n"); return -EINVAL; } if (info->timer_type > XT_IDLETIMER_ALARM) { pr_debug("invalid value for timer type\n"); return -EINVAL; } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { if (info->timer->timer_type != info->timer_type) { pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n"); mutex_unlock(&list_mutex); return -EINVAL; } info->timer->refcnt++; if (info->timer_type & XT_IDLETIMER_ALARM) { /* calculate remaining expiry time */ ktime_t tout = alarm_expires_remaining(&info->timer->alarm); struct timespec64 ktimespec = ktime_to_timespec64(tout); if (ktimespec.tv_sec > 0) { pr_debug("time_expiry_remaining %lld\n", ktimespec.tv_sec); alarm_start_relative(&info->timer->alarm, tout); } } else { mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { ret = idletimer_tg_create_v1(info); if (ret < 0) { pr_debug("failed to create timer\n"); mutex_unlock(&list_mutex); return ret; } } mutex_unlock(&list_mutex); return 0; } static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info *info = par->targinfo; pr_debug("destroy targinfo %s\n", info->label); mutex_lock(&list_mutex); if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); mutex_unlock(&list_mutex); return; } pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); mutex_unlock(&list_mutex); timer_shutdown_sync(&info->timer->timer); cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); } static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; pr_debug("destroy targinfo %s\n", info->label); mutex_lock(&list_mutex); if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); mutex_unlock(&list_mutex); return; } pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); mutex_unlock(&list_mutex); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { alarm_cancel(&info->timer->alarm); } else { timer_shutdown_sync(&info->timer->timer); } cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); } static struct xt_target idletimer_tg[] __read_mostly = { { .name = "IDLETIMER", .family = NFPROTO_IPV4, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, }, { .name = "IDLETIMER", .family = NFPROTO_IPV4, .revision = 1, .target = idletimer_tg_target_v1, .targetsize = sizeof(struct idletimer_tg_info_v1), .usersize = offsetof(struct idletimer_tg_info_v1, timer), .checkentry = idletimer_tg_checkentry_v1, .destroy = idletimer_tg_destroy_v1, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "IDLETIMER", .family = NFPROTO_IPV6, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, }, { .name = "IDLETIMER", .family = NFPROTO_IPV6, .revision = 1, .target = idletimer_tg_target_v1, .targetsize = sizeof(struct idletimer_tg_info_v1), .usersize = offsetof(struct idletimer_tg_info_v1, timer), .checkentry = idletimer_tg_checkentry_v1, .destroy = idletimer_tg_destroy_v1, .me = THIS_MODULE, }, #endif }; static struct class *idletimer_tg_class; static struct device *idletimer_tg_device; static int __init idletimer_tg_init(void) { int err; idletimer_tg_class = class_create("xt_idletimer"); err = PTR_ERR(idletimer_tg_class); if (IS_ERR(idletimer_tg_class)) { pr_debug("couldn't register device class\n"); goto out; } idletimer_tg_device = device_create(idletimer_tg_class, NULL, MKDEV(0, 0), NULL, "timers"); err = PTR_ERR(idletimer_tg_device); if (IS_ERR(idletimer_tg_device)) { pr_debug("couldn't register system device\n"); goto out_class; } idletimer_tg_kobj = &idletimer_tg_device->kobj; err = xt_register_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); if (err < 0) { pr_debug("couldn't register xt target\n"); goto out_dev; } return 0; out_dev: device_destroy(idletimer_tg_class, MKDEV(0, 0)); out_class: class_destroy(idletimer_tg_class); out: return err; } static void __exit idletimer_tg_exit(void) { xt_unregister_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); device_destroy(idletimer_tg_class, MKDEV(0, 0)); class_destroy(idletimer_tg_class); } module_init(idletimer_tg_init); module_exit(idletimer_tg_exit); MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>"); MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>"); MODULE_DESCRIPTION("Xtables: idle time monitor"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("ipt_IDLETIMER"); MODULE_ALIAS("ip6t_IDLETIMER");
15 16 3 6 1 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 /* BlueZ - Bluetooth protocol stack for Linux Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved. Copyright 2023-2024 NXP Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth HCI connection handling. */ #include <linux/export.h> #include <linux/debugfs.h> #include <linux/errqueue.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/bluetooth/iso.h> #include <net/bluetooth/mgmt.h> #include "smp.h" #include "eir.h" struct sco_param { u16 pkt_type; u16 max_latency; u8 retrans_effort; }; struct conn_handle_t { struct hci_conn *conn; __u16 handle; }; static const struct sco_param esco_param_cvsd[] = { { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a, 0x01 }, /* S3 */ { EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007, 0x01 }, /* S2 */ { EDR_ESCO_MASK | ESCO_EV3, 0x0007, 0x01 }, /* S1 */ { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0x01 }, /* D1 */ { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0x01 }, /* D0 */ }; static const struct sco_param sco_param_cvsd[] = { { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0xff }, /* D1 */ { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0xff }, /* D0 */ }; static const struct sco_param esco_param_msbc[] = { { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d, 0x02 }, /* T2 */ { EDR_ESCO_MASK | ESCO_EV3, 0x0008, 0x02 }, /* T1 */ }; /* This function requires the caller holds hdev->lock */ void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) { struct hci_conn_params *params; struct hci_dev *hdev = conn->hdev; struct smp_irk *irk; bdaddr_t *bdaddr; u8 bdaddr_type; bdaddr = &conn->dst; bdaddr_type = conn->dst_type; /* Check if we need to convert to identity address */ irk = hci_get_irk(hdev, bdaddr, bdaddr_type); if (irk) { bdaddr = &irk->bdaddr; bdaddr_type = irk->addr_type; } params = hci_pend_le_action_lookup(&hdev->pend_le_conns, bdaddr, bdaddr_type); if (!params) return; if (params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); params->conn = NULL; } if (!params->explicit_connect) return; /* If the status indicates successful cancellation of * the attempt (i.e. Unknown Connection Id) there's no point of * notifying failure since we'll go back to keep trying to * connect. The only exception is explicit connect requests * where a timeout + cancel does indicate an actual failure. */ if (status && status != HCI_ERROR_UNKNOWN_CONN_ID) mgmt_connect_failed(hdev, conn, status); /* The connection attempt was doing scan for new RPA, and is * in scan phase. If params are not associated with any other * autoconnect action, remove them completely. If they are, just unmark * them as waiting for connection, by clearing explicit_connect field. */ params->explicit_connect = false; hci_pend_le_list_del_init(params); switch (params->auto_connect) { case HCI_AUTO_CONN_EXPLICIT: hci_conn_params_del(hdev, bdaddr, bdaddr_type); /* return instead of break to avoid duplicate scan update */ return; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: hci_pend_le_list_add(params, &hdev->pend_le_conns); break; case HCI_AUTO_CONN_REPORT: hci_pend_le_list_add(params, &hdev->pend_le_reports); break; default: break; } hci_update_passive_scan(hdev); } static void hci_conn_cleanup(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags)) hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type); if (test_and_clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags)) hci_remove_link_key(hdev, &conn->dst); hci_chan_list_flush(conn); if (HCI_CONN_HANDLE_UNSET(conn->handle)) ida_free(&hdev->unset_handle_ida, conn->handle); if (conn->cleanup) conn->cleanup(conn); if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_CVSD: case SCO_AIRMODE_TRANSP: if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_DISABLE_SCO); break; } } else { if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); } debugfs_remove_recursive(conn->debugfs); hci_conn_del_sysfs(conn); hci_dev_put(hdev); } int hci_disconnect(struct hci_conn *conn, __u8 reason) { BT_DBG("hcon %p", conn); /* When we are central of an established connection and it enters * the disconnect timeout, then go ahead and try to read the * current clock offset. Processing of the result is done * within the event handling and hci_clock_offset_evt function. */ if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER && (conn->state == BT_CONNECTED || conn->state == BT_CONFIG)) { struct hci_dev *hdev = conn->hdev; struct hci_cp_read_clock_offset clkoff_cp; clkoff_cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(hdev, HCI_OP_READ_CLOCK_OFFSET, sizeof(clkoff_cp), &clkoff_cp); } return hci_abort_conn(conn, reason); } static void hci_add_sco(struct hci_conn *conn, __u16 handle) { struct hci_dev *hdev = conn->hdev; struct hci_cp_add_sco cp; BT_DBG("hcon %p", conn); conn->state = BT_CONNECT; conn->out = true; conn->attempt++; cp.handle = cpu_to_le16(handle); cp.pkt_type = cpu_to_le16(conn->pkt_type); hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp); } static bool find_next_esco_param(struct hci_conn *conn, const struct sco_param *esco_param, int size) { if (!conn->parent) return false; for (; conn->attempt <= size; conn->attempt++) { if (lmp_esco_2m_capable(conn->parent) || (esco_param[conn->attempt - 1].pkt_type & ESCO_2EV3)) break; BT_DBG("hcon %p skipped attempt %d, eSCO 2M not supported", conn, conn->attempt); } return conn->attempt <= size; } static int configure_datapath_sync(struct hci_dev *hdev, struct bt_codec *codec) { int err; __u8 vnd_len, *vnd_data = NULL; struct hci_op_configure_data_path *cmd = NULL; /* Do not take below 2 checks as error since the 1st means user do not * want to use HFP offload mode and the 2nd means the vendor controller * do not need to send below HCI command for offload mode. */ if (!codec->data_path || !hdev->get_codec_config_data) return 0; err = hdev->get_codec_config_data(hdev, ESCO_LINK, codec, &vnd_len, &vnd_data); if (err < 0) goto error; cmd = kzalloc(sizeof(*cmd) + vnd_len, GFP_KERNEL); if (!cmd) { err = -ENOMEM; goto error; } err = hdev->get_data_path_id(hdev, &cmd->data_path_id); if (err < 0) goto error; cmd->vnd_len = vnd_len; memcpy(cmd->vnd_data, vnd_data, vnd_len); cmd->direction = 0x00; __hci_cmd_sync_status(hdev, HCI_CONFIGURE_DATA_PATH, sizeof(*cmd) + vnd_len, cmd, HCI_CMD_TIMEOUT); cmd->direction = 0x01; err = __hci_cmd_sync_status(hdev, HCI_CONFIGURE_DATA_PATH, sizeof(*cmd) + vnd_len, cmd, HCI_CMD_TIMEOUT); error: kfree(cmd); kfree(vnd_data); return err; } static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data) { struct conn_handle_t *conn_handle = data; struct hci_conn *conn = conn_handle->conn; __u16 handle = conn_handle->handle; struct hci_cp_enhanced_setup_sync_conn cp; const struct sco_param *param; kfree(conn_handle); if (!hci_conn_valid(hdev, conn)) return -ECANCELED; bt_dev_dbg(hdev, "hcon %p", conn); configure_datapath_sync(hdev, &conn->codec); conn->state = BT_CONNECT; conn->out = true; conn->attempt++; memset(&cp, 0x00, sizeof(cp)); cp.handle = cpu_to_le16(handle); cp.tx_bandwidth = cpu_to_le32(0x00001f40); cp.rx_bandwidth = cpu_to_le32(0x00001f40); switch (conn->codec.id) { case BT_CODEC_MSBC: if (!find_next_esco_param(conn, esco_param_msbc, ARRAY_SIZE(esco_param_msbc))) return -EINVAL; param = &esco_param_msbc[conn->attempt - 1]; cp.tx_coding_format.id = 0x05; cp.rx_coding_format.id = 0x05; cp.tx_codec_frame_size = __cpu_to_le16(60); cp.rx_codec_frame_size = __cpu_to_le16(60); cp.in_bandwidth = __cpu_to_le32(32000); cp.out_bandwidth = __cpu_to_le32(32000); cp.in_coding_format.id = 0x04; cp.out_coding_format.id = 0x04; cp.in_coded_data_size = __cpu_to_le16(16); cp.out_coded_data_size = __cpu_to_le16(16); cp.in_pcm_data_format = 2; cp.out_pcm_data_format = 2; cp.in_pcm_sample_payload_msb_pos = 0; cp.out_pcm_sample_payload_msb_pos = 0; cp.in_data_path = conn->codec.data_path; cp.out_data_path = conn->codec.data_path; cp.in_transport_unit_size = 1; cp.out_transport_unit_size = 1; break; case BT_CODEC_TRANSPARENT: if (!find_next_esco_param(conn, esco_param_msbc, ARRAY_SIZE(esco_param_msbc))) return -EINVAL; param = &esco_param_msbc[conn->attempt - 1]; cp.tx_coding_format.id = 0x03; cp.rx_coding_format.id = 0x03; cp.tx_codec_frame_size = __cpu_to_le16(60); cp.rx_codec_frame_size = __cpu_to_le16(60); cp.in_bandwidth = __cpu_to_le32(0x1f40); cp.out_bandwidth = __cpu_to_le32(0x1f40); cp.in_coding_format.id = 0x03; cp.out_coding_format.id = 0x03; cp.in_coded_data_size = __cpu_to_le16(16); cp.out_coded_data_size = __cpu_to_le16(16); cp.in_pcm_data_format = 2; cp.out_pcm_data_format = 2; cp.in_pcm_sample_payload_msb_pos = 0; cp.out_pcm_sample_payload_msb_pos = 0; cp.in_data_path = conn->codec.data_path; cp.out_data_path = conn->codec.data_path; cp.in_transport_unit_size = 1; cp.out_transport_unit_size = 1; break; case BT_CODEC_CVSD: if (conn->parent && lmp_esco_capable(conn->parent)) { if (!find_next_esco_param(conn, esco_param_cvsd, ARRAY_SIZE(esco_param_cvsd))) return -EINVAL; param = &esco_param_cvsd[conn->attempt - 1]; } else { if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) return -EINVAL; param = &sco_param_cvsd[conn->attempt - 1]; } cp.tx_coding_format.id = 2; cp.rx_coding_format.id = 2; cp.tx_codec_frame_size = __cpu_to_le16(60); cp.rx_codec_frame_size = __cpu_to_le16(60); cp.in_bandwidth = __cpu_to_le32(16000); cp.out_bandwidth = __cpu_to_le32(16000); cp.in_coding_format.id = 4; cp.out_coding_format.id = 4; cp.in_coded_data_size = __cpu_to_le16(16); cp.out_coded_data_size = __cpu_to_le16(16); cp.in_pcm_data_format = 2; cp.out_pcm_data_format = 2; cp.in_pcm_sample_payload_msb_pos = 0; cp.out_pcm_sample_payload_msb_pos = 0; cp.in_data_path = conn->codec.data_path; cp.out_data_path = conn->codec.data_path; cp.in_transport_unit_size = 16; cp.out_transport_unit_size = 16; break; default: return -EINVAL; } cp.retrans_effort = param->retrans_effort; cp.pkt_type = __cpu_to_le16(param->pkt_type); cp.max_latency = __cpu_to_le16(param->max_latency); if (hci_send_cmd(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0) return -EIO; return 0; } static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle) { struct hci_dev *hdev = conn->hdev; struct hci_cp_setup_sync_conn cp; const struct sco_param *param; bt_dev_dbg(hdev, "hcon %p", conn); conn->state = BT_CONNECT; conn->out = true; conn->attempt++; cp.handle = cpu_to_le16(handle); cp.tx_bandwidth = cpu_to_le32(0x00001f40); cp.rx_bandwidth = cpu_to_le32(0x00001f40); cp.voice_setting = cpu_to_le16(conn->setting); switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: if (!find_next_esco_param(conn, esco_param_msbc, ARRAY_SIZE(esco_param_msbc))) return false; param = &esco_param_msbc[conn->attempt - 1]; break; case SCO_AIRMODE_CVSD: if (conn->parent && lmp_esco_capable(conn->parent)) { if (!find_next_esco_param(conn, esco_param_cvsd, ARRAY_SIZE(esco_param_cvsd))) return false; param = &esco_param_cvsd[conn->attempt - 1]; } else { if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) return false; param = &sco_param_cvsd[conn->attempt - 1]; } break; default: return false; } cp.retrans_effort = param->retrans_effort; cp.pkt_type = __cpu_to_le16(param->pkt_type); cp.max_latency = __cpu_to_le16(param->max_latency); if (hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0) return false; return true; } bool hci_setup_sync(struct hci_conn *conn, __u16 handle) { int result; struct conn_handle_t *conn_handle; if (enhanced_sync_conn_capable(conn->hdev)) { conn_handle = kzalloc(sizeof(*conn_handle), GFP_KERNEL); if (!conn_handle) return false; conn_handle->conn = conn; conn_handle->handle = handle; result = hci_cmd_sync_queue(conn->hdev, hci_enhanced_setup_sync, conn_handle, NULL); if (result < 0) kfree(conn_handle); return result == 0; } return hci_setup_sync_conn(conn, handle); } u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier) { struct hci_dev *hdev = conn->hdev; struct hci_conn_params *params; struct hci_cp_le_conn_update cp; hci_dev_lock(hdev); params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); if (params) { params->conn_min_interval = min; params->conn_max_interval = max; params->conn_latency = latency; params->supervision_timeout = to_multiplier; } hci_dev_unlock(hdev); memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.conn_interval_min = cpu_to_le16(min); cp.conn_interval_max = cpu_to_le16(max); cp.conn_latency = cpu_to_le16(latency); cp.supervision_timeout = cpu_to_le16(to_multiplier); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp); if (params) return 0x01; return 0x00; } void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, __u8 ltk[16], __u8 key_size) { struct hci_dev *hdev = conn->hdev; struct hci_cp_le_start_enc cp; BT_DBG("hcon %p", conn); memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.rand = rand; cp.ediv = ediv; memcpy(cp.ltk, ltk, key_size); hci_send_cmd(hdev, HCI_OP_LE_START_ENC, sizeof(cp), &cp); } /* Device _must_ be locked */ void hci_sco_setup(struct hci_conn *conn, __u8 status) { struct hci_link *link; link = list_first_entry_or_null(&conn->link_list, struct hci_link, list); if (!link || !link->conn) return; BT_DBG("hcon %p", conn); if (!status) { if (lmp_esco_capable(conn->hdev)) hci_setup_sync(link->conn, conn->handle); else hci_add_sco(link->conn, conn->handle); } else { hci_connect_cfm(link->conn, status); hci_conn_del(link->conn); } } static void hci_conn_timeout(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, disc_work.work); int refcnt = atomic_read(&conn->refcnt); BT_DBG("hcon %p state %s", conn, state_to_string(conn->state)); WARN_ON(refcnt < 0); /* FIXME: It was observed that in pairing failed scenario, refcnt * drops below 0. Probably this is because l2cap_conn_del calls * l2cap_chan_del for each channel, and inside l2cap_chan_del conn is * dropped. After that loop hci_chan_del is called which also drops * conn. For now make sure that ACL is alive if refcnt is higher then 0, * otherwise drop it. */ if (refcnt > 0) return; hci_abort_conn(conn, hci_proto_disconn_ind(conn)); } /* Enter sniff mode */ static void hci_conn_idle(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, idle_work.work); struct hci_dev *hdev = conn->hdev; BT_DBG("hcon %p mode %d", conn, conn->mode); if (!lmp_sniff_capable(hdev) || !lmp_sniff_capable(conn)) return; if (conn->mode != HCI_CM_ACTIVE || !(conn->link_policy & HCI_LP_SNIFF)) return; if (lmp_sniffsubr_capable(hdev) && lmp_sniffsubr_capable(conn)) { struct hci_cp_sniff_subrate cp; cp.handle = cpu_to_le16(conn->handle); cp.max_latency = cpu_to_le16(0); cp.min_remote_timeout = cpu_to_le16(0); cp.min_local_timeout = cpu_to_le16(0); hci_send_cmd(hdev, HCI_OP_SNIFF_SUBRATE, sizeof(cp), &cp); } if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags)) { struct hci_cp_sniff_mode cp; cp.handle = cpu_to_le16(conn->handle); cp.max_interval = cpu_to_le16(hdev->sniff_max_interval); cp.min_interval = cpu_to_le16(hdev->sniff_min_interval); cp.attempt = cpu_to_le16(4); cp.timeout = cpu_to_le16(1); hci_send_cmd(hdev, HCI_OP_SNIFF_MODE, sizeof(cp), &cp); } } static void hci_conn_auto_accept(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, auto_accept_work.work); hci_send_cmd(conn->hdev, HCI_OP_USER_CONFIRM_REPLY, sizeof(conn->dst), &conn->dst); } static void le_disable_advertising(struct hci_dev *hdev) { if (ext_adv_capable(hdev)) { struct hci_cp_le_set_ext_adv_enable cp; cp.enable = 0x00; cp.num_of_sets = 0x00; hci_send_cmd(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp), &cp); } else { u8 enable = 0x00; hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); } } static void le_conn_timeout(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, le_conn_timeout.work); struct hci_dev *hdev = conn->hdev; BT_DBG(""); /* We could end up here due to having done directed advertising, * so clean up the state if necessary. This should however only * happen with broken hardware or if low duty cycle was used * (which doesn't have a timeout of its own). */ if (conn->role == HCI_ROLE_SLAVE) { /* Disable LE Advertising */ le_disable_advertising(hdev); hci_dev_lock(hdev); hci_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT); hci_dev_unlock(hdev); return; } hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); } struct iso_list_data { union { u8 cig; u8 big; }; union { u8 cis; u8 bis; u16 sync_handle; }; int count; bool big_term; bool pa_sync_term; bool big_sync_term; }; static void bis_list(struct hci_conn *conn, void *data) { struct iso_list_data *d = data; /* Skip if not broadcast/ANY address */ if (bacmp(&conn->dst, BDADDR_ANY)) return; if (d->big != conn->iso_qos.bcast.big || d->bis == BT_ISO_QOS_BIS_UNSET || d->bis != conn->iso_qos.bcast.bis) return; d->count++; } static int terminate_big_sync(struct hci_dev *hdev, void *data) { struct iso_list_data *d = data; bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", d->big, d->bis); hci_disable_per_advertising_sync(hdev, d->bis); hci_remove_ext_adv_instance_sync(hdev, d->bis, NULL); /* Only terminate BIG if it has been created */ if (!d->big_term) return 0; return hci_le_terminate_big_sync(hdev, d->big, HCI_ERROR_LOCAL_HOST_TERM); } static void terminate_big_destroy(struct hci_dev *hdev, void *data, int err) { kfree(data); } static int hci_le_terminate_big(struct hci_dev *hdev, struct hci_conn *conn) { struct iso_list_data *d; int ret; bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", conn->iso_qos.bcast.big, conn->iso_qos.bcast.bis); d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; d->big = conn->iso_qos.bcast.big; d->bis = conn->iso_qos.bcast.bis; d->big_term = test_and_clear_bit(HCI_CONN_BIG_CREATED, &conn->flags); ret = hci_cmd_sync_queue(hdev, terminate_big_sync, d, terminate_big_destroy); if (ret) kfree(d); return ret; } static int big_terminate_sync(struct hci_dev *hdev, void *data) { struct iso_list_data *d = data; bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", d->big, d->sync_handle); if (d->big_sync_term) hci_le_big_terminate_sync(hdev, d->big); if (d->pa_sync_term) return hci_le_pa_terminate_sync(hdev, d->sync_handle); return 0; } static void find_bis(struct hci_conn *conn, void *data) { struct iso_list_data *d = data; /* Ignore if BIG doesn't match */ if (d->big != conn->iso_qos.bcast.big) return; d->count++; } static int hci_le_big_terminate(struct hci_dev *hdev, struct hci_conn *conn) { struct iso_list_data *d; int ret; bt_dev_dbg(hdev, "hcon %p big 0x%2.2x sync_handle 0x%4.4x", conn, conn->iso_qos.bcast.big, conn->sync_handle); d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; d->big = conn->iso_qos.bcast.big; d->sync_handle = conn->sync_handle; if (conn->type == PA_LINK && test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) { hci_conn_hash_list_flag(hdev, find_bis, PA_LINK, HCI_CONN_PA_SYNC, d); if (!d->count) d->pa_sync_term = true; d->count = 0; } if (test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags)) { hci_conn_hash_list_flag(hdev, find_bis, BIS_LINK, HCI_CONN_BIG_SYNC, d); if (!d->count) d->big_sync_term = true; } if (!d->pa_sync_term && !d->big_sync_term) return 0; ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d, terminate_big_destroy); if (ret) kfree(d); return ret; } /* Cleanup BIS connection * * Detects if there any BIS left connected in a BIG * broadcaster: Remove advertising instance and terminate BIG. * broadcaster receiver: Terminate BIG sync and terminate PA sync. */ static void bis_cleanup(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; struct hci_conn *bis; bt_dev_dbg(hdev, "conn %p", conn); if (conn->role == HCI_ROLE_MASTER) { if (!test_and_clear_bit(HCI_CONN_PER_ADV, &conn->flags)) return; /* Check if ISO connection is a BIS and terminate advertising * set and BIG if there are no other connections using it. */ bis = hci_conn_hash_lookup_big_state(hdev, conn->iso_qos.bcast.big, BT_CONNECTED, HCI_ROLE_MASTER); if (bis) return; bis = hci_conn_hash_lookup_big_state(hdev, conn->iso_qos.bcast.big, BT_CONNECT, HCI_ROLE_MASTER); if (bis) return; bis = hci_conn_hash_lookup_big_state(hdev, conn->iso_qos.bcast.big, BT_OPEN, HCI_ROLE_MASTER); if (bis) return; hci_le_terminate_big(hdev, conn); } else { hci_le_big_terminate(hdev, conn); } } static int remove_cig_sync(struct hci_dev *hdev, void *data) { u8 handle = PTR_UINT(data); return hci_le_remove_cig_sync(hdev, handle); } static int hci_le_remove_cig(struct hci_dev *hdev, u8 handle) { bt_dev_dbg(hdev, "handle 0x%2.2x", handle); return hci_cmd_sync_queue(hdev, remove_cig_sync, UINT_PTR(handle), NULL); } static void find_cis(struct hci_conn *conn, void *data) { struct iso_list_data *d = data; /* Ignore broadcast or if CIG don't match */ if (!bacmp(&conn->dst, BDADDR_ANY) || d->cig != conn->iso_qos.ucast.cig) return; d->count++; } /* Cleanup CIS connection: * * Detects if there any CIS left connected in a CIG and remove it. */ static void cis_cleanup(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; struct iso_list_data d; if (conn->iso_qos.ucast.cig == BT_ISO_QOS_CIG_UNSET) return; memset(&d, 0, sizeof(d)); d.cig = conn->iso_qos.ucast.cig; /* Check if ISO connection is a CIS and remove CIG if there are * no other connections using it. */ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_BOUND, &d); hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECT, &d); hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECTED, &d); if (d.count) return; hci_le_remove_cig(hdev, conn->iso_qos.ucast.cig); } static int hci_conn_hash_alloc_unset(struct hci_dev *hdev) { return ida_alloc_range(&hdev->unset_handle_ida, HCI_CONN_HANDLE_MAX + 1, U16_MAX, GFP_ATOMIC); } static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role, u16 handle) { struct hci_conn *conn; switch (type) { case ACL_LINK: if (!hdev->acl_mtu) return ERR_PTR(-ECONNREFUSED); break; case CIS_LINK: case BIS_LINK: case PA_LINK: if (!hdev->iso_mtu) return ERR_PTR(-ECONNREFUSED); break; case LE_LINK: if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU) return ERR_PTR(-ECONNREFUSED); if (!hdev->le_mtu && hdev->acl_mtu < HCI_MIN_LE_MTU) return ERR_PTR(-ECONNREFUSED); break; case SCO_LINK: case ESCO_LINK: if (!hdev->sco_pkts) /* Controller does not support SCO or eSCO over HCI */ return ERR_PTR(-ECONNREFUSED); break; default: return ERR_PTR(-ECONNREFUSED); } bt_dev_dbg(hdev, "dst %pMR handle 0x%4.4x", dst, handle); conn = kzalloc(sizeof(*conn), GFP_KERNEL); if (!conn) return ERR_PTR(-ENOMEM); bacpy(&conn->dst, dst); bacpy(&conn->src, &hdev->bdaddr); conn->handle = handle; conn->hdev = hdev; conn->type = type; conn->role = role; conn->mode = HCI_CM_ACTIVE; conn->state = BT_OPEN; conn->auth_type = HCI_AT_GENERAL_BONDING; conn->io_capability = hdev->io_capability; conn->remote_auth = 0xff; conn->key_type = 0xff; conn->rssi = HCI_RSSI_INVALID; conn->tx_power = HCI_TX_POWER_INVALID; conn->max_tx_power = HCI_TX_POWER_INVALID; conn->sync_handle = HCI_SYNC_HANDLE_INVALID; conn->sid = HCI_SID_INVALID; set_bit(HCI_CONN_POWER_SAVE, &conn->flags); conn->disc_timeout = HCI_DISCONN_TIMEOUT; /* Set Default Authenticated payload timeout to 30s */ conn->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; if (conn->role == HCI_ROLE_MASTER) conn->out = true; switch (type) { case ACL_LINK: conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK; conn->mtu = hdev->acl_mtu; break; case LE_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; break; case CIS_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); if (conn->role == HCI_ROLE_MASTER) conn->cleanup = cis_cleanup; conn->mtu = hdev->iso_mtu; break; case PA_LINK: case BIS_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); conn->cleanup = bis_cleanup; conn->mtu = hdev->iso_mtu; break; case SCO_LINK: if (lmp_esco_capable(hdev)) conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) | (hdev->esco_type & EDR_ESCO_MASK); else conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK; conn->mtu = hdev->sco_mtu; break; case ESCO_LINK: conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK; conn->mtu = hdev->sco_mtu; break; } skb_queue_head_init(&conn->data_q); skb_queue_head_init(&conn->tx_q.queue); INIT_LIST_HEAD(&conn->chan_list); INIT_LIST_HEAD(&conn->link_list); INIT_DELAYED_WORK(&conn->disc_work, hci_conn_timeout); INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept); INIT_DELAYED_WORK(&conn->idle_work, hci_conn_idle); INIT_DELAYED_WORK(&conn->le_conn_timeout, le_conn_timeout); atomic_set(&conn->refcnt, 0); hci_dev_hold(hdev); hci_conn_hash_add(hdev, conn); /* The SCO and eSCO connections will only be notified when their * setup has been completed. This is different to ACL links which * can be notified right away. */ if (conn->type != SCO_LINK && conn->type != ESCO_LINK) { if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); } hci_conn_init_sysfs(conn); return conn; } struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role) { int handle; bt_dev_dbg(hdev, "dst %pMR", dst); handle = hci_conn_hash_alloc_unset(hdev); if (unlikely(handle < 0)) return ERR_PTR(-ECONNREFUSED); return __hci_conn_add(hdev, type, dst, role, handle); } struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role, u16 handle) { if (handle > HCI_CONN_HANDLE_MAX) return ERR_PTR(-EINVAL); return __hci_conn_add(hdev, type, dst, role, handle); } static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason) { if (!reason) reason = HCI_ERROR_REMOTE_USER_TERM; /* Due to race, SCO/ISO conn might be not established yet at this point, * and nothing else will clean it up. In other cases it is done via HCI * events. */ switch (conn->type) { case SCO_LINK: case ESCO_LINK: if (HCI_CONN_HANDLE_UNSET(conn->handle)) hci_conn_failed(conn, reason); break; case CIS_LINK: case BIS_LINK: case PA_LINK: if ((conn->state != BT_CONNECTED && !test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) || test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) hci_conn_failed(conn, reason); break; } } static void hci_conn_unlink(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; bt_dev_dbg(hdev, "hcon %p", conn); if (!conn->parent) { struct hci_link *link, *t; list_for_each_entry_safe(link, t, &conn->link_list, list) { struct hci_conn *child = link->conn; hci_conn_unlink(child); /* If hdev is down it means * hci_dev_close_sync/hci_conn_hash_flush is in progress * and links don't need to be cleanup as all connections * would be cleanup. */ if (!test_bit(HCI_UP, &hdev->flags)) continue; hci_conn_cleanup_child(child, conn->abort_reason); } return; } if (!conn->link) return; list_del_rcu(&conn->link->list); synchronize_rcu(); hci_conn_drop(conn->parent); hci_conn_put(conn->parent); conn->parent = NULL; kfree(conn->link); conn->link = NULL; } void hci_conn_del(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; BT_DBG("%s hcon %p handle %d", hdev->name, conn, conn->handle); hci_conn_unlink(conn); disable_delayed_work_sync(&conn->disc_work); disable_delayed_work_sync(&conn->auto_accept_work); disable_delayed_work_sync(&conn->idle_work); /* Remove the connection from the list so unacked logic can detect when * a certain pool is not being utilized. */ hci_conn_hash_del(hdev, conn); /* Handle unacked frames: * * - In case there are no connection, or if restoring the buffers * considered in transist would overflow, restore all buffers to the * pool. * - Otherwise restore just the buffers considered in transit for the * hci_conn */ switch (conn->type) { case ACL_LINK: if (!hci_conn_num(hdev, ACL_LINK) || hdev->acl_cnt + conn->sent > hdev->acl_pkts) hdev->acl_cnt = hdev->acl_pkts; else hdev->acl_cnt += conn->sent; break; case LE_LINK: cancel_delayed_work(&conn->le_conn_timeout); if (hdev->le_pkts) { if (!hci_conn_num(hdev, LE_LINK) || hdev->le_cnt + conn->sent > hdev->le_pkts) hdev->le_cnt = hdev->le_pkts; else hdev->le_cnt += conn->sent; } else { if ((!hci_conn_num(hdev, LE_LINK) && !hci_conn_num(hdev, ACL_LINK)) || hdev->acl_cnt + conn->sent > hdev->acl_pkts) hdev->acl_cnt = hdev->acl_pkts; else hdev->acl_cnt += conn->sent; } break; case CIS_LINK: case BIS_LINK: case PA_LINK: if (!hci_iso_count(hdev) || hdev->iso_cnt + conn->sent > hdev->iso_pkts) hdev->iso_cnt = hdev->iso_pkts; else hdev->iso_cnt += conn->sent; break; } skb_queue_purge(&conn->data_q); skb_queue_purge(&conn->tx_q.queue); /* Remove the connection from the list and cleanup its remaining * state. This is a separate function since for some cases like * BT_CONNECT_SCAN we *only* want the cleanup part without the * rest of hci_conn_del. */ hci_conn_cleanup(conn); /* Dequeue callbacks using connection pointer as data */ hci_cmd_sync_dequeue(hdev, NULL, conn, NULL); } struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type) { int use_src = bacmp(src, BDADDR_ANY); struct hci_dev *hdev = NULL, *d; BT_DBG("%pMR -> %pMR", src, dst); read_lock(&hci_dev_list_lock); list_for_each_entry(d, &hci_dev_list, list) { if (!test_bit(HCI_UP, &d->flags) || hci_dev_test_flag(d, HCI_USER_CHANNEL)) continue; /* Simple routing: * No source address - find interface with bdaddr != dst * Source address - find interface with bdaddr == src */ if (use_src) { bdaddr_t id_addr; u8 id_addr_type; if (src_type == BDADDR_BREDR) { if (!lmp_bredr_capable(d)) continue; bacpy(&id_addr, &d->bdaddr); id_addr_type = BDADDR_BREDR; } else { if (!lmp_le_capable(d)) continue; hci_copy_identity_address(d, &id_addr, &id_addr_type); /* Convert from HCI to three-value type */ if (id_addr_type == ADDR_LE_DEV_PUBLIC) id_addr_type = BDADDR_LE_PUBLIC; else id_addr_type = BDADDR_LE_RANDOM; } if (!bacmp(&id_addr, src) && id_addr_type == src_type) { hdev = d; break; } } else { if (bacmp(&d->bdaddr, dst)) { hdev = d; break; } } } if (hdev) hdev = hci_dev_hold(hdev); read_unlock(&hci_dev_list_lock); return hdev; } EXPORT_SYMBOL(hci_get_route); /* This function requires the caller holds hdev->lock */ static void hci_le_conn_failed(struct hci_conn *conn, u8 status) { struct hci_dev *hdev = conn->hdev; hci_connect_le_scan_cleanup(conn, status); /* Enable advertising in case this was a failed connection * attempt as a peripheral. */ hci_enable_advertising(hdev); } /* This function requires the caller holds hdev->lock */ void hci_conn_failed(struct hci_conn *conn, u8 status) { struct hci_dev *hdev = conn->hdev; bt_dev_dbg(hdev, "status 0x%2.2x", status); switch (conn->type) { case LE_LINK: hci_le_conn_failed(conn, status); break; case ACL_LINK: mgmt_connect_failed(hdev, conn, status); break; } /* In case of BIG/PA sync failed, clear conn flags so that * the conns will be correctly cleaned up by ISO layer */ test_and_clear_bit(HCI_CONN_BIG_SYNC_FAILED, &conn->flags); test_and_clear_bit(HCI_CONN_PA_SYNC_FAILED, &conn->flags); conn->state = BT_CLOSED; hci_connect_cfm(conn, status); hci_conn_del(conn); } /* This function requires the caller holds hdev->lock */ u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle) { struct hci_dev *hdev = conn->hdev; bt_dev_dbg(hdev, "hcon %p handle 0x%4.4x", conn, handle); if (conn->handle == handle) return 0; if (handle > HCI_CONN_HANDLE_MAX) { bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", handle, HCI_CONN_HANDLE_MAX); return HCI_ERROR_INVALID_PARAMETERS; } /* If abort_reason has been sent it means the connection is being * aborted and the handle shall not be changed. */ if (conn->abort_reason) return conn->abort_reason; if (HCI_CONN_HANDLE_UNSET(conn->handle)) ida_free(&hdev->unset_handle_ida, conn->handle); conn->handle = handle; return 0; } struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, bool dst_resolved, u8 sec_level, u16 conn_timeout, u8 role, u8 phy, u8 sec_phy) { struct hci_conn *conn; struct smp_irk *irk; int err; /* Let's make sure that le is enabled.*/ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { if (lmp_le_capable(hdev)) return ERR_PTR(-ECONNREFUSED); return ERR_PTR(-EOPNOTSUPP); } /* Since the controller supports only one LE connection attempt at a * time, we return -EBUSY if there is any connection attempt running. */ if (hci_lookup_le_connect(hdev)) return ERR_PTR(-EBUSY); /* If there's already a connection object but it's not in * scanning state it means it must already be established, in * which case we can't do anything else except report a failure * to connect. */ conn = hci_conn_hash_lookup_le(hdev, dst, dst_type); if (conn && !test_bit(HCI_CONN_SCANNING, &conn->flags)) { return ERR_PTR(-EBUSY); } /* Check if the destination address has been resolved by the controller * since if it did then the identity address shall be used. */ if (!dst_resolved) { /* When given an identity address with existing identity * resolving key, the connection needs to be established * to a resolvable random address. * * Storing the resolvable random address is required here * to handle connection failures. The address will later * be resolved back into the original identity address * from the connect request. */ irk = hci_find_irk_by_addr(hdev, dst, dst_type); if (irk && bacmp(&irk->rpa, BDADDR_ANY)) { dst = &irk->rpa; dst_type = ADDR_LE_DEV_RANDOM; } } if (conn) { bacpy(&conn->dst, dst); } else { conn = hci_conn_add_unset(hdev, LE_LINK, dst, role); if (IS_ERR(conn)) return conn; hci_conn_hold(conn); conn->pending_sec_level = sec_level; } conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->conn_timeout = conn_timeout; conn->le_adv_phy = phy; conn->le_adv_sec_phy = sec_phy; err = hci_connect_le_sync(hdev, conn); if (err) { hci_conn_del(conn); return ERR_PTR(err); } return conn; } static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) { struct hci_conn *conn; conn = hci_conn_hash_lookup_le(hdev, addr, type); if (!conn) return false; if (conn->state != BT_CONNECTED) return false; return true; } /* This function requires the caller holds hdev->lock */ static int hci_explicit_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; if (is_connected(hdev, addr, addr_type)) return -EISCONN; params = hci_conn_params_lookup(hdev, addr, addr_type); if (!params) { params = hci_conn_params_add(hdev, addr, addr_type); if (!params) return -ENOMEM; /* If we created new params, mark them to be deleted in * hci_connect_le_scan_cleanup. It's different case than * existing disabled params, those will stay after cleanup. */ params->auto_connect = HCI_AUTO_CONN_EXPLICIT; } /* We're trying to connect, so make sure params are at pend_le_conns */ if (params->auto_connect == HCI_AUTO_CONN_DISABLED || params->auto_connect == HCI_AUTO_CONN_REPORT || params->auto_connect == HCI_AUTO_CONN_EXPLICIT) { hci_pend_le_list_del_init(params); hci_pend_le_list_add(params, &hdev->pend_le_conns); } params->explicit_connect = true; BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type, params->auto_connect); return 0; } static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos) { struct hci_conn *conn; u8 big; /* Allocate a BIG if not set */ if (qos->bcast.big == BT_ISO_QOS_BIG_UNSET) { for (big = 0x00; big < 0xef; big++) { conn = hci_conn_hash_lookup_big(hdev, big); if (!conn) break; } if (big == 0xef) return -EADDRNOTAVAIL; /* Update BIG */ qos->bcast.big = big; } return 0; } static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) { struct hci_conn *conn; u8 bis; /* Allocate BIS if not set */ if (qos->bcast.bis == BT_ISO_QOS_BIS_UNSET) { if (qos->bcast.big != BT_ISO_QOS_BIG_UNSET) { conn = hci_conn_hash_lookup_big(hdev, qos->bcast.big); if (conn) { /* If the BIG handle is already matched to an advertising * handle, do not allocate a new one. */ qos->bcast.bis = conn->iso_qos.bcast.bis; return 0; } } /* Find an unused adv set to advertise BIS, skip instance 0x00 * since it is reserved as general purpose set. */ for (bis = 0x01; bis < hdev->le_num_of_adv_sets; bis++) { conn = hci_conn_hash_lookup_bis(hdev, BDADDR_ANY, bis); if (!conn) break; } if (bis == hdev->le_num_of_adv_sets) return -EADDRNOTAVAIL; /* Update BIS */ qos->bcast.bis = bis; } return 0; } /* This function requires the caller holds hdev->lock */ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; int err; /* Let's make sure that le is enabled.*/ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { if (lmp_le_capable(hdev)) return ERR_PTR(-ECONNREFUSED); return ERR_PTR(-EOPNOTSUPP); } err = qos_set_big(hdev, qos); if (err) return ERR_PTR(err); err = qos_set_bis(hdev, qos); if (err) return ERR_PTR(err); /* Check if the LE Create BIG command has already been sent */ conn = hci_conn_hash_lookup_per_adv_bis(hdev, dst, qos->bcast.big, qos->bcast.big); if (conn) return ERR_PTR(-EADDRINUSE); /* Check BIS settings against other bound BISes, since all * BISes in a BIG must have the same value for all parameters */ conn = hci_conn_hash_lookup_big(hdev, qos->bcast.big); if (conn && (memcmp(qos, &conn->iso_qos, sizeof(*qos)) || base_len != conn->le_per_adv_data_len || memcmp(conn->le_per_adv_data, base, base_len))) return ERR_PTR(-EADDRINUSE); conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_MASTER); if (IS_ERR(conn)) return conn; conn->state = BT_CONNECT; conn->sid = sid; conn->conn_timeout = timeout; hci_conn_hold(conn); return conn; } /* This function requires the caller holds hdev->lock */ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, u16 conn_timeout, enum conn_reasons conn_reason) { struct hci_conn *conn; /* Let's make sure that le is enabled.*/ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { if (lmp_le_capable(hdev)) return ERR_PTR(-ECONNREFUSED); return ERR_PTR(-EOPNOTSUPP); } /* Some devices send ATT messages as soon as the physical link is * established. To be able to handle these ATT messages, the user- * space first establishes the connection and then starts the pairing * process. * * So if a hci_conn object already exists for the following connection * attempt, we simply update pending_sec_level and auth_type fields * and return the object found. */ conn = hci_conn_hash_lookup_le(hdev, dst, dst_type); if (conn) { if (conn->pending_sec_level < sec_level) conn->pending_sec_level = sec_level; goto done; } BT_DBG("requesting refresh of dst_addr"); conn = hci_conn_add_unset(hdev, LE_LINK, dst, HCI_ROLE_MASTER); if (IS_ERR(conn)) return conn; if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) { hci_conn_del(conn); return ERR_PTR(-EBUSY); } conn->state = BT_CONNECT; set_bit(HCI_CONN_SCANNING, &conn->flags); conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->pending_sec_level = sec_level; conn->conn_timeout = conn_timeout; conn->conn_reason = conn_reason; hci_update_passive_scan(hdev); done: hci_conn_hold(conn); return conn; } struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type, enum conn_reasons conn_reason, u16 timeout) { struct hci_conn *acl; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { if (lmp_bredr_capable(hdev)) return ERR_PTR(-ECONNREFUSED); return ERR_PTR(-EOPNOTSUPP); } /* Reject outgoing connection to device with same BD ADDR against * CVE-2020-26555 */ if (!bacmp(&hdev->bdaddr, dst)) { bt_dev_dbg(hdev, "Reject connection with same BD_ADDR %pMR\n", dst); return ERR_PTR(-ECONNREFUSED); } acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); if (!acl) { acl = hci_conn_add_unset(hdev, ACL_LINK, dst, HCI_ROLE_MASTER); if (IS_ERR(acl)) return acl; } hci_conn_hold(acl); acl->conn_reason = conn_reason; if (acl->state == BT_OPEN || acl->state == BT_CLOSED) { int err; acl->sec_level = BT_SECURITY_LOW; acl->pending_sec_level = sec_level; acl->auth_type = auth_type; acl->conn_timeout = timeout; err = hci_connect_acl_sync(hdev, acl); if (err) { hci_conn_del(acl); return ERR_PTR(err); } } return acl; } static struct hci_link *hci_conn_link(struct hci_conn *parent, struct hci_conn *conn) { struct hci_dev *hdev = parent->hdev; struct hci_link *link; bt_dev_dbg(hdev, "parent %p hcon %p", parent, conn); if (conn->link) return conn->link; if (conn->parent) return NULL; link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) return NULL; link->conn = hci_conn_hold(conn); conn->link = link; conn->parent = hci_conn_get(parent); /* Use list_add_tail_rcu append to the list */ list_add_tail_rcu(&link->list, &parent->link_list); return link; } struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, __u16 setting, struct bt_codec *codec, u16 timeout) { struct hci_conn *acl; struct hci_conn *sco; struct hci_link *link; acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING, CONN_REASON_SCO_CONNECT, timeout); if (IS_ERR(acl)) return acl; sco = hci_conn_hash_lookup_ba(hdev, type, dst); if (!sco) { sco = hci_conn_add_unset(hdev, type, dst, HCI_ROLE_MASTER); if (IS_ERR(sco)) { hci_conn_drop(acl); return sco; } } link = hci_conn_link(acl, sco); if (!link) { hci_conn_drop(acl); hci_conn_drop(sco); return ERR_PTR(-ENOLINK); } sco->setting = setting; sco->codec = *codec; if (acl->state == BT_CONNECTED && (sco->state == BT_OPEN || sco->state == BT_CLOSED)) { set_bit(HCI_CONN_POWER_SAVE, &acl->flags); hci_conn_enter_active_mode(acl, BT_POWER_FORCE_ACTIVE_ON); if (test_bit(HCI_CONN_MODE_CHANGE_PEND, &acl->flags)) { /* defer SCO setup until mode change completed */ set_bit(HCI_CONN_SCO_SETUP_PEND, &acl->flags); return sco; } hci_sco_setup(acl, 0x00); } return sco; } static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) { struct hci_dev *hdev = conn->hdev; struct hci_cp_le_create_big cp; struct iso_list_data data; memset(&cp, 0, sizeof(cp)); data.big = qos->bcast.big; data.bis = qos->bcast.bis; data.count = 0; /* Create a BIS for each bound connection */ hci_conn_hash_list_state(hdev, bis_list, BIS_LINK, BT_BOUND, &data); cp.handle = qos->bcast.big; cp.adv_handle = qos->bcast.bis; cp.num_bis = data.count; hci_cpu_to_le24(qos->bcast.out.interval, cp.bis.sdu_interval); cp.bis.sdu = cpu_to_le16(qos->bcast.out.sdu); cp.bis.latency = cpu_to_le16(qos->bcast.out.latency); cp.bis.rtn = qos->bcast.out.rtn; cp.bis.phy = qos->bcast.out.phy; cp.bis.packing = qos->bcast.packing; cp.bis.framing = qos->bcast.framing; cp.bis.encryption = qos->bcast.encryption; memcpy(cp.bis.bcode, qos->bcast.bcode, sizeof(cp.bis.bcode)); return hci_send_cmd(hdev, HCI_OP_LE_CREATE_BIG, sizeof(cp), &cp); } static int set_cig_params_sync(struct hci_dev *hdev, void *data) { DEFINE_FLEX(struct hci_cp_le_set_cig_params, pdu, cis, num_cis, 0x1f); u8 cig_id = PTR_UINT(data); struct hci_conn *conn; struct bt_iso_qos *qos; u8 aux_num_cis = 0; u8 cis_id; conn = hci_conn_hash_lookup_cig(hdev, cig_id); if (!conn) return 0; qos = &conn->iso_qos; pdu->cig_id = cig_id; hci_cpu_to_le24(qos->ucast.out.interval, pdu->c_interval); hci_cpu_to_le24(qos->ucast.in.interval, pdu->p_interval); pdu->sca = qos->ucast.sca; pdu->packing = qos->ucast.packing; pdu->framing = qos->ucast.framing; pdu->c_latency = cpu_to_le16(qos->ucast.out.latency); pdu->p_latency = cpu_to_le16(qos->ucast.in.latency); /* Reprogram all CIS(s) with the same CIG, valid range are: * num_cis: 0x00 to 0x1F * cis_id: 0x00 to 0xEF */ for (cis_id = 0x00; cis_id < 0xf0 && aux_num_cis < pdu->num_cis; cis_id++) { struct hci_cis_params *cis; conn = hci_conn_hash_lookup_cis(hdev, NULL, 0, cig_id, cis_id); if (!conn) continue; qos = &conn->iso_qos; cis = &pdu->cis[aux_num_cis++]; cis->cis_id = cis_id; cis->c_sdu = cpu_to_le16(conn->iso_qos.ucast.out.sdu); cis->p_sdu = cpu_to_le16(conn->iso_qos.ucast.in.sdu); cis->c_phy = qos->ucast.out.phy ? qos->ucast.out.phy : qos->ucast.in.phy; cis->p_phy = qos->ucast.in.phy ? qos->ucast.in.phy : qos->ucast.out.phy; cis->c_rtn = qos->ucast.out.rtn; cis->p_rtn = qos->ucast.in.rtn; } pdu->num_cis = aux_num_cis; if (!pdu->num_cis) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_CIG_PARAMS, struct_size(pdu, cis, pdu->num_cis), pdu, HCI_CMD_TIMEOUT); } static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) { struct hci_dev *hdev = conn->hdev; struct iso_list_data data; memset(&data, 0, sizeof(data)); /* Allocate first still reconfigurable CIG if not set */ if (qos->ucast.cig == BT_ISO_QOS_CIG_UNSET) { for (data.cig = 0x00; data.cig < 0xf0; data.cig++) { data.count = 0; hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECT, &data); if (data.count) continue; hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECTED, &data); if (!data.count) break; } if (data.cig == 0xf0) return false; /* Update CIG */ qos->ucast.cig = data.cig; } if (qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) { if (hci_conn_hash_lookup_cis(hdev, NULL, 0, qos->ucast.cig, qos->ucast.cis)) return false; goto done; } /* Allocate first available CIS if not set */ for (data.cig = qos->ucast.cig, data.cis = 0x00; data.cis < 0xf0; data.cis++) { if (!hci_conn_hash_lookup_cis(hdev, NULL, 0, data.cig, data.cis)) { /* Update CIS */ qos->ucast.cis = data.cis; break; } } if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET) return false; done: if (hci_cmd_sync_queue(hdev, set_cig_params_sync, UINT_PTR(qos->ucast.cig), NULL) < 0) return false; return true; } struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos, u16 timeout) { struct hci_conn *cis; cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig, qos->ucast.cis); if (!cis) { cis = hci_conn_add_unset(hdev, CIS_LINK, dst, HCI_ROLE_MASTER); if (IS_ERR(cis)) return cis; cis->cleanup = cis_cleanup; cis->dst_type = dst_type; cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET; cis->iso_qos.ucast.cis = BT_ISO_QOS_CIS_UNSET; cis->conn_timeout = timeout; } if (cis->state == BT_CONNECTED) return cis; /* Check if CIS has been set and the settings matches */ if (cis->state == BT_BOUND && !memcmp(&cis->iso_qos, qos, sizeof(*qos))) return cis; /* Update LINK PHYs according to QoS preference */ cis->le_tx_phy = qos->ucast.out.phy; cis->le_rx_phy = qos->ucast.in.phy; /* If output interval is not set use the input interval as it cannot be * 0x000000. */ if (!qos->ucast.out.interval) qos->ucast.out.interval = qos->ucast.in.interval; /* If input interval is not set use the output interval as it cannot be * 0x000000. */ if (!qos->ucast.in.interval) qos->ucast.in.interval = qos->ucast.out.interval; /* If output latency is not set use the input latency as it cannot be * 0x0000. */ if (!qos->ucast.out.latency) qos->ucast.out.latency = qos->ucast.in.latency; /* If input latency is not set use the output latency as it cannot be * 0x0000. */ if (!qos->ucast.in.latency) qos->ucast.in.latency = qos->ucast.out.latency; if (!hci_le_set_cig_params(cis, qos)) { hci_conn_drop(cis); return ERR_PTR(-EINVAL); } hci_conn_hold(cis); cis->iso_qos = *qos; cis->state = BT_BOUND; return cis; } bool hci_iso_setup_path(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; struct hci_cp_le_setup_iso_path cmd; memset(&cmd, 0, sizeof(cmd)); if (conn->iso_qos.ucast.out.sdu) { cmd.handle = cpu_to_le16(conn->handle); cmd.direction = 0x00; /* Input (Host to Controller) */ cmd.path = 0x00; /* HCI path if enabled */ cmd.codec = 0x03; /* Transparent Data */ if (hci_send_cmd(hdev, HCI_OP_LE_SETUP_ISO_PATH, sizeof(cmd), &cmd) < 0) return false; } if (conn->iso_qos.ucast.in.sdu) { cmd.handle = cpu_to_le16(conn->handle); cmd.direction = 0x01; /* Output (Controller to Host) */ cmd.path = 0x00; /* HCI path if enabled */ cmd.codec = 0x03; /* Transparent Data */ if (hci_send_cmd(hdev, HCI_OP_LE_SETUP_ISO_PATH, sizeof(cmd), &cmd) < 0) return false; } return true; } int hci_conn_check_create_cis(struct hci_conn *conn) { if (conn->type != CIS_LINK) return -EINVAL; if (!conn->parent || conn->parent->state != BT_CONNECTED || conn->state != BT_CONNECT || HCI_CONN_HANDLE_UNSET(conn->handle)) return 1; return 0; } static int hci_create_cis_sync(struct hci_dev *hdev, void *data) { return hci_le_create_cis_sync(hdev); } int hci_le_create_cis_pending(struct hci_dev *hdev) { struct hci_conn *conn; bool pending = false; rcu_read_lock(); list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) { rcu_read_unlock(); return -EBUSY; } if (!hci_conn_check_create_cis(conn)) pending = true; } rcu_read_unlock(); if (!pending) return 0; /* Queue Create CIS */ return hci_cmd_sync_queue(hdev, hci_create_cis_sync, NULL, NULL); } static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn, struct bt_iso_io_qos *qos, __u8 phy) { /* Only set MTU if PHY is enabled */ if (!qos->sdu && qos->phy) qos->sdu = conn->mtu; /* Use the same PHY as ACL if set to any */ if (qos->phy == BT_ISO_PHY_ANY) qos->phy = phy; /* Use LE ACL connection interval if not set */ if (!qos->interval) /* ACL interval unit in 1.25 ms to us */ qos->interval = conn->le_conn_interval * 1250; /* Use LE ACL connection latency if not set */ if (!qos->latency) qos->latency = conn->le_conn_latency; } static int create_big_sync(struct hci_dev *hdev, void *data) { struct hci_conn *conn = data; struct bt_iso_qos *qos = &conn->iso_qos; u16 interval, sync_interval = 0; u32 flags = 0; int err; if (qos->bcast.out.phy == 0x02) flags |= MGMT_ADV_FLAG_SEC_2M; /* Align intervals */ interval = (qos->bcast.out.interval / 1250) * qos->bcast.sync_factor; if (qos->bcast.bis) sync_interval = interval * 4; err = hci_start_per_adv_sync(hdev, qos->bcast.bis, conn->sid, conn->le_per_adv_data_len, conn->le_per_adv_data, flags, interval, interval, sync_interval); if (err) return err; return hci_le_create_big(conn, &conn->iso_qos); } struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos) { struct hci_conn *conn; bt_dev_dbg(hdev, "dst %pMR type %d sid %d", dst, dst_type, sid); conn = hci_conn_add_unset(hdev, PA_LINK, dst, HCI_ROLE_SLAVE); if (IS_ERR(conn)) return conn; conn->iso_qos = *qos; conn->dst_type = dst_type; conn->sid = sid; conn->state = BT_LISTEN; conn->conn_timeout = msecs_to_jiffies(qos->bcast.sync_timeout * 10); hci_conn_hold(conn); hci_connect_pa_sync(hdev, conn); return conn; } int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, struct bt_iso_qos *qos, __u16 sync_handle, __u8 num_bis, __u8 bis[]) { int err; if (num_bis < 0x01 || num_bis > ISO_MAX_NUM_BIS) return -EINVAL; err = qos_set_big(hdev, qos); if (err) return err; if (hcon) { /* Update hcon QoS */ hcon->iso_qos = *qos; hcon->num_bis = num_bis; memcpy(hcon->bis, bis, num_bis); hcon->conn_timeout = msecs_to_jiffies(qos->bcast.timeout * 10); } return hci_connect_big_sync(hdev, hcon); } static void create_big_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn = data; bt_dev_dbg(hdev, "conn %p", conn); if (err) { bt_dev_err(hdev, "Unable to create BIG: %d", err); hci_connect_cfm(conn, err); hci_conn_del(conn); } } struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; struct hci_conn *parent; __u8 eir[HCI_MAX_PER_AD_LENGTH]; struct hci_link *link; /* Look for any BIS that is open for rebinding */ conn = hci_conn_hash_lookup_big_state(hdev, qos->bcast.big, BT_OPEN, HCI_ROLE_MASTER); if (conn) { memcpy(qos, &conn->iso_qos, sizeof(*qos)); conn->state = BT_CONNECTED; return conn; } if (base_len && base) base_len = eir_append_service_data(eir, 0, 0x1851, base, base_len); /* We need hci_conn object using the BDADDR_ANY as dst */ conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir, timeout); if (IS_ERR(conn)) return conn; /* Update LINK PHYs according to QoS preference */ conn->le_tx_phy = qos->bcast.out.phy; conn->le_tx_phy = qos->bcast.out.phy; /* Add Basic Announcement into Peridic Adv Data if BASE is set */ if (base_len && base) { memcpy(conn->le_per_adv_data, eir, sizeof(eir)); conn->le_per_adv_data_len = base_len; } hci_iso_qos_setup(hdev, conn, &qos->bcast.out, conn->le_tx_phy ? conn->le_tx_phy : hdev->le_tx_def_phys); conn->iso_qos = *qos; conn->state = BT_BOUND; /* Link BISes together */ parent = hci_conn_hash_lookup_big(hdev, conn->iso_qos.bcast.big); if (parent && parent != conn) { link = hci_conn_link(parent, conn); hci_conn_drop(conn); if (!link) return ERR_PTR(-ENOLINK); } return conn; } static void bis_mark_per_adv(struct hci_conn *conn, void *data) { struct iso_list_data *d = data; /* Skip if not broadcast/ANY address */ if (bacmp(&conn->dst, BDADDR_ANY)) return; if (d->big != conn->iso_qos.bcast.big || d->bis == BT_ISO_QOS_BIS_UNSET || d->bis != conn->iso_qos.bcast.bis) return; set_bit(HCI_CONN_PER_ADV, &conn->flags); } struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos, __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; int err; struct iso_list_data data; conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base, timeout); if (IS_ERR(conn)) return conn; if (conn->state == BT_CONNECTED) return conn; /* Check if SID needs to be allocated then search for the first * available. */ if (conn->sid == HCI_SID_INVALID) { u8 sid; for (sid = 0; sid <= 0x0f; sid++) { if (!hci_find_adv_sid(hdev, sid)) { conn->sid = sid; break; } } } data.big = qos->bcast.big; data.bis = qos->bcast.bis; /* Set HCI_CONN_PER_ADV for all bound connections, to mark that * the start periodic advertising and create BIG commands have * been queued */ hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ err = hci_cmd_sync_queue(hdev, create_big_sync, conn, create_big_complete); if (err < 0) { hci_conn_drop(conn); return ERR_PTR(err); } return conn; } struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos, u16 timeout) { struct hci_conn *le; struct hci_conn *cis; struct hci_link *link; if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) le = hci_connect_le(hdev, dst, dst_type, false, BT_SECURITY_LOW, HCI_LE_CONN_TIMEOUT, HCI_ROLE_SLAVE, 0, 0); else le = hci_connect_le_scan(hdev, dst, dst_type, BT_SECURITY_LOW, HCI_LE_CONN_TIMEOUT, CONN_REASON_ISO_CONNECT); if (IS_ERR(le)) return le; hci_iso_qos_setup(hdev, le, &qos->ucast.out, le->le_tx_phy ? le->le_tx_phy : hdev->le_tx_def_phys); hci_iso_qos_setup(hdev, le, &qos->ucast.in, le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys); cis = hci_bind_cis(hdev, dst, dst_type, qos, timeout); if (IS_ERR(cis)) { hci_conn_drop(le); return cis; } link = hci_conn_link(le, cis); hci_conn_drop(cis); if (!link) { hci_conn_drop(le); return ERR_PTR(-ENOLINK); } cis->state = BT_CONNECT; hci_le_create_cis_pending(hdev); return cis; } /* Check link security requirement */ int hci_conn_check_link_mode(struct hci_conn *conn) { BT_DBG("hcon %p", conn); /* In Secure Connections Only mode, it is required that Secure * Connections is used and the link is encrypted with AES-CCM * using a P-256 authenticated combination key. */ if (hci_dev_test_flag(conn->hdev, HCI_SC_ONLY)) { if (!hci_conn_sc_enabled(conn) || !test_bit(HCI_CONN_AES_CCM, &conn->flags) || conn->key_type != HCI_LK_AUTH_COMBINATION_P256) return 0; } /* AES encryption is required for Level 4: * * BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 3, Part C * page 1319: * * 128-bit equivalent strength for link and encryption keys * required using FIPS approved algorithms (E0 not allowed, * SAFER+ not allowed, and P-192 not allowed; encryption key * not shortened) */ if (conn->sec_level == BT_SECURITY_FIPS && !test_bit(HCI_CONN_AES_CCM, &conn->flags)) { bt_dev_err(conn->hdev, "Invalid security: Missing AES-CCM usage"); return 0; } if (hci_conn_ssp_enabled(conn) && !test_bit(HCI_CONN_ENCRYPT, &conn->flags)) return 0; return 1; } /* Authenticate remote device */ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) { BT_DBG("hcon %p", conn); if (conn->pending_sec_level > sec_level) sec_level = conn->pending_sec_level; if (sec_level > conn->sec_level) conn->pending_sec_level = sec_level; else if (test_bit(HCI_CONN_AUTH, &conn->flags)) return 1; /* Make sure we preserve an existing MITM requirement*/ auth_type |= (conn->auth_type & 0x01); conn->auth_type = auth_type; if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) { struct hci_cp_auth_requested cp; cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); /* Set the ENCRYPT_PEND to trigger encryption after * authentication. */ if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); } return 0; } /* Encrypt the link */ static void hci_conn_encrypt(struct hci_conn *conn) { BT_DBG("hcon %p", conn); if (!test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) { struct hci_cp_set_conn_encrypt cp; cp.handle = cpu_to_le16(conn->handle); cp.encrypt = 0x01; hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp), &cp); } } /* Enable security */ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type, bool initiator) { BT_DBG("hcon %p", conn); if (conn->type == LE_LINK) return smp_conn_security(conn, sec_level); /* For sdp we don't need the link key. */ if (sec_level == BT_SECURITY_SDP) return 1; /* For non 2.1 devices and low security level we don't need the link key. */ if (sec_level == BT_SECURITY_LOW && !hci_conn_ssp_enabled(conn)) return 1; /* For other security levels we need the link key. */ if (!test_bit(HCI_CONN_AUTH, &conn->flags)) goto auth; switch (conn->key_type) { case HCI_LK_AUTH_COMBINATION_P256: /* An authenticated FIPS approved combination key has * sufficient security for security level 4 or lower. */ if (sec_level <= BT_SECURITY_FIPS) goto encrypt; break; case HCI_LK_AUTH_COMBINATION_P192: /* An authenticated combination key has sufficient security for * security level 3 or lower. */ if (sec_level <= BT_SECURITY_HIGH) goto encrypt; break; case HCI_LK_UNAUTH_COMBINATION_P192: case HCI_LK_UNAUTH_COMBINATION_P256: /* An unauthenticated combination key has sufficient security * for security level 2 or lower. */ if (sec_level <= BT_SECURITY_MEDIUM) goto encrypt; break; case HCI_LK_COMBINATION: /* A combination key has always sufficient security for the * security levels 2 or lower. High security level requires the * combination key is generated using maximum PIN code length * (16). For pre 2.1 units. */ if (sec_level <= BT_SECURITY_MEDIUM || conn->pin_length == 16) goto encrypt; break; default: break; } auth: if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) return 0; if (initiator) set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags); if (!hci_conn_auth(conn, sec_level, auth_type)) return 0; encrypt: if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) { /* Ensure that the encryption key size has been read, * otherwise stall the upper layer responses. */ if (!conn->enc_key_size) return 0; /* Nothing else needed, all requirements are met */ return 1; } hci_conn_encrypt(conn); return 0; } EXPORT_SYMBOL(hci_conn_security); /* Check secure link requirement */ int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level) { BT_DBG("hcon %p", conn); /* Accept if non-secure or higher security level is required */ if (sec_level != BT_SECURITY_HIGH && sec_level != BT_SECURITY_FIPS) return 1; /* Accept if secure or higher security level is already present */ if (conn->sec_level == BT_SECURITY_HIGH || conn->sec_level == BT_SECURITY_FIPS) return 1; /* Reject not secure link */ return 0; } EXPORT_SYMBOL(hci_conn_check_secure); /* Switch role */ int hci_conn_switch_role(struct hci_conn *conn, __u8 role) { BT_DBG("hcon %p", conn); if (role == conn->role) return 1; if (!test_and_set_bit(HCI_CONN_RSWITCH_PEND, &conn->flags)) { struct hci_cp_switch_role cp; bacpy(&cp.bdaddr, &conn->dst); cp.role = role; hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp); } return 0; } EXPORT_SYMBOL(hci_conn_switch_role); /* Enter active mode */ void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active) { struct hci_dev *hdev = conn->hdev; BT_DBG("hcon %p mode %d", conn, conn->mode); if (conn->mode != HCI_CM_SNIFF) goto timer; if (!test_bit(HCI_CONN_POWER_SAVE, &conn->flags) && !force_active) goto timer; if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags)) { struct hci_cp_exit_sniff_mode cp; cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(hdev, HCI_OP_EXIT_SNIFF_MODE, sizeof(cp), &cp); } timer: if (hdev->idle_timeout > 0) queue_delayed_work(hdev->workqueue, &conn->idle_work, msecs_to_jiffies(hdev->idle_timeout)); } /* Drop all connection on the device */ void hci_conn_hash_flush(struct hci_dev *hdev) { struct list_head *head = &hdev->conn_hash.list; struct hci_conn *conn; BT_DBG("hdev %s", hdev->name); /* We should not traverse the list here, because hci_conn_del * can remove extra links, which may cause the list traversal * to hit items that have already been released. */ while ((conn = list_first_entry_or_null(head, struct hci_conn, list)) != NULL) { conn->state = BT_CLOSED; hci_disconn_cfm(conn, HCI_ERROR_LOCAL_HOST_TERM); hci_conn_del(conn); } } static u32 get_link_mode(struct hci_conn *conn) { u32 link_mode = 0; if (conn->role == HCI_ROLE_MASTER) link_mode |= HCI_LM_MASTER; if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) link_mode |= HCI_LM_ENCRYPT; if (test_bit(HCI_CONN_AUTH, &conn->flags)) link_mode |= HCI_LM_AUTH; if (test_bit(HCI_CONN_SECURE, &conn->flags)) link_mode |= HCI_LM_SECURE; if (test_bit(HCI_CONN_FIPS, &conn->flags)) link_mode |= HCI_LM_FIPS; return link_mode; } int hci_get_conn_list(void __user *arg) { struct hci_conn *c; struct hci_conn_list_req req, *cl; struct hci_conn_info *ci; struct hci_dev *hdev; int n = 0, size, err; if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; if (!req.conn_num || req.conn_num > (PAGE_SIZE * 2) / sizeof(*ci)) return -EINVAL; size = sizeof(req) + req.conn_num * sizeof(*ci); cl = kmalloc(size, GFP_KERNEL); if (!cl) return -ENOMEM; hdev = hci_dev_get(req.dev_id); if (!hdev) { kfree(cl); return -ENODEV; } ci = cl->conn_info; hci_dev_lock(hdev); list_for_each_entry(c, &hdev->conn_hash.list, list) { bacpy(&(ci + n)->bdaddr, &c->dst); (ci + n)->handle = c->handle; (ci + n)->type = c->type; (ci + n)->out = c->out; (ci + n)->state = c->state; (ci + n)->link_mode = get_link_mode(c); if (++n >= req.conn_num) break; } hci_dev_unlock(hdev); cl->dev_id = hdev->id; cl->conn_num = n; size = sizeof(req) + n * sizeof(*ci); hci_dev_put(hdev); err = copy_to_user(arg, cl, size); kfree(cl); return err ? -EFAULT : 0; } int hci_get_conn_info(struct hci_dev *hdev, void __user *arg) { struct hci_conn_info_req req; struct hci_conn_info ci; struct hci_conn *conn; char __user *ptr = arg + sizeof(req); if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, req.type, &req.bdaddr); if (conn) { bacpy(&ci.bdaddr, &conn->dst); ci.handle = conn->handle; ci.type = conn->type; ci.out = conn->out; ci.state = conn->state; ci.link_mode = get_link_mode(conn); } hci_dev_unlock(hdev); if (!conn) return -ENOENT; return copy_to_user(ptr, &ci, sizeof(ci)) ? -EFAULT : 0; } int hci_get_auth_info(struct hci_dev *hdev, void __user *arg) { struct hci_auth_info_req req; struct hci_conn *conn; if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &req.bdaddr); if (conn) req.type = conn->auth_type; hci_dev_unlock(hdev); if (!conn) return -ENOENT; return copy_to_user(arg, &req, sizeof(req)) ? -EFAULT : 0; } struct hci_chan *hci_chan_create(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; struct hci_chan *chan; BT_DBG("%s hcon %p", hdev->name, conn); if (test_bit(HCI_CONN_DROP, &conn->flags)) { BT_DBG("Refusing to create new hci_chan"); return NULL; } chan = kzalloc(sizeof(*chan), GFP_KERNEL); if (!chan) return NULL; chan->conn = hci_conn_get(conn); skb_queue_head_init(&chan->data_q); chan->state = BT_CONNECTED; list_add_rcu(&chan->list, &conn->chan_list); return chan; } void hci_chan_del(struct hci_chan *chan) { struct hci_conn *conn = chan->conn; struct hci_dev *hdev = conn->hdev; BT_DBG("%s hcon %p chan %p", hdev->name, conn, chan); list_del_rcu(&chan->list); synchronize_rcu(); /* Prevent new hci_chan's to be created for this hci_conn */ set_bit(HCI_CONN_DROP, &conn->flags); hci_conn_put(conn); skb_queue_purge(&chan->data_q); kfree(chan); } void hci_chan_list_flush(struct hci_conn *conn) { struct hci_chan *chan, *n; BT_DBG("hcon %p", conn); list_for_each_entry_safe(chan, n, &conn->chan_list, list) hci_chan_del(chan); } static struct hci_chan *__hci_chan_lookup_handle(struct hci_conn *hcon, __u16 handle) { struct hci_chan *hchan; list_for_each_entry(hchan, &hcon->chan_list, list) { if (hchan->handle == handle) return hchan; } return NULL; } struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *hcon; struct hci_chan *hchan = NULL; rcu_read_lock(); list_for_each_entry_rcu(hcon, &h->list, list) { hchan = __hci_chan_lookup_handle(hcon, handle); if (hchan) break; } rcu_read_unlock(); return hchan; } u32 hci_conn_get_phy(struct hci_conn *conn) { u32 phys = 0; /* BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 2, Part B page 471: * Table 6.2: Packets defined for synchronous, asynchronous, and * CPB logical transport types. */ switch (conn->type) { case SCO_LINK: /* SCO logical transport (1 Mb/s): * HV1, HV2, HV3 and DV. */ phys |= BT_PHY_BR_1M_1SLOT; break; case ACL_LINK: /* ACL logical transport (1 Mb/s) ptt=0: * DH1, DM3, DH3, DM5 and DH5. */ phys |= BT_PHY_BR_1M_1SLOT; if (conn->pkt_type & (HCI_DM3 | HCI_DH3)) phys |= BT_PHY_BR_1M_3SLOT; if (conn->pkt_type & (HCI_DM5 | HCI_DH5)) phys |= BT_PHY_BR_1M_5SLOT; /* ACL logical transport (2 Mb/s) ptt=1: * 2-DH1, 2-DH3 and 2-DH5. */ if (!(conn->pkt_type & HCI_2DH1)) phys |= BT_PHY_EDR_2M_1SLOT; if (!(conn->pkt_type & HCI_2DH3)) phys |= BT_PHY_EDR_2M_3SLOT; if (!(conn->pkt_type & HCI_2DH5)) phys |= BT_PHY_EDR_2M_5SLOT; /* ACL logical transport (3 Mb/s) ptt=1: * 3-DH1, 3-DH3 and 3-DH5. */ if (!(conn->pkt_type & HCI_3DH1)) phys |= BT_PHY_EDR_3M_1SLOT; if (!(conn->pkt_type & HCI_3DH3)) phys |= BT_PHY_EDR_3M_3SLOT; if (!(conn->pkt_type & HCI_3DH5)) phys |= BT_PHY_EDR_3M_5SLOT; break; case ESCO_LINK: /* eSCO logical transport (1 Mb/s): EV3, EV4 and EV5 */ phys |= BT_PHY_BR_1M_1SLOT; if (!(conn->pkt_type & (ESCO_EV4 | ESCO_EV5))) phys |= BT_PHY_BR_1M_3SLOT; /* eSCO logical transport (2 Mb/s): 2-EV3, 2-EV5 */ if (!(conn->pkt_type & ESCO_2EV3)) phys |= BT_PHY_EDR_2M_1SLOT; if (!(conn->pkt_type & ESCO_2EV5)) phys |= BT_PHY_EDR_2M_3SLOT; /* eSCO logical transport (3 Mb/s): 3-EV3, 3-EV5 */ if (!(conn->pkt_type & ESCO_3EV3)) phys |= BT_PHY_EDR_3M_1SLOT; if (!(conn->pkt_type & ESCO_3EV5)) phys |= BT_PHY_EDR_3M_3SLOT; break; case LE_LINK: if (conn->le_tx_phy & HCI_LE_SET_PHY_1M) phys |= BT_PHY_LE_1M_TX; if (conn->le_rx_phy & HCI_LE_SET_PHY_1M) phys |= BT_PHY_LE_1M_RX; if (conn->le_tx_phy & HCI_LE_SET_PHY_2M) phys |= BT_PHY_LE_2M_TX; if (conn->le_rx_phy & HCI_LE_SET_PHY_2M) phys |= BT_PHY_LE_2M_RX; if (conn->le_tx_phy & HCI_LE_SET_PHY_CODED) phys |= BT_PHY_LE_CODED_TX; if (conn->le_rx_phy & HCI_LE_SET_PHY_CODED) phys |= BT_PHY_LE_CODED_RX; break; } return phys; } static int abort_conn_sync(struct hci_dev *hdev, void *data) { struct hci_conn *conn = data; if (!hci_conn_valid(hdev, conn)) return -ECANCELED; return hci_abort_conn_sync(hdev, conn, conn->abort_reason); } int hci_abort_conn(struct hci_conn *conn, u8 reason) { struct hci_dev *hdev = conn->hdev; /* If abort_reason has already been set it means the connection is * already being aborted so don't attempt to overwrite it. */ if (conn->abort_reason) return 0; bt_dev_dbg(hdev, "handle 0x%2.2x reason 0x%2.2x", conn->handle, reason); conn->abort_reason = reason; /* If the connection is pending check the command opcode since that * might be blocking on hci_cmd_sync_work while waiting its respective * event so we need to hci_cmd_sync_cancel to cancel it. * * hci_connect_le serializes the connection attempts so only one * connection can be in BT_CONNECT at time. */ if (conn->state == BT_CONNECT && hdev->req_status == HCI_REQ_PEND) { switch (hci_skb_event(hdev->sent_cmd)) { case HCI_EV_CONN_COMPLETE: case HCI_EV_LE_CONN_COMPLETE: case HCI_EV_LE_ENHANCED_CONN_COMPLETE: case HCI_EVT_LE_CIS_ESTABLISHED: hci_cmd_sync_cancel(hdev, ECANCELED); break; } /* Cancel connect attempt if still queued/pending */ } else if (!hci_cancel_connect_sync(hdev, conn)) { return 0; } /* Run immediately if on cmd_sync_work since this may be called * as a result to MGMT_OP_DISCONNECT/MGMT_OP_UNPAIR which does * already queue its callback on cmd_sync_work. */ return hci_cmd_sync_run_once(hdev, abort_conn_sync, conn, NULL); } void hci_setup_tx_timestamp(struct sk_buff *skb, size_t key_offset, const struct sockcm_cookie *sockc) { struct sock *sk = skb ? skb->sk : NULL; int key; /* This shall be called on a single skb of those generated by user * sendmsg(), and only when the sendmsg() does not return error to * user. This is required for keeping the tskey that increments here in * sync with possible sendmsg() counting by user. * * Stream sockets shall set key_offset to sendmsg() length in bytes * and call with the last fragment, others to 1 and first fragment. */ if (!skb || !sockc || !sk || !key_offset) return; sock_tx_timestamp(sk, sockc, &skb_shinfo(skb)->tx_flags); if (sk->sk_type == SOCK_STREAM) key = atomic_add_return(key_offset, &sk->sk_tskey); if (sockc->tsflags & SOF_TIMESTAMPING_OPT_ID && sockc->tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) { if (sockc->tsflags & SOCKCM_FLAG_TS_OPT_ID) { skb_shinfo(skb)->tskey = sockc->ts_opt_id; } else { if (sk->sk_type != SOCK_STREAM) key = atomic_inc_return(&sk->sk_tskey); skb_shinfo(skb)->tskey = key - 1; } } } void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb) { struct tx_queue *comp = &conn->tx_q; bool track = false; /* Emit SND now, ie. just before sending to driver */ if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SND); /* COMPLETION tstamp is emitted for tracked skb later in Number of * Completed Packets event. Available only for flow controlled cases. * * TODO: SCO support without flowctl (needs to be done in drivers) */ switch (conn->type) { case CIS_LINK: case BIS_LINK: case PA_LINK: case ACL_LINK: case LE_LINK: break; case SCO_LINK: case ESCO_LINK: if (!hci_dev_test_flag(conn->hdev, HCI_SCO_FLOWCTL)) return; break; default: return; } if (skb->sk && (skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP)) track = true; /* If nothing is tracked, just count extra skbs at the queue head */ if (!track && !comp->tracked) { comp->extra++; return; } if (track) { skb = skb_clone_sk(skb); if (!skb) goto count_only; comp->tracked++; } else { skb = skb_clone(skb, GFP_KERNEL); if (!skb) goto count_only; } skb_queue_tail(&comp->queue, skb); return; count_only: /* Stop tracking skbs, and only count. This will not emit timestamps for * the packets, but if we get here something is more seriously wrong. */ comp->tracked = 0; comp->extra += skb_queue_len(&comp->queue) + 1; skb_queue_purge(&comp->queue); } void hci_conn_tx_dequeue(struct hci_conn *conn) { struct tx_queue *comp = &conn->tx_q; struct sk_buff *skb; /* If there are tracked skbs, the counted extra go before dequeuing real * skbs, to keep ordering. When nothing is tracked, the ordering doesn't * matter so dequeue real skbs first to get rid of them ASAP. */ if (comp->extra && (comp->tracked || skb_queue_empty(&comp->queue))) { comp->extra--; return; } skb = skb_dequeue(&comp->queue); if (!skb) return; if (skb->sk) { comp->tracked--; __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_COMPLETION); } kfree_skb(skb); } u8 *hci_conn_key_enc_size(struct hci_conn *conn) { if (conn->type == ACL_LINK) { struct link_key *key; key = hci_find_link_key(conn->hdev, &conn->dst); if (!key) return NULL; return &key->pin_len; } else if (conn->type == LE_LINK) { struct smp_ltk *ltk; ltk = hci_find_ltk(conn->hdev, &conn->dst, conn->dst_type, conn->role); if (!ltk) return NULL; return &ltk->enc_size; } return NULL; } int hci_ethtool_ts_info(unsigned int index, int sk_proto, struct kernel_ethtool_ts_info *info) { struct hci_dev *hdev; hdev = hci_dev_get(index); if (!hdev) return -ENODEV; info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; info->phc_index = -1; info->tx_types = BIT(HWTSTAMP_TX_OFF); info->rx_filters = BIT(HWTSTAMP_FILTER_NONE); switch (sk_proto) { case BTPROTO_ISO: case BTPROTO_L2CAP: info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE; info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION; break; case BTPROTO_SCO: info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE; if (hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION; break; } hci_dev_put(hdev); return 0; }
3 39 1 1 6 3 7 28 20 3 3 3 3 3 1 1 1 2 23 1 23 7 2 2 4 4 159 12 1 152 17 16 1 1 17 223 1 15 221 220 43 21 22 2 39 3 1 35 2 1 66 4 3 31 2 12 18 181 3 155 22 23 1 11 11 19 2 13 1 3 1 1 1 3 15 38 2 22 14 1 1 54 2 2 25 26 103 2 1 24 8 1 98 3 31 1 1 21 7 6 2 2 4 6 2 6 6 4 6 6 6 6 4 6 3 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 // SPDX-License-Identifier: GPL-2.0-only /* Updated: Karl MacMillan <kmacmillan@tresys.com> * * Added conditional policy language extensions * * Updated: Hewlett-Packard <paul@paul-moore.com> * * Added support for the policy capability bitmap * * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. * Copyright (C) 2003 - 2004 Tresys Technology, LLC * Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/kernel.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/mount.h> #include <linux/mutex.h> #include <linux/namei.h> #include <linux/init.h> #include <linux/string.h> #include <linux/security.h> #include <linux/major.h> #include <linux/seq_file.h> #include <linux/percpu.h> #include <linux/audit.h> #include <linux/uaccess.h> #include <linux/kobject.h> #include <linux/ctype.h> /* selinuxfs pseudo filesystem for exporting the security policy API. Based on the proc code and the fs/nfsd/nfsctl.c code. */ #include "flask.h" #include "avc.h" #include "avc_ss.h" #include "security.h" #include "objsec.h" #include "conditional.h" #include "ima.h" enum sel_inos { SEL_ROOT_INO = 2, SEL_LOAD, /* load policy */ SEL_ENFORCE, /* get or set enforcing status */ SEL_CONTEXT, /* validate context */ SEL_ACCESS, /* compute access decision */ SEL_CREATE, /* compute create labeling decision */ SEL_RELABEL, /* compute relabeling decision */ SEL_USER, /* compute reachable user contexts */ SEL_POLICYVERS, /* return policy version for this kernel */ SEL_COMMIT_BOOLS, /* commit new boolean values */ SEL_MLS, /* return if MLS policy is enabled */ SEL_DISABLE, /* disable SELinux until next reboot */ SEL_MEMBER, /* compute polyinstantiation membership decision */ SEL_CHECKREQPROT, /* check requested protection, not kernel-applied one */ SEL_COMPAT_NET, /* whether to use old compat network packet controls */ SEL_REJECT_UNKNOWN, /* export unknown reject handling to userspace */ SEL_DENY_UNKNOWN, /* export unknown deny handling to userspace */ SEL_STATUS, /* export current status using mmap() */ SEL_POLICY, /* allow userspace to read the in kernel policy */ SEL_VALIDATE_TRANS, /* compute validatetrans decision */ SEL_INO_NEXT, /* The next inode number to use */ }; struct selinux_fs_info { struct dentry *bool_dir; unsigned int bool_num; char **bool_pending_names; int *bool_pending_values; struct dentry *class_dir; unsigned long last_class_ino; bool policy_opened; struct dentry *policycap_dir; unsigned long last_ino; struct super_block *sb; }; static int selinux_fs_info_create(struct super_block *sb) { struct selinux_fs_info *fsi; fsi = kzalloc(sizeof(*fsi), GFP_KERNEL); if (!fsi) return -ENOMEM; fsi->last_ino = SEL_INO_NEXT - 1; fsi->sb = sb; sb->s_fs_info = fsi; return 0; } static void selinux_fs_info_free(struct super_block *sb) { struct selinux_fs_info *fsi = sb->s_fs_info; unsigned int i; if (fsi) { for (i = 0; i < fsi->bool_num; i++) kfree(fsi->bool_pending_names[i]); kfree(fsi->bool_pending_names); kfree(fsi->bool_pending_values); } kfree(sb->s_fs_info); sb->s_fs_info = NULL; } #define SEL_INITCON_INO_OFFSET 0x01000000 #define SEL_BOOL_INO_OFFSET 0x02000000 #define SEL_CLASS_INO_OFFSET 0x04000000 #define SEL_POLICYCAP_INO_OFFSET 0x08000000 #define SEL_INO_MASK 0x00ffffff #define BOOL_DIR_NAME "booleans" #define CLASS_DIR_NAME "class" #define POLICYCAP_DIR_NAME "policy_capabilities" #define TMPBUFLEN 12 static ssize_t sel_read_enforce(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { char tmpbuf[TMPBUFLEN]; ssize_t length; length = scnprintf(tmpbuf, TMPBUFLEN, "%d", enforcing_enabled()); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } #ifdef CONFIG_SECURITY_SELINUX_DEVELOP static ssize_t sel_write_enforce(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char *page = NULL; ssize_t length; int scan_value; bool old_value, new_value; if (count >= PAGE_SIZE) return -ENOMEM; /* No partial writes. */ if (*ppos != 0) return -EINVAL; page = memdup_user_nul(buf, count); if (IS_ERR(page)) return PTR_ERR(page); length = -EINVAL; if (sscanf(page, "%d", &scan_value) != 1) goto out; new_value = !!scan_value; old_value = enforcing_enabled(); if (new_value != old_value) { length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__SETENFORCE, NULL); if (length) goto out; audit_log(audit_context(), GFP_KERNEL, AUDIT_MAC_STATUS, "enforcing=%d old_enforcing=%d auid=%u ses=%u" " enabled=1 old-enabled=1 lsm=selinux res=1", new_value, old_value, from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); enforcing_set(new_value); if (new_value) avc_ss_reset(0); selnl_notify_setenforce(new_value); selinux_status_update_setenforce(new_value); if (!new_value) call_blocking_lsm_notifier(LSM_POLICY_CHANGE, NULL); selinux_ima_measure_state(); } length = count; out: kfree(page); return length; } #else #define sel_write_enforce NULL #endif static const struct file_operations sel_enforce_ops = { .read = sel_read_enforce, .write = sel_write_enforce, .llseek = generic_file_llseek, }; static ssize_t sel_read_handle_unknown(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { char tmpbuf[TMPBUFLEN]; ssize_t length; ino_t ino = file_inode(filp)->i_ino; int handle_unknown = (ino == SEL_REJECT_UNKNOWN) ? security_get_reject_unknown() : !security_get_allow_unknown(); length = scnprintf(tmpbuf, TMPBUFLEN, "%d", handle_unknown); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static const struct file_operations sel_handle_unknown_ops = { .read = sel_read_handle_unknown, .llseek = generic_file_llseek, }; static int sel_open_handle_status(struct inode *inode, struct file *filp) { struct page *status = selinux_kernel_status_page(); if (!status) return -ENOMEM; filp->private_data = status; return 0; } static ssize_t sel_read_handle_status(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct page *status = filp->private_data; BUG_ON(!status); return simple_read_from_buffer(buf, count, ppos, page_address(status), sizeof(struct selinux_kernel_status)); } static int sel_mmap_handle_status(struct file *filp, struct vm_area_struct *vma) { struct page *status = filp->private_data; unsigned long size = vma->vm_end - vma->vm_start; BUG_ON(!status); /* only allows one page from the head */ if (vma->vm_pgoff > 0 || size != PAGE_SIZE) return -EIO; /* disallow writable mapping */ if (vma->vm_flags & VM_WRITE) return -EPERM; /* disallow mprotect() turns it into writable */ vm_flags_clear(vma, VM_MAYWRITE); return remap_pfn_range(vma, vma->vm_start, page_to_pfn(status), size, vma->vm_page_prot); } static const struct file_operations sel_handle_status_ops = { .open = sel_open_handle_status, .read = sel_read_handle_status, .mmap = sel_mmap_handle_status, .llseek = generic_file_llseek, }; static ssize_t sel_write_disable(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char *page; ssize_t length; int new_value; if (count >= PAGE_SIZE) return -ENOMEM; /* No partial writes. */ if (*ppos != 0) return -EINVAL; page = memdup_user_nul(buf, count); if (IS_ERR(page)) return PTR_ERR(page); if (sscanf(page, "%d", &new_value) != 1) { length = -EINVAL; goto out; } length = count; if (new_value) { pr_err("SELinux: https://github.com/SELinuxProject/selinux-kernel/wiki/DEPRECATE-runtime-disable\n"); pr_err("SELinux: Runtime disable is not supported, use selinux=0 on the kernel cmdline.\n"); } out: kfree(page); return length; } static const struct file_operations sel_disable_ops = { .write = sel_write_disable, .llseek = generic_file_llseek, }; static ssize_t sel_read_policyvers(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { char tmpbuf[TMPBUFLEN]; ssize_t length; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", POLICYDB_VERSION_MAX); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static const struct file_operations sel_policyvers_ops = { .read = sel_read_policyvers, .llseek = generic_file_llseek, }; /* declaration for sel_write_load */ static int sel_make_bools(struct selinux_policy *newpolicy, struct dentry *bool_dir, unsigned int *bool_num, char ***bool_pending_names, int **bool_pending_values); static int sel_make_classes(struct selinux_policy *newpolicy, struct dentry *class_dir, unsigned long *last_class_ino); /* declaration for sel_make_class_dirs */ static struct dentry *sel_make_dir(struct dentry *dir, const char *name, unsigned long *ino); /* declaration for sel_make_policy_nodes */ static struct dentry *sel_make_swapover_dir(struct super_block *sb, unsigned long *ino); static ssize_t sel_read_mls(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { char tmpbuf[TMPBUFLEN]; ssize_t length; length = scnprintf(tmpbuf, TMPBUFLEN, "%d", security_mls_enabled()); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static const struct file_operations sel_mls_ops = { .read = sel_read_mls, .llseek = generic_file_llseek, }; struct policy_load_memory { size_t len; void *data; }; static int sel_open_policy(struct inode *inode, struct file *filp) { struct selinux_fs_info *fsi = inode->i_sb->s_fs_info; struct policy_load_memory *plm = NULL; int rc; BUG_ON(filp->private_data); mutex_lock(&selinux_state.policy_mutex); rc = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__READ_POLICY, NULL); if (rc) goto err; rc = -EBUSY; if (fsi->policy_opened) goto err; rc = -ENOMEM; plm = kzalloc(sizeof(*plm), GFP_KERNEL); if (!plm) goto err; rc = security_read_policy(&plm->data, &plm->len); if (rc) goto err; if ((size_t)i_size_read(inode) != plm->len) { inode_lock(inode); i_size_write(inode, plm->len); inode_unlock(inode); } fsi->policy_opened = 1; filp->private_data = plm; mutex_unlock(&selinux_state.policy_mutex); return 0; err: mutex_unlock(&selinux_state.policy_mutex); if (plm) vfree(plm->data); kfree(plm); return rc; } static int sel_release_policy(struct inode *inode, struct file *filp) { struct selinux_fs_info *fsi = inode->i_sb->s_fs_info; struct policy_load_memory *plm = filp->private_data; BUG_ON(!plm); fsi->policy_opened = 0; vfree(plm->data); kfree(plm); return 0; } static ssize_t sel_read_policy(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct policy_load_memory *plm = filp->private_data; int ret; ret = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__READ_POLICY, NULL); if (ret) return ret; return simple_read_from_buffer(buf, count, ppos, plm->data, plm->len); } static vm_fault_t sel_mmap_policy_fault(struct vm_fault *vmf) { struct policy_load_memory *plm = vmf->vma->vm_file->private_data; unsigned long offset; struct page *page; if (vmf->flags & (FAULT_FLAG_MKWRITE | FAULT_FLAG_WRITE)) return VM_FAULT_SIGBUS; offset = vmf->pgoff << PAGE_SHIFT; if (offset >= roundup(plm->len, PAGE_SIZE)) return VM_FAULT_SIGBUS; page = vmalloc_to_page(plm->data + offset); get_page(page); vmf->page = page; return 0; } static const struct vm_operations_struct sel_mmap_policy_ops = { .fault = sel_mmap_policy_fault, .page_mkwrite = sel_mmap_policy_fault, }; static int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHARED) { /* do not allow mprotect to make mapping writable */ vm_flags_clear(vma, VM_MAYWRITE); if (vma->vm_flags & VM_WRITE) return -EACCES; } vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &sel_mmap_policy_ops; return 0; } static const struct file_operations sel_policy_ops = { .open = sel_open_policy, .read = sel_read_policy, .mmap = sel_mmap_policy, .release = sel_release_policy, .llseek = generic_file_llseek, }; static void sel_remove_old_bool_data(unsigned int bool_num, char **bool_names, int *bool_values) { u32 i; /* bool_dir cleanup */ for (i = 0; i < bool_num; i++) kfree(bool_names[i]); kfree(bool_names); kfree(bool_values); } static int sel_make_policy_nodes(struct selinux_fs_info *fsi, struct selinux_policy *newpolicy) { int ret = 0; struct dentry *tmp_parent, *tmp_bool_dir, *tmp_class_dir; struct renamedata rd = {}; unsigned int bool_num = 0; char **bool_names = NULL; int *bool_values = NULL; unsigned long tmp_ino = fsi->last_ino; /* Don't increment last_ino in this function */ tmp_parent = sel_make_swapover_dir(fsi->sb, &tmp_ino); if (IS_ERR(tmp_parent)) return PTR_ERR(tmp_parent); tmp_ino = fsi->bool_dir->d_inode->i_ino - 1; /* sel_make_dir will increment and set */ tmp_bool_dir = sel_make_dir(tmp_parent, BOOL_DIR_NAME, &tmp_ino); if (IS_ERR(tmp_bool_dir)) { ret = PTR_ERR(tmp_bool_dir); goto out; } tmp_ino = fsi->class_dir->d_inode->i_ino - 1; /* sel_make_dir will increment and set */ tmp_class_dir = sel_make_dir(tmp_parent, CLASS_DIR_NAME, &tmp_ino); if (IS_ERR(tmp_class_dir)) { ret = PTR_ERR(tmp_class_dir); goto out; } ret = sel_make_bools(newpolicy, tmp_bool_dir, &bool_num, &bool_names, &bool_values); if (ret) goto out; ret = sel_make_classes(newpolicy, tmp_class_dir, &fsi->last_class_ino); if (ret) goto out; rd.old_parent = tmp_parent; rd.new_parent = fsi->sb->s_root; /* booleans */ ret = start_renaming_two_dentries(&rd, tmp_bool_dir, fsi->bool_dir); if (ret) goto out; d_exchange(tmp_bool_dir, fsi->bool_dir); swap(fsi->bool_num, bool_num); swap(fsi->bool_pending_names, bool_names); swap(fsi->bool_pending_values, bool_values); fsi->bool_dir = tmp_bool_dir; end_renaming(&rd); /* classes */ ret = start_renaming_two_dentries(&rd, tmp_class_dir, fsi->class_dir); if (ret) goto out; d_exchange(tmp_class_dir, fsi->class_dir); fsi->class_dir = tmp_class_dir; end_renaming(&rd); out: sel_remove_old_bool_data(bool_num, bool_names, bool_values); /* Since the other temporary dirs are children of tmp_parent * this will handle all the cleanup in the case of a failure before * the swapover */ simple_recursive_removal(tmp_parent, NULL); return ret; } static ssize_t sel_write_load(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct selinux_fs_info *fsi; struct selinux_load_state load_state; ssize_t length; void *data = NULL; /* no partial writes */ if (*ppos) return -EINVAL; /* no empty policies */ if (!count) return -EINVAL; mutex_lock(&selinux_state.policy_mutex); length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__LOAD_POLICY, NULL); if (length) goto out; data = vmalloc(count); if (!data) { length = -ENOMEM; goto out; } if (copy_from_user(data, buf, count) != 0) { length = -EFAULT; goto out; } length = security_load_policy(data, count, &load_state); if (length) { pr_warn_ratelimited("SELinux: failed to load policy\n"); goto out; } fsi = file_inode(file)->i_sb->s_fs_info; length = sel_make_policy_nodes(fsi, load_state.policy); if (length) { pr_warn_ratelimited("SELinux: failed to initialize selinuxfs\n"); selinux_policy_cancel(&load_state); goto out; } selinux_policy_commit(&load_state); length = count; audit_log(audit_context(), GFP_KERNEL, AUDIT_MAC_POLICY_LOAD, "auid=%u ses=%u lsm=selinux res=1", from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); out: mutex_unlock(&selinux_state.policy_mutex); vfree(data); return length; } static const struct file_operations sel_load_ops = { .write = sel_write_load, .llseek = generic_file_llseek, }; static ssize_t sel_write_context(struct file *file, char *buf, size_t size) { char *canon = NULL; u32 sid, len; ssize_t length; length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__CHECK_CONTEXT, NULL); if (length) goto out; length = security_context_to_sid(buf, size, &sid, GFP_KERNEL); if (length) goto out; length = security_sid_to_context(sid, &canon, &len); if (length) goto out; length = -ERANGE; if (len > SIMPLE_TRANSACTION_LIMIT) { pr_err("SELinux: %s: context size (%u) exceeds " "payload max\n", __func__, len); goto out; } memcpy(buf, canon, len); length = len; out: kfree(canon); return length; } static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { char tmpbuf[TMPBUFLEN]; ssize_t length; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", checkreqprot_get()); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char *page; ssize_t length; unsigned int new_value; length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__SETCHECKREQPROT, NULL); if (length) return length; if (count >= PAGE_SIZE) return -ENOMEM; /* No partial writes. */ if (*ppos != 0) return -EINVAL; page = memdup_user_nul(buf, count); if (IS_ERR(page)) return PTR_ERR(page); if (sscanf(page, "%u", &new_value) != 1) { length = -EINVAL; goto out; } length = count; if (new_value) { char comm[sizeof(current->comm)]; strscpy(comm, current->comm); pr_err("SELinux: %s (%d) set checkreqprot to 1. This is no longer supported.\n", comm, current->pid); } selinux_ima_measure_state(); out: kfree(page); return length; } static const struct file_operations sel_checkreqprot_ops = { .read = sel_read_checkreqprot, .write = sel_write_checkreqprot, .llseek = generic_file_llseek, }; static ssize_t sel_write_validatetrans(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char *oldcon = NULL, *newcon = NULL, *taskcon = NULL; char *req = NULL; u32 osid, nsid, tsid; u16 tclass; int rc; rc = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__VALIDATE_TRANS, NULL); if (rc) goto out; rc = -ENOMEM; if (count >= PAGE_SIZE) goto out; /* No partial writes. */ rc = -EINVAL; if (*ppos != 0) goto out; req = memdup_user_nul(buf, count); if (IS_ERR(req)) { rc = PTR_ERR(req); req = NULL; goto out; } rc = -ENOMEM; oldcon = kzalloc(count + 1, GFP_KERNEL); if (!oldcon) goto out; newcon = kzalloc(count + 1, GFP_KERNEL); if (!newcon) goto out; taskcon = kzalloc(count + 1, GFP_KERNEL); if (!taskcon) goto out; rc = -EINVAL; if (sscanf(req, "%s %s %hu %s", oldcon, newcon, &tclass, taskcon) != 4) goto out; rc = security_context_str_to_sid(oldcon, &osid, GFP_KERNEL); if (rc) goto out; rc = security_context_str_to_sid(newcon, &nsid, GFP_KERNEL); if (rc) goto out; rc = security_context_str_to_sid(taskcon, &tsid, GFP_KERNEL); if (rc) goto out; rc = security_validate_transition_user(osid, nsid, tsid, tclass); if (!rc) rc = count; out: kfree(req); kfree(oldcon); kfree(newcon); kfree(taskcon); return rc; } static const struct file_operations sel_transition_ops = { .write = sel_write_validatetrans, .llseek = generic_file_llseek, }; /* * Remaining nodes use transaction based IO methods like nfsd/nfsctl.c */ static ssize_t sel_write_access(struct file *file, char *buf, size_t size); static ssize_t sel_write_create(struct file *file, char *buf, size_t size); static ssize_t sel_write_relabel(struct file *file, char *buf, size_t size); static ssize_t sel_write_user(struct file *file, char *buf, size_t size); static ssize_t sel_write_member(struct file *file, char *buf, size_t size); static ssize_t (*const write_op[])(struct file *, char *, size_t) = { [SEL_ACCESS] = sel_write_access, [SEL_CREATE] = sel_write_create, [SEL_RELABEL] = sel_write_relabel, [SEL_USER] = sel_write_user, [SEL_MEMBER] = sel_write_member, [SEL_CONTEXT] = sel_write_context, }; static ssize_t selinux_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) { ino_t ino = file_inode(file)->i_ino; char *data; ssize_t rv; if (ino >= ARRAY_SIZE(write_op) || !write_op[ino]) return -EINVAL; data = simple_transaction_get(file, buf, size); if (IS_ERR(data)) return PTR_ERR(data); rv = write_op[ino](file, data, size); if (rv > 0) { simple_transaction_set(file, rv); rv = size; } return rv; } static const struct file_operations transaction_ops = { .write = selinux_transaction_write, .read = simple_transaction_read, .release = simple_transaction_release, .llseek = generic_file_llseek, }; /* * payload - write methods * If the method has a response, the response should be put in buf, * and the length returned. Otherwise return 0 or and -error. */ static ssize_t sel_write_access(struct file *file, char *buf, size_t size) { char *scon = NULL, *tcon = NULL; u32 ssid, tsid; u16 tclass; struct av_decision avd; ssize_t length; length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__COMPUTE_AV, NULL); if (length) goto out; length = -ENOMEM; scon = kzalloc(size + 1, GFP_KERNEL); if (!scon) goto out; length = -ENOMEM; tcon = kzalloc(size + 1, GFP_KERNEL); if (!tcon) goto out; length = -EINVAL; if (sscanf(buf, "%s %s %hu", scon, tcon, &tclass) != 3) goto out; length = security_context_str_to_sid(scon, &ssid, GFP_KERNEL); if (length) goto out; length = security_context_str_to_sid(tcon, &tsid, GFP_KERNEL); if (length) goto out; security_compute_av_user(ssid, tsid, tclass, &avd); length = scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%x %x %x %x %u %x", avd.allowed, 0xffffffff, avd.auditallow, avd.auditdeny, avd.seqno, avd.flags); out: kfree(tcon); kfree(scon); return length; } static ssize_t sel_write_create(struct file *file, char *buf, size_t size) { char *scon = NULL, *tcon = NULL; char *namebuf = NULL, *objname = NULL; u32 ssid, tsid, newsid; u16 tclass; ssize_t length; char *newcon = NULL; u32 len; int nargs; length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__COMPUTE_CREATE, NULL); if (length) goto out; length = -ENOMEM; scon = kzalloc(size + 1, GFP_KERNEL); if (!scon) goto out; length = -ENOMEM; tcon = kzalloc(size + 1, GFP_KERNEL); if (!tcon) goto out; length = -ENOMEM; namebuf = kzalloc(size + 1, GFP_KERNEL); if (!namebuf) goto out; length = -EINVAL; nargs = sscanf(buf, "%s %s %hu %s", scon, tcon, &tclass, namebuf); if (nargs < 3 || nargs > 4) goto out; if (nargs == 4) { /* * If and when the name of new object to be queried contains * either whitespace or multibyte characters, they shall be * encoded based on the percentage-encoding rule. * If not encoded, the sscanf logic picks up only left-half * of the supplied name; split by a whitespace unexpectedly. */ char *r, *w; int c1, c2; r = w = namebuf; do { c1 = *r++; if (c1 == '+') c1 = ' '; else if (c1 == '%') { c1 = hex_to_bin(*r++); if (c1 < 0) goto out; c2 = hex_to_bin(*r++); if (c2 < 0) goto out; c1 = (c1 << 4) | c2; } *w++ = c1; } while (c1 != '\0'); objname = namebuf; } length = security_context_str_to_sid(scon, &ssid, GFP_KERNEL); if (length) goto out; length = security_context_str_to_sid(tcon, &tsid, GFP_KERNEL); if (length) goto out; length = security_transition_sid_user(ssid, tsid, tclass, objname, &newsid); if (length) goto out; length = security_sid_to_context(newsid, &newcon, &len); if (length) goto out; length = -ERANGE; if (len > SIMPLE_TRANSACTION_LIMIT) { pr_err("SELinux: %s: context size (%u) exceeds " "payload max\n", __func__, len); goto out; } memcpy(buf, newcon, len); length = len; out: kfree(newcon); kfree(namebuf); kfree(tcon); kfree(scon); return length; } static ssize_t sel_write_relabel(struct file *file, char *buf, size_t size) { char *scon = NULL, *tcon = NULL; u32 ssid, tsid, newsid; u16 tclass; ssize_t length; char *newcon = NULL; u32 len; length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__COMPUTE_RELABEL, NULL); if (length) goto out; length = -ENOMEM; scon = kzalloc(size + 1, GFP_KERNEL); if (!scon) goto out; length = -ENOMEM; tcon = kzalloc(size + 1, GFP_KERNEL); if (!tcon) goto out; length = -EINVAL; if (sscanf(buf, "%s %s %hu", scon, tcon, &tclass) != 3) goto out; length = security_context_str_to_sid(scon, &ssid, GFP_KERNEL); if (length) goto out; length = security_context_str_to_sid(tcon, &tsid, GFP_KERNEL); if (length) goto out; length = security_change_sid(ssid, tsid, tclass, &newsid); if (length) goto out; length = security_sid_to_context(newsid, &newcon, &len); if (length) goto out; length = -ERANGE; if (len > SIMPLE_TRANSACTION_LIMIT) goto out; memcpy(buf, newcon, len); length = len; out: kfree(newcon); kfree(tcon); kfree(scon); return length; } static ssize_t sel_write_user(struct file *file, char *buf, size_t size) { char *con = NULL, *user = NULL, *ptr; u32 sid, *sids = NULL; ssize_t length; char *newcon; int rc; u32 i, len, nsids; pr_warn_ratelimited("SELinux: %s (%d) wrote to /sys/fs/selinux/user!" " This will not be supported in the future; please update your" " userspace.\n", current->comm, current->pid); ssleep(5); length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__COMPUTE_USER, NULL); if (length) goto out; length = -ENOMEM; con = kzalloc(size + 1, GFP_KERNEL); if (!con) goto out; length = -ENOMEM; user = kzalloc(size + 1, GFP_KERNEL); if (!user) goto out; length = -EINVAL; if (sscanf(buf, "%s %s", con, user) != 2) goto out; length = security_context_str_to_sid(con, &sid, GFP_KERNEL); if (length) goto out; length = security_get_user_sids(sid, user, &sids, &nsids); if (length) goto out; length = sprintf(buf, "%u", nsids) + 1; ptr = buf + length; for (i = 0; i < nsids; i++) { rc = security_sid_to_context(sids[i], &newcon, &len); if (rc) { length = rc; goto out; } if ((length + len) >= SIMPLE_TRANSACTION_LIMIT) { kfree(newcon); length = -ERANGE; goto out; } memcpy(ptr, newcon, len); kfree(newcon); ptr += len; length += len; } out: kfree(sids); kfree(user); kfree(con); return length; } static ssize_t sel_write_member(struct file *file, char *buf, size_t size) { char *scon = NULL, *tcon = NULL; u32 ssid, tsid, newsid; u16 tclass; ssize_t length; char *newcon = NULL; u32 len; length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__COMPUTE_MEMBER, NULL); if (length) goto out; length = -ENOMEM; scon = kzalloc(size + 1, GFP_KERNEL); if (!scon) goto out; length = -ENOMEM; tcon = kzalloc(size + 1, GFP_KERNEL); if (!tcon) goto out; length = -EINVAL; if (sscanf(buf, "%s %s %hu", scon, tcon, &tclass) != 3) goto out; length = security_context_str_to_sid(scon, &ssid, GFP_KERNEL); if (length) goto out; length = security_context_str_to_sid(tcon, &tsid, GFP_KERNEL); if (length) goto out; length = security_member_sid(ssid, tsid, tclass, &newsid); if (length) goto out; length = security_sid_to_context(newsid, &newcon, &len); if (length) goto out; length = -ERANGE; if (len > SIMPLE_TRANSACTION_LIMIT) { pr_err("SELinux: %s: context size (%u) exceeds " "payload max\n", __func__, len); goto out; } memcpy(buf, newcon, len); length = len; out: kfree(newcon); kfree(tcon); kfree(scon); return length; } static struct inode *sel_make_inode(struct super_block *sb, umode_t mode) { struct inode *ret = new_inode(sb); if (ret) { ret->i_mode = mode; simple_inode_init_ts(ret); } return ret; } static ssize_t sel_read_bool(struct file *filep, char __user *buf, size_t count, loff_t *ppos) { struct selinux_fs_info *fsi = file_inode(filep)->i_sb->s_fs_info; char buffer[4]; ssize_t length; ssize_t ret; int cur_enforcing; unsigned index = file_inode(filep)->i_ino & SEL_INO_MASK; const char *name = filep->f_path.dentry->d_name.name; mutex_lock(&selinux_state.policy_mutex); ret = -EINVAL; if (index >= fsi->bool_num || strcmp(name, fsi->bool_pending_names[index])) goto out_unlock; cur_enforcing = security_get_bool_value(index); if (cur_enforcing < 0) { ret = cur_enforcing; goto out_unlock; } length = scnprintf(buffer, sizeof(buffer), "%d %d", !!cur_enforcing, !!fsi->bool_pending_values[index]); mutex_unlock(&selinux_state.policy_mutex); return simple_read_from_buffer(buf, count, ppos, buffer, length); out_unlock: mutex_unlock(&selinux_state.policy_mutex); return ret; } static ssize_t sel_write_bool(struct file *filep, const char __user *buf, size_t count, loff_t *ppos) { struct selinux_fs_info *fsi = file_inode(filep)->i_sb->s_fs_info; char *page = NULL; ssize_t length; int new_value; unsigned index = file_inode(filep)->i_ino & SEL_INO_MASK; const char *name = filep->f_path.dentry->d_name.name; if (count >= PAGE_SIZE) return -ENOMEM; /* No partial writes. */ if (*ppos != 0) return -EINVAL; page = memdup_user_nul(buf, count); if (IS_ERR(page)) return PTR_ERR(page); mutex_lock(&selinux_state.policy_mutex); length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__SETBOOL, NULL); if (length) goto out; length = -EINVAL; if (index >= fsi->bool_num || strcmp(name, fsi->bool_pending_names[index])) goto out; length = -EINVAL; if (sscanf(page, "%d", &new_value) != 1) goto out; if (new_value) new_value = 1; fsi->bool_pending_values[index] = new_value; length = count; out: mutex_unlock(&selinux_state.policy_mutex); kfree(page); return length; } static const struct file_operations sel_bool_ops = { .read = sel_read_bool, .write = sel_write_bool, .llseek = generic_file_llseek, }; static ssize_t sel_commit_bools_write(struct file *filep, const char __user *buf, size_t count, loff_t *ppos) { struct selinux_fs_info *fsi = file_inode(filep)->i_sb->s_fs_info; char *page = NULL; ssize_t length; int new_value; if (count >= PAGE_SIZE) return -ENOMEM; /* No partial writes. */ if (*ppos != 0) return -EINVAL; page = memdup_user_nul(buf, count); if (IS_ERR(page)) return PTR_ERR(page); mutex_lock(&selinux_state.policy_mutex); length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__SETBOOL, NULL); if (length) goto out; length = -EINVAL; if (sscanf(page, "%d", &new_value) != 1) goto out; length = 0; if (new_value && fsi->bool_pending_values) length = security_set_bools(fsi->bool_num, fsi->bool_pending_values); if (!length) length = count; out: mutex_unlock(&selinux_state.policy_mutex); kfree(page); return length; } static const struct file_operations sel_commit_bools_ops = { .write = sel_commit_bools_write, .llseek = generic_file_llseek, }; static int sel_make_bools(struct selinux_policy *newpolicy, struct dentry *bool_dir, unsigned int *bool_num, char ***bool_pending_names, int **bool_pending_values) { int ret; char **names, *page; u32 i, num; page = (char *)get_zeroed_page(GFP_KERNEL); if (!page) return -ENOMEM; ret = security_get_bools(newpolicy, &num, &names, bool_pending_values); if (ret) goto out; *bool_num = num; *bool_pending_names = names; for (i = 0; i < num; i++) { struct dentry *dentry; struct inode *inode; struct inode_security_struct *isec; ssize_t len; u32 sid; len = snprintf(page, PAGE_SIZE, "/%s/%s", BOOL_DIR_NAME, names[i]); if (len >= PAGE_SIZE) { ret = -ENAMETOOLONG; break; } dentry = d_alloc_name(bool_dir, names[i]); if (!dentry) { ret = -ENOMEM; break; } inode = sel_make_inode(bool_dir->d_sb, S_IFREG | S_IRUGO | S_IWUSR); if (!inode) { dput(dentry); ret = -ENOMEM; break; } isec = selinux_inode(inode); ret = selinux_policy_genfs_sid(newpolicy, "selinuxfs", page, SECCLASS_FILE, &sid); if (ret) { pr_warn_ratelimited("SELinux: no sid found, defaulting to security isid for %s\n", page); sid = SECINITSID_SECURITY; } isec->sid = sid; isec->initialized = LABEL_INITIALIZED; inode->i_fop = &sel_bool_ops; inode->i_ino = i|SEL_BOOL_INO_OFFSET; d_add(dentry, inode); } out: free_page((unsigned long)page); return ret; } static ssize_t sel_read_avc_cache_threshold(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { char tmpbuf[TMPBUFLEN]; ssize_t length; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", avc_get_cache_threshold()); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static ssize_t sel_write_avc_cache_threshold(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char *page; ssize_t ret; unsigned int new_value; ret = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__SETSECPARAM, NULL); if (ret) return ret; if (count >= PAGE_SIZE) return -ENOMEM; /* No partial writes. */ if (*ppos != 0) return -EINVAL; page = memdup_user_nul(buf, count); if (IS_ERR(page)) return PTR_ERR(page); ret = -EINVAL; if (sscanf(page, "%u", &new_value) != 1) goto out; avc_set_cache_threshold(new_value); ret = count; out: kfree(page); return ret; } static ssize_t sel_read_avc_hash_stats(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { char *page; ssize_t length; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; length = avc_get_hash_stats(page); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, page, length); free_page((unsigned long)page); return length; } static ssize_t sel_read_sidtab_hash_stats(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { char *page; ssize_t length; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; length = security_sidtab_hash_stats(page); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, page, length); free_page((unsigned long)page); return length; } static const struct file_operations sel_sidtab_hash_stats_ops = { .read = sel_read_sidtab_hash_stats, .llseek = generic_file_llseek, }; static const struct file_operations sel_avc_cache_threshold_ops = { .read = sel_read_avc_cache_threshold, .write = sel_write_avc_cache_threshold, .llseek = generic_file_llseek, }; static const struct file_operations sel_avc_hash_stats_ops = { .read = sel_read_avc_hash_stats, .llseek = generic_file_llseek, }; #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS static struct avc_cache_stats *sel_avc_get_stat_idx(loff_t *idx) { loff_t cpu; for (cpu = *idx; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *idx = cpu + 1; return &per_cpu(avc_cache_stats, cpu); } (*idx)++; return NULL; } static void *sel_avc_stats_seq_start(struct seq_file *seq, loff_t *pos) { loff_t n = *pos - 1; if (*pos == 0) return SEQ_START_TOKEN; return sel_avc_get_stat_idx(&n); } static void *sel_avc_stats_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return sel_avc_get_stat_idx(pos); } static int sel_avc_stats_seq_show(struct seq_file *seq, void *v) { struct avc_cache_stats *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "lookups hits misses allocations reclaims frees\n"); } else { unsigned int lookups = st->lookups; unsigned int misses = st->misses; unsigned int hits = lookups - misses; seq_printf(seq, "%u %u %u %u %u %u\n", lookups, hits, misses, st->allocations, st->reclaims, st->frees); } return 0; } static void sel_avc_stats_seq_stop(struct seq_file *seq, void *v) { } static const struct seq_operations sel_avc_cache_stats_seq_ops = { .start = sel_avc_stats_seq_start, .next = sel_avc_stats_seq_next, .show = sel_avc_stats_seq_show, .stop = sel_avc_stats_seq_stop, }; static int sel_open_avc_cache_stats(struct inode *inode, struct file *file) { return seq_open(file, &sel_avc_cache_stats_seq_ops); } static const struct file_operations sel_avc_cache_stats_ops = { .open = sel_open_avc_cache_stats, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; #endif static int sel_make_avc_files(struct dentry *dir) { struct super_block *sb = dir->d_sb; struct selinux_fs_info *fsi = sb->s_fs_info; unsigned int i; static const struct tree_descr files[] = { { "cache_threshold", &sel_avc_cache_threshold_ops, S_IRUGO|S_IWUSR }, { "hash_stats", &sel_avc_hash_stats_ops, S_IRUGO }, #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS { "cache_stats", &sel_avc_cache_stats_ops, S_IRUGO }, #endif }; for (i = 0; i < ARRAY_SIZE(files); i++) { struct inode *inode; struct dentry *dentry; dentry = d_alloc_name(dir, files[i].name); if (!dentry) return -ENOMEM; inode = sel_make_inode(dir->d_sb, S_IFREG|files[i].mode); if (!inode) { dput(dentry); return -ENOMEM; } inode->i_fop = files[i].ops; inode->i_ino = ++fsi->last_ino; d_add(dentry, inode); } return 0; } static int sel_make_ss_files(struct dentry *dir) { struct super_block *sb = dir->d_sb; struct selinux_fs_info *fsi = sb->s_fs_info; unsigned int i; static const struct tree_descr files[] = { { "sidtab_hash_stats", &sel_sidtab_hash_stats_ops, S_IRUGO }, }; for (i = 0; i < ARRAY_SIZE(files); i++) { struct inode *inode; struct dentry *dentry; dentry = d_alloc_name(dir, files[i].name); if (!dentry) return -ENOMEM; inode = sel_make_inode(dir->d_sb, S_IFREG|files[i].mode); if (!inode) { dput(dentry); return -ENOMEM; } inode->i_fop = files[i].ops; inode->i_ino = ++fsi->last_ino; d_add(dentry, inode); } return 0; } static ssize_t sel_read_initcon(struct file *file, char __user *buf, size_t count, loff_t *ppos) { char *con; u32 sid, len; ssize_t ret; sid = file_inode(file)->i_ino&SEL_INO_MASK; ret = security_sid_to_context(sid, &con, &len); if (ret) return ret; ret = simple_read_from_buffer(buf, count, ppos, con, len); kfree(con); return ret; } static const struct file_operations sel_initcon_ops = { .read = sel_read_initcon, .llseek = generic_file_llseek, }; static int sel_make_initcon_files(struct dentry *dir) { unsigned int i; for (i = 1; i <= SECINITSID_NUM; i++) { struct inode *inode; struct dentry *dentry; const char *s = security_get_initial_sid_context(i); if (!s) continue; dentry = d_alloc_name(dir, s); if (!dentry) return -ENOMEM; inode = sel_make_inode(dir->d_sb, S_IFREG|S_IRUGO); if (!inode) { dput(dentry); return -ENOMEM; } inode->i_fop = &sel_initcon_ops; inode->i_ino = i|SEL_INITCON_INO_OFFSET; d_add(dentry, inode); } return 0; } static inline unsigned long sel_class_to_ino(u16 class) { return (class * (SEL_VEC_MAX + 1)) | SEL_CLASS_INO_OFFSET; } static inline u16 sel_ino_to_class(unsigned long ino) { return (ino & SEL_INO_MASK) / (SEL_VEC_MAX + 1); } static inline unsigned long sel_perm_to_ino(u16 class, u32 perm) { return (class * (SEL_VEC_MAX + 1) + perm) | SEL_CLASS_INO_OFFSET; } static inline u32 sel_ino_to_perm(unsigned long ino) { return (ino & SEL_INO_MASK) % (SEL_VEC_MAX + 1); } static ssize_t sel_read_class(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long ino = file_inode(file)->i_ino; char res[TMPBUFLEN]; ssize_t len = scnprintf(res, sizeof(res), "%d", sel_ino_to_class(ino)); return simple_read_from_buffer(buf, count, ppos, res, len); } static const struct file_operations sel_class_ops = { .read = sel_read_class, .llseek = generic_file_llseek, }; static ssize_t sel_read_perm(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long ino = file_inode(file)->i_ino; char res[TMPBUFLEN]; ssize_t len = scnprintf(res, sizeof(res), "%d", sel_ino_to_perm(ino)); return simple_read_from_buffer(buf, count, ppos, res, len); } static const struct file_operations sel_perm_ops = { .read = sel_read_perm, .llseek = generic_file_llseek, }; static ssize_t sel_read_policycap(struct file *file, char __user *buf, size_t count, loff_t *ppos) { int value; char tmpbuf[TMPBUFLEN]; ssize_t length; unsigned long i_ino = file_inode(file)->i_ino; value = security_policycap_supported(i_ino & SEL_INO_MASK); length = scnprintf(tmpbuf, TMPBUFLEN, "%d", value); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static const struct file_operations sel_policycap_ops = { .read = sel_read_policycap, .llseek = generic_file_llseek, }; static int sel_make_perm_files(struct selinux_policy *newpolicy, char *objclass, int classvalue, struct dentry *dir) { u32 i, nperms; int rc; char **perms; rc = security_get_permissions(newpolicy, objclass, &perms, &nperms); if (rc) return rc; for (i = 0; i < nperms; i++) { struct inode *inode; struct dentry *dentry; rc = -ENOMEM; dentry = d_alloc_name(dir, perms[i]); if (!dentry) goto out; rc = -ENOMEM; inode = sel_make_inode(dir->d_sb, S_IFREG|S_IRUGO); if (!inode) { dput(dentry); goto out; } inode->i_fop = &sel_perm_ops; /* i+1 since perm values are 1-indexed */ inode->i_ino = sel_perm_to_ino(classvalue, i + 1); d_add(dentry, inode); } rc = 0; out: for (i = 0; i < nperms; i++) kfree(perms[i]); kfree(perms); return rc; } static int sel_make_class_dir_entries(struct selinux_policy *newpolicy, char *classname, int index, struct dentry *dir) { struct super_block *sb = dir->d_sb; struct selinux_fs_info *fsi = sb->s_fs_info; struct dentry *dentry = NULL; struct inode *inode = NULL; dentry = d_alloc_name(dir, "index"); if (!dentry) return -ENOMEM; inode = sel_make_inode(dir->d_sb, S_IFREG|S_IRUGO); if (!inode) { dput(dentry); return -ENOMEM; } inode->i_fop = &sel_class_ops; inode->i_ino = sel_class_to_ino(index); d_add(dentry, inode); dentry = sel_make_dir(dir, "perms", &fsi->last_class_ino); if (IS_ERR(dentry)) return PTR_ERR(dentry); return sel_make_perm_files(newpolicy, classname, index, dentry); } static int sel_make_classes(struct selinux_policy *newpolicy, struct dentry *class_dir, unsigned long *last_class_ino) { u32 i, nclasses; int rc; char **classes; rc = security_get_classes(newpolicy, &classes, &nclasses); if (rc) return rc; /* +2 since classes are 1-indexed */ *last_class_ino = sel_class_to_ino(nclasses + 2); for (i = 0; i < nclasses; i++) { struct dentry *class_name_dir; class_name_dir = sel_make_dir(class_dir, classes[i], last_class_ino); if (IS_ERR(class_name_dir)) { rc = PTR_ERR(class_name_dir); goto out; } /* i+1 since class values are 1-indexed */ rc = sel_make_class_dir_entries(newpolicy, classes[i], i + 1, class_name_dir); if (rc) goto out; } rc = 0; out: for (i = 0; i < nclasses; i++) kfree(classes[i]); kfree(classes); return rc; } static int sel_make_policycap(struct selinux_fs_info *fsi) { unsigned int iter; struct dentry *dentry = NULL; struct inode *inode = NULL; for (iter = 0; iter <= POLICYDB_CAP_MAX; iter++) { if (iter < ARRAY_SIZE(selinux_policycap_names)) dentry = d_alloc_name(fsi->policycap_dir, selinux_policycap_names[iter]); else dentry = d_alloc_name(fsi->policycap_dir, "unknown"); if (dentry == NULL) return -ENOMEM; inode = sel_make_inode(fsi->sb, S_IFREG | 0444); if (inode == NULL) { dput(dentry); return -ENOMEM; } inode->i_fop = &sel_policycap_ops; inode->i_ino = iter | SEL_POLICYCAP_INO_OFFSET; d_add(dentry, inode); } return 0; } static struct dentry *sel_make_dir(struct dentry *dir, const char *name, unsigned long *ino) { struct dentry *dentry = d_alloc_name(dir, name); struct inode *inode; if (!dentry) return ERR_PTR(-ENOMEM); inode = sel_make_inode(dir->d_sb, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) { dput(dentry); return ERR_PTR(-ENOMEM); } inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_ino = ++(*ino); /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_add(dentry, inode); /* bump link count on parent directory, too */ inc_nlink(d_inode(dir)); return dentry; } static int reject_all(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EPERM; // no access for anyone, root or no root. } static const struct inode_operations swapover_dir_inode_operations = { .lookup = simple_lookup, .permission = reject_all, }; static struct dentry *sel_make_swapover_dir(struct super_block *sb, unsigned long *ino) { struct dentry *dentry = d_alloc_name(sb->s_root, ".swapover"); struct inode *inode; if (!dentry) return ERR_PTR(-ENOMEM); inode = sel_make_inode(sb, S_IFDIR); if (!inode) { dput(dentry); return ERR_PTR(-ENOMEM); } inode->i_op = &swapover_dir_inode_operations; inode->i_ino = ++(*ino); /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); inode_lock(sb->s_root->d_inode); d_add(dentry, inode); inc_nlink(sb->s_root->d_inode); inode_unlock(sb->s_root->d_inode); return dentry; } #define NULL_FILE_NAME "null" static int sel_fill_super(struct super_block *sb, struct fs_context *fc) { struct selinux_fs_info *fsi; int ret; struct dentry *dentry; struct inode *inode; struct inode_security_struct *isec; static const struct tree_descr selinux_files[] = { [SEL_LOAD] = {"load", &sel_load_ops, S_IRUSR|S_IWUSR}, [SEL_ENFORCE] = {"enforce", &sel_enforce_ops, S_IRUGO|S_IWUSR}, [SEL_CONTEXT] = {"context", &transaction_ops, S_IRUGO|S_IWUGO}, [SEL_ACCESS] = {"access", &transaction_ops, S_IRUGO|S_IWUGO}, [SEL_CREATE] = {"create", &transaction_ops, S_IRUGO|S_IWUGO}, [SEL_RELABEL] = {"relabel", &transaction_ops, S_IRUGO|S_IWUGO}, [SEL_USER] = {"user", &transaction_ops, S_IRUGO|S_IWUGO}, [SEL_POLICYVERS] = {"policyvers", &sel_policyvers_ops, S_IRUGO}, [SEL_COMMIT_BOOLS] = {"commit_pending_bools", &sel_commit_bools_ops, S_IWUSR}, [SEL_MLS] = {"mls", &sel_mls_ops, S_IRUGO}, [SEL_DISABLE] = {"disable", &sel_disable_ops, S_IWUSR}, [SEL_MEMBER] = {"member", &transaction_ops, S_IRUGO|S_IWUGO}, [SEL_CHECKREQPROT] = {"checkreqprot", &sel_checkreqprot_ops, S_IRUGO|S_IWUSR}, [SEL_REJECT_UNKNOWN] = {"reject_unknown", &sel_handle_unknown_ops, S_IRUGO}, [SEL_DENY_UNKNOWN] = {"deny_unknown", &sel_handle_unknown_ops, S_IRUGO}, [SEL_STATUS] = {"status", &sel_handle_status_ops, S_IRUGO}, [SEL_POLICY] = {"policy", &sel_policy_ops, S_IRUGO}, [SEL_VALIDATE_TRANS] = {"validatetrans", &sel_transition_ops, S_IWUGO}, /* last one */ {"", NULL, 0} }; ret = selinux_fs_info_create(sb); if (ret) goto err; ret = simple_fill_super(sb, SELINUX_MAGIC, selinux_files); if (ret) goto err; fsi = sb->s_fs_info; fsi->bool_dir = sel_make_dir(sb->s_root, BOOL_DIR_NAME, &fsi->last_ino); if (IS_ERR(fsi->bool_dir)) { ret = PTR_ERR(fsi->bool_dir); fsi->bool_dir = NULL; goto err; } ret = -ENOMEM; dentry = d_alloc_name(sb->s_root, NULL_FILE_NAME); if (!dentry) goto err; ret = -ENOMEM; inode = sel_make_inode(sb, S_IFCHR | S_IRUGO | S_IWUGO); if (!inode) { dput(dentry); goto err; } inode->i_ino = ++fsi->last_ino; isec = selinux_inode(inode); isec->sid = SECINITSID_DEVNULL; isec->sclass = SECCLASS_CHR_FILE; isec->initialized = LABEL_INITIALIZED; init_special_inode(inode, S_IFCHR | S_IRUGO | S_IWUGO, MKDEV(MEM_MAJOR, 3)); d_add(dentry, inode); dentry = sel_make_dir(sb->s_root, "avc", &fsi->last_ino); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto err; } ret = sel_make_avc_files(dentry); if (ret) goto err; dentry = sel_make_dir(sb->s_root, "ss", &fsi->last_ino); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto err; } ret = sel_make_ss_files(dentry); if (ret) goto err; dentry = sel_make_dir(sb->s_root, "initial_contexts", &fsi->last_ino); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto err; } ret = sel_make_initcon_files(dentry); if (ret) goto err; fsi->class_dir = sel_make_dir(sb->s_root, CLASS_DIR_NAME, &fsi->last_ino); if (IS_ERR(fsi->class_dir)) { ret = PTR_ERR(fsi->class_dir); fsi->class_dir = NULL; goto err; } fsi->policycap_dir = sel_make_dir(sb->s_root, POLICYCAP_DIR_NAME, &fsi->last_ino); if (IS_ERR(fsi->policycap_dir)) { ret = PTR_ERR(fsi->policycap_dir); fsi->policycap_dir = NULL; goto err; } ret = sel_make_policycap(fsi); if (ret) { pr_err("SELinux: failed to load policy capabilities\n"); goto err; } return 0; err: pr_err("SELinux: %s: failed while creating inodes\n", __func__); return ret; } static int sel_get_tree(struct fs_context *fc) { return get_tree_single(fc, sel_fill_super); } static const struct fs_context_operations sel_context_ops = { .get_tree = sel_get_tree, }; static int sel_init_fs_context(struct fs_context *fc) { fc->ops = &sel_context_ops; return 0; } static void sel_kill_sb(struct super_block *sb) { selinux_fs_info_free(sb); kill_litter_super(sb); } static struct file_system_type sel_fs_type = { .name = "selinuxfs", .init_fs_context = sel_init_fs_context, .kill_sb = sel_kill_sb, }; struct path selinux_null __ro_after_init; static int __init init_sel_fs(void) { struct qstr null_name = QSTR_INIT(NULL_FILE_NAME, sizeof(NULL_FILE_NAME)-1); int err; if (!selinux_enabled_boot) return 0; err = sysfs_create_mount_point(fs_kobj, "selinux"); if (err) return err; err = register_filesystem(&sel_fs_type); if (err) { sysfs_remove_mount_point(fs_kobj, "selinux"); return err; } selinux_null.mnt = kern_mount(&sel_fs_type); if (IS_ERR(selinux_null.mnt)) { pr_err("selinuxfs: could not mount!\n"); err = PTR_ERR(selinux_null.mnt); selinux_null.mnt = NULL; return err; } selinux_null.dentry = try_lookup_noperm(&null_name, selinux_null.mnt->mnt_root); if (IS_ERR(selinux_null.dentry)) { pr_err("selinuxfs: could not lookup null!\n"); err = PTR_ERR(selinux_null.dentry); selinux_null.dentry = NULL; return err; } /* * Try to pre-allocate the status page, so the sequence number of the * initial policy load can be stored. */ (void) selinux_kernel_status_page(); return err; } __initcall(init_sel_fs);
56 12 54 44 30 15 54 1 41 16 4 54 54 54 6 6 6 4 4 3 94 93 1 89 2 6 1 1 87 31 57 57 57 57 53 3 6 4 94 94 93 91 91 30 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" /* LINKMODES_GET */ struct linkmodes_req_info { struct ethnl_req_info base; }; struct linkmodes_reply_data { struct ethnl_reply_data base; struct ethtool_link_ksettings ksettings; struct ethtool_link_settings *lsettings; bool peer_empty; }; #define LINKMODES_REPDATA(__reply_base) \ container_of(__reply_base, struct linkmodes_reply_data, base) const struct nla_policy ethnl_linkmodes_get_policy[] = { [ETHTOOL_A_LINKMODES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; data->lsettings = &data->ksettings.base; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = __ethtool_get_link_ksettings(dev, &data->ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); goto out; } if (!dev->ethtool_ops->cap_link_lanes_supported) data->ksettings.lanes = 0; data->peer_empty = bitmap_empty(data->ksettings.link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); out: ethnl_ops_complete(dev); return ret; } static int linkmodes_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int len, ret; len = nla_total_size(sizeof(u8)) /* LINKMODES_AUTONEG */ + nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */ + nla_total_size(sizeof(u32)) /* LINKMODES_LANES */ + nla_total_size(sizeof(u8)) /* LINKMODES_DUPLEX */ + nla_total_size(sizeof(u8)) /* LINKMODES_RATE_MATCHING */ + 0; ret = ethnl_bitset_size(ksettings->link_modes.advertising, ksettings->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, compact); if (ret < 0) return ret; len += ret; if (!data->peer_empty) { ret = ethnl_bitset_size(ksettings->link_modes.lp_advertising, NULL, __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, compact); if (ret < 0) return ret; len += ret; } if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED) len += nla_total_size(sizeof(u8)); if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED) len += nla_total_size(sizeof(u8)); return len; } static int linkmodes_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int ret; if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_AUTONEG, lsettings->autoneg)) return -EMSGSIZE; ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_OURS, ksettings->link_modes.advertising, ksettings->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, compact); if (ret < 0) return -EMSGSIZE; if (!data->peer_empty) { ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_PEER, ksettings->link_modes.lp_advertising, NULL, __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, compact); if (ret < 0) return -EMSGSIZE; } if (nla_put_u32(skb, ETHTOOL_A_LINKMODES_SPEED, lsettings->speed) || nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex)) return -EMSGSIZE; if (ksettings->lanes && nla_put_u32(skb, ETHTOOL_A_LINKMODES_LANES, ksettings->lanes)) return -EMSGSIZE; if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED && nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, lsettings->master_slave_cfg)) return -EMSGSIZE; if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED && nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, lsettings->master_slave_state)) return -EMSGSIZE; if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_RATE_MATCHING, lsettings->rate_matching)) return -EMSGSIZE; return 0; } /* LINKMODES_SET */ const struct nla_policy ethnl_linkmodes_set_policy[] = { [ETHTOOL_A_LINKMODES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_NESTED }, [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 }, [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_U8 }, [ETHTOOL_A_LINKMODES_LANES] = NLA_POLICY_RANGE(NLA_U32, 1, 8), }; /* Set advertised link modes to all supported modes matching requested speed, * lanes and duplex values. Called when autonegotiation is on, speed, lanes or * duplex is requested but no link mode change. This is done in userspace with * ioctl() interface, move it into kernel for netlink. * Returns true if advertised modes bitmap was modified. */ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, bool req_speed, bool req_lanes, bool req_duplex) { unsigned long *advertising = ksettings->link_modes.advertising; unsigned long *supported = ksettings->link_modes.supported; DECLARE_BITMAP(old_adv, __ETHTOOL_LINK_MODE_MASK_NBITS); unsigned int i; bitmap_copy(old_adv, advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); for (i = 0; i < __ETHTOOL_LINK_MODE_MASK_NBITS; i++) { const struct link_mode_info *info = &link_mode_params[i]; if (info->speed == SPEED_UNKNOWN) continue; if (test_bit(i, supported) && (!req_speed || info->speed == ksettings->base.speed) && (!req_lanes || info->lanes == ksettings->lanes) && (!req_duplex || info->duplex == ksettings->base.duplex)) set_bit(i, advertising); else clear_bit(i, advertising); } return !bitmap_equal(old_adv, advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); } static bool ethnl_validate_master_slave_cfg(u8 cfg) { switch (cfg) { case MASTER_SLAVE_CFG_MASTER_PREFERRED: case MASTER_SLAVE_CFG_SLAVE_PREFERRED: case MASTER_SLAVE_CFG_MASTER_FORCE: case MASTER_SLAVE_CFG_SLAVE_FORCE: return true; } return false; } static int ethnl_check_linkmodes(struct genl_info *info, struct nlattr **tb) { const struct nlattr *master_slave_cfg, *lanes_cfg; master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; if (master_slave_cfg && !ethnl_validate_master_slave_cfg(nla_get_u8(master_slave_cfg))) { NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, "master/slave value is invalid"); return -EOPNOTSUPP; } lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES]; if (lanes_cfg && !is_power_of_2(nla_get_u32(lanes_cfg))) { NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg, "lanes value is invalid"); return -EINVAL; } return 0; } static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, struct ethtool_link_ksettings *ksettings, bool *mod, const struct net_device *dev) { struct ethtool_link_settings *lsettings = &ksettings->base; bool req_speed, req_lanes, req_duplex; const struct nlattr *master_slave_cfg, *lanes_cfg; int ret; master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; if (master_slave_cfg) { if (lsettings->master_slave_cfg == MASTER_SLAVE_CFG_UNSUPPORTED) { NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, "master/slave configuration not supported by device"); return -EOPNOTSUPP; } } *mod = false; req_speed = tb[ETHTOOL_A_LINKMODES_SPEED]; req_lanes = tb[ETHTOOL_A_LINKMODES_LANES]; req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX]; ethnl_update_u8(&lsettings->autoneg, tb[ETHTOOL_A_LINKMODES_AUTONEG], mod); lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES]; if (lanes_cfg) { /* If autoneg is off and lanes parameter is not supported by the * driver, return an error. */ if (!lsettings->autoneg && !dev->ethtool_ops->cap_link_lanes_supported) { NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg, "lanes configuration not supported by device"); return -EOPNOTSUPP; } } else if (!lsettings->autoneg && ksettings->lanes) { /* If autoneg is off and lanes parameter is not passed from user but * it was defined previously then set the lanes parameter to 0. */ ksettings->lanes = 0; *mod = true; } ret = ethnl_update_bitset(ksettings->link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS, tb[ETHTOOL_A_LINKMODES_OURS], link_mode_names, info->extack, mod); if (ret < 0) return ret; ethnl_update_u32(&lsettings->speed, tb[ETHTOOL_A_LINKMODES_SPEED], mod); ethnl_update_u32(&ksettings->lanes, lanes_cfg, mod); ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX], mod); ethnl_update_u8(&lsettings->master_slave_cfg, master_slave_cfg, mod); if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg && (req_speed || req_lanes || req_duplex) && ethnl_auto_linkmodes(ksettings, req_speed, req_lanes, req_duplex)) *mod = true; return 0; } static int ethnl_set_linkmodes_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; int ret; ret = ethnl_check_linkmodes(info, info->attrs); if (ret < 0) return ret; if (!ops->get_link_ksettings || !ops->set_link_ksettings) return -EOPNOTSUPP; return 1; } static int ethnl_set_linkmodes(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_link_ksettings ksettings = {}; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; int ret; ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); return ret; } ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod, dev); if (ret < 0) return ret; if (!mod) return 0; ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "link settings update failed"); return ret; } return 1; } const struct ethnl_request_ops ethnl_linkmodes_request_ops = { .request_cmd = ETHTOOL_MSG_LINKMODES_GET, .reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY, .hdr_attr = ETHTOOL_A_LINKMODES_HEADER, .req_info_size = sizeof(struct linkmodes_req_info), .reply_data_size = sizeof(struct linkmodes_reply_data), .prepare_data = linkmodes_prepare_data, .reply_size = linkmodes_reply_size, .fill_reply = linkmodes_fill_reply, .set_validate = ethnl_set_linkmodes_validate, .set = ethnl_set_linkmodes, .set_ntf_cmd = ETHTOOL_MSG_LINKMODES_NTF, };
6 6 6 6 7 7 7 7 7 7 5 4 5 1 1 3 3 2 2 2 4 7 5 7 6 11 11 11 6 7 7 1 6 7 7 1 7 434 435 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle incoming frames * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/netfilter_bridge.h> #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE #include <net/netfilter/nf_queue.h> #endif #include <linux/neighbour.h> #include <net/arp.h> #include <net/dsa.h> #include <linux/export.h> #include <linux/rculist.h> #include "br_private.h" #include "br_private_tunnel.h" static int br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { br_drop_fake_rtable(skb); return netif_receive_skb(skb); } static int br_pass_frame_up(struct sk_buff *skb, bool promisc) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); struct net_bridge_vlan_group *vg; dev_sw_netstats_rx_add(brdev, skb->len); vg = br_vlan_group_rcu(br); /* Reset the offload_fwd_mark because there could be a stacked * bridge above, and it should not think this bridge it doing * that bridge's work forwarding out its ports. */ br_switchdev_frame_unmark(skb); /* Bridge is just like any other port. Make sure the * packet is allowed except in promisc mode when someone * may be running packet capture. */ if (!(brdev->flags & IFF_PROMISC) && !br_allowed_egress(vg, skb)) { kfree_skb(skb); return NET_RX_DROP; } indev = skb->dev; skb->dev = brdev; skb = br_handle_vlan(br, NULL, vg, skb); if (!skb) return NET_RX_DROP; /* update the multicast stats if the packet is IGMP/MLD */ br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb), BR_MCAST_DIR_TX); BR_INPUT_SKB_CB(skb)->promisc = promisc; return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(indev), NULL, skb, indev, NULL, br_netif_receive_skb); } /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct net_bridge_port *p = br_port_get_rcu(skb->dev); enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; struct net_bridge_mcast *brmctx; struct net_bridge_vlan *vlan; struct net_bridge *br; bool promisc; u16 vid = 0; u8 state; if (!p) goto drop; br = p->br; if (br_mst_is_enabled(p)) { state = BR_STATE_FORWARDING; } else { if (p->state == BR_STATE_DISABLED) { reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; goto drop; } state = p->state; } brmctx = &p->br->multicast_ctx; pmctx = &p->multicast_ctx; if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, &state, &vlan)) goto out; if (p->flags & BR_PORT_LOCKED) { struct net_bridge_fdb_entry *fdb_src = br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid); if (!fdb_src) { /* FDB miss. Create locked FDB entry if MAB is enabled * and drop the packet. */ if (p->flags & BR_PORT_MAB) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, BIT(BR_FDB_LOCKED)); goto drop; } else if (READ_ONCE(fdb_src->dst) != p || test_bit(BR_FDB_LOCAL, &fdb_src->flags)) { /* FDB mismatch. Drop the packet without roaming. */ goto drop; } else if (test_bit(BR_FDB_LOCKED, &fdb_src->flags)) { /* FDB match, but entry is locked. Refresh it and drop * the packet. */ br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, BIT(BR_FDB_LOCKED)); goto drop; } } nbp_switchdev_frame_mark(p, skb); /* insert into forwarding database after filtering to avoid spoofing */ if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); promisc = !!(br->dev->flags & IFF_PROMISC); local_rcv = promisc; if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { /* by definition the broadcast is also a multicast address */ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) { pkt_type = BR_PKT_BROADCAST; local_rcv = true; } else { pkt_type = BR_PKT_MULTICAST; if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid)) goto drop; } } if (state == BR_STATE_LEARNING) { reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; goto drop; } BR_INPUT_SKB_CB(skb)->brdev = br->dev; BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED); if (IS_ENABLED(CONFIG_INET) && (skb->protocol == htons(ETH_P_ARP) || skb->protocol == htons(ETH_P_RARP))) { br_do_proxy_suppress_arp(skb, br, vid, p); } else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)) && ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { struct nd_msg *msg, _msg; msg = br_is_nd_neigh_msg(skb, &_msg); if (msg) br_do_suppress_nd(skb, br, vid, p, msg); } switch (pkt_type) { case BR_PKT_MULTICAST: mdst = br_mdb_entry_skb_get(brmctx, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || br_multicast_is_router(brmctx, skb) || br->dev->flags & IFF_ALLMULTI) { local_rcv = true; DEV_STATS_INC(br->dev, multicast); } mcast_hit = true; } else { local_rcv = true; DEV_STATS_INC(br->dev, multicast); } break; case BR_PKT_UNICAST: dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid); if (unlikely(!dst && vid && br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0))) { dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, 0); if (dst && (!test_bit(BR_FDB_LOCAL, &dst->flags) || test_bit(BR_FDB_ADDED_BY_USER, &dst->flags))) dst = NULL; } break; default: break; } if (dst) { unsigned long now = jiffies; if (test_bit(BR_FDB_LOCAL, &dst->flags)) return br_pass_frame_up(skb, false); if (now != dst->used) dst->used = now; br_forward(dst->dst, skb, local_rcv, false); } else { if (!mcast_hit) br_flood(br, skb, pkt_type, local_rcv, false, vid); else br_multicast_flood(mdst, skb, brmctx, local_rcv, false); } if (local_rcv) return br_pass_frame_up(skb, promisc); out: return 0; drop: kfree_skb_reason(skb, reason); goto out; } EXPORT_SYMBOL_GPL(br_handle_frame_finish); static void __br_handle_local_finish(struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); u16 vid = 0; /* check if vlan is allowed, to avoid spoofing */ if ((p->flags & BR_LEARNING) && nbp_state_should_learn(p) && !br_opt_get(p->br, BROPT_NO_LL_LEARN) && br_should_learn(p, skb, &vid)) br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0); } /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { __br_handle_local_finish(skb); /* return 1 to signal the okfn() was called so it's ok to use the skb */ return 1; } static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE struct nf_hook_entries *e = NULL; struct nf_hook_state state; unsigned int verdict, i; struct net *net; int ret; net = dev_net(skb->dev); #ifdef HAVE_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_BRIDGE][NF_BR_PRE_ROUTING])) goto frame_finish; #endif e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]); if (!e) goto frame_finish; nf_hook_state_init(&state, NF_BR_PRE_ROUTING, NFPROTO_BRIDGE, skb->dev, NULL, NULL, net, br_handle_frame_finish); for (i = 0; i < e->num_hook_entries; i++) { verdict = nf_hook_entry_hookfn(&e->hooks[i], skb, &state); switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: if (BR_INPUT_SKB_CB(skb)->br_netfilter_broute) { *pskb = skb; return RX_HANDLER_PASS; } break; case NF_DROP: kfree_skb(skb); return RX_HANDLER_CONSUMED; case NF_QUEUE: ret = nf_queue(skb, &state, i, verdict); if (ret == 1) continue; return RX_HANDLER_CONSUMED; default: /* STOLEN */ return RX_HANDLER_CONSUMED; } } frame_finish: net = dev_net(skb->dev); br_handle_frame_finish(net, NULL, skb); #else br_handle_frame_finish(dev_net(skb->dev), NULL, skb); #endif return RX_HANDLER_CONSUMED; } /* Return 0 if the frame was not processed otherwise 1 * note: already called with rcu_read_lock */ static int br_process_frame_type(struct net_bridge_port *p, struct sk_buff *skb) { struct br_frame_type *tmp; hlist_for_each_entry_rcu(tmp, &p->br->frame_type_list, list) if (unlikely(tmp->type == skb->protocol)) return tmp->frame_handler(p, skb); return 0; } /* * Return NULL if skb is handled * note: already called with rcu_read_lock */ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct net_bridge_port *p; struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)->h_dest; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) { reason = SKB_DROP_REASON_MAC_INVALID_SOURCE; goto drop; } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return RX_HANDLER_CONSUMED; memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); br_tc_skb_miss_set(skb, false); p = br_port_get_rcu(skb->dev); if (p->flags & BR_VLAN_TUNNEL) br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p)); if (unlikely(is_link_local_ether_addr(dest))) { u16 fwd_mask = p->br->group_fwd_mask_required; /* * See IEEE 802.1D Table 7-10 Reserved addresses * * Assignment Value * Bridge Group Address 01-80-C2-00-00-00 * (MAC Control) 802.3 01-80-C2-00-00-01 * (Link Aggregation) 802.3 01-80-C2-00-00-02 * 802.1X PAE address 01-80-C2-00-00-03 * * 802.1AB LLDP 01-80-C2-00-00-0E * * Others reserved for future standardization */ fwd_mask |= p->group_fwd_mask; switch (dest[5]) { case 0x00: /* Bridge Group Address */ /* If STP is turned off, then must forward to keep loop detection */ if (p->br->stp_enabled == BR_NO_STP || fwd_mask & (1u << dest[5])) goto forward; *pskb = skb; __br_handle_local_finish(skb); return RX_HANDLER_PASS; case 0x01: /* IEEE MAC (Pause) */ reason = SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL; goto drop; case 0x0E: /* 802.1AB LLDP */ fwd_mask |= p->br->group_fwd_mask; if (fwd_mask & (1u << dest[5])) goto forward; *pskb = skb; __br_handle_local_finish(skb); return RX_HANDLER_PASS; default: /* Allow selective forwarding for most other protocols */ fwd_mask |= p->br->group_fwd_mask; if (fwd_mask & (1u << dest[5])) goto forward; } BR_INPUT_SKB_CB(skb)->promisc = false; /* The else clause should be hit when nf_hook(): * - returns < 0 (drop/error) * - returns = 0 (stolen/nf_queue) * Thus return 1 from the okfn() to signal the skb is ok to pass */ if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev), NULL, skb, skb->dev, NULL, br_handle_local_finish) == 1) { return RX_HANDLER_PASS; } else { return RX_HANDLER_CONSUMED; } } if (unlikely(br_process_frame_type(p, skb))) return RX_HANDLER_PASS; forward: if (br_mst_is_enabled(p)) goto defer_stp_filtering; switch (p->state) { case BR_STATE_FORWARDING: case BR_STATE_LEARNING: defer_stp_filtering: if (ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; return nf_hook_bridge_pre(skb, pskb); default: reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; drop: kfree_skb_reason(skb, reason); } return RX_HANDLER_CONSUMED; } /* This function has no purpose other than to appease the br_port_get_rcu/rtnl * helpers which identify bridged ports according to the rx_handler installed * on them (so there _needs_ to be a bridge rx_handler even if we don't need it * to do anything useful). This bridge won't support traffic to/from the stack, * but only hardware bridging. So return RX_HANDLER_PASS so we don't steal * frames from the ETH_P_XDSA packet_type handler. */ static rx_handler_result_t br_handle_frame_dummy(struct sk_buff **pskb) { return RX_HANDLER_PASS; } rx_handler_func_t *br_get_rx_handler(const struct net_device *dev) { if (netdev_uses_dsa(dev)) return br_handle_frame_dummy; return br_handle_frame; } void br_add_frame(struct net_bridge *br, struct br_frame_type *ft) { hlist_add_head_rcu(&ft->list, &br->frame_type_list); } void br_del_frame(struct net_bridge *br, struct br_frame_type *ft) { struct br_frame_type *tmp; hlist_for_each_entry(tmp, &br->frame_type_list, list) if (ft == tmp) { hlist_del_rcu(&ft->list); return; } }
116 7 2 2 104 4 73 35 114 99 23 1 1 2 1 18 27 7 6 18 25 18 2 16 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook */ #include <linux/slab.h> #include <linux/bpf.h> #include <linux/btf.h> #include "map_in_map.h" struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) { struct bpf_map *inner_map, *inner_map_meta; u32 inner_map_meta_size; CLASS(fd, f)(inner_map_ufd); inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) return ERR_PTR(-EINVAL); if (!inner_map->ops->map_meta_equal) return ERR_PTR(-ENOTSUPP); inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) inner_map_meta_size = sizeof(struct bpf_array); inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) return ERR_PTR(-ENOMEM); inner_map_meta->map_type = inner_map->map_type; inner_map_meta->key_size = inner_map->key_size; inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->record = btf_record_dup(inner_map->record); if (IS_ERR(inner_map_meta->record)) { /* btf_record_dup returns NULL or valid pointer in case of * invalid/empty/valid, but ERR_PTR in case of errors. During * equality NULL or IS_ERR is equivalent. */ struct bpf_map *ret = ERR_CAST(inner_map_meta->record); kfree(inner_map_meta); return ret; } /* Note: We must use the same BTF, as we also used btf_record_dup above * which relies on BTF being same for both maps, as some members like * record->fields.list_head have pointers like value_rec pointing into * inner_map->btf. */ if (inner_map->btf) { btf_get(inner_map->btf); inner_map_meta->btf = inner_map->btf; } /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) { struct bpf_array *inner_array_meta = container_of(inner_map_meta, struct bpf_array, map); struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map); inner_array_meta->index_mask = inner_array->index_mask; inner_array_meta->elem_size = inner_array->elem_size; inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; } return inner_map_meta; } void bpf_map_meta_free(struct bpf_map *map_meta) { bpf_map_free_record(map_meta); btf_put(map_meta->btf); kfree(map_meta); } bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { /* No need to compare ops because it is covered by map_type */ return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && meta0->map_flags == meta1->map_flags && btf_record_equal(meta0->record, meta1->record); } void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int ufd) { struct bpf_map *inner_map, *inner_map_meta; CLASS(fd, f)(ufd); inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; inner_map_meta = map->inner_map_meta; if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map)) bpf_map_inc(inner_map); else inner_map = ERR_PTR(-EINVAL); return inner_map; } void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { struct bpf_map *inner_map = ptr; /* Defer the freeing of inner map according to the sleepable attribute * of bpf program which owns the outer map, so unnecessary waiting for * RCU tasks trace grace period can be avoided. */ if (need_defer) { if (atomic64_read(&map->sleepable_refcnt)) WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); else WRITE_ONCE(inner_map->free_after_rcu_gp, true); } bpf_map_put(inner_map); } u32 bpf_map_fd_sys_lookup_elem(void *ptr) { return ((struct bpf_map *)ptr)->id; }
2 3 2 2 2 2 2 3 3 2 2 1 2 2 1 1 1 1 1 1 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 // SPDX-License-Identifier: GPL-2.0-only /* * TCP Vegas congestion control * * This is based on the congestion detection/avoidance scheme described in * Lawrence S. Brakmo and Larry L. Peterson. * "TCP Vegas: End to end congestion avoidance on a global internet." * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, * October 1995. Available from: * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps * * See http://www.cs.arizona.edu/xkernel/ for their implementation. * The main aspects that distinguish this implementation from the * Arizona Vegas implementation are: * o We do not change the loss detection or recovery mechanisms of * Linux in any way. Linux already recovers from losses quite well, * using fine-grained timers, NewReno, and FACK. * o To avoid the performance penalty imposed by increasing cwnd * only every-other RTT during slow start, we increase during * every RTT during slow start, just like Reno. * o Largely to allow continuous cwnd growth during slow start, * we use the rate at which ACKs come back as the "actual" * rate, rather than the rate at which data is sent. * o To speed convergence to the right rate, we set the cwnd * to achieve the right ("actual") rate when we exit slow start. * o To filter out the noise caused by delayed ACKs, we use the * minimum RTT sample observed during the last RTT to calculate * the actual rate. * o When the sender re-starts from idle, it waits until it has * received ACKs for an entire flight of new data before making * a cwnd adjustment decision. The original Vegas implementation * assumed senders never went idle. */ #include <linux/mm.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/inet_diag.h> #include <net/tcp.h> #include "tcp_vegas.h" static int alpha = 2; static int beta = 4; static int gamma = 1; module_param(alpha, int, 0644); MODULE_PARM_DESC(alpha, "lower bound of packets in network"); module_param(beta, int, 0644); MODULE_PARM_DESC(beta, "upper bound of packets in network"); module_param(gamma, int, 0644); MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); /* There are several situations when we must "re-start" Vegas: * * o when a connection is established * o after an RTO * o after fast recovery * o when we send a packet and there is no outstanding * unacknowledged data (restarting an idle connection) * * In these circumstances we cannot do a Vegas calculation at the * end of the first RTT, because any calculation we do is using * stale info -- both the saved cwnd and congestion feedback are * stale. * * Instead we must wait until the completion of an RTT during * which we actually receive ACKs. */ static void vegas_enable(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); /* Begin taking Vegas samples next time we send something. */ vegas->doing_vegas_now = 1; /* Set the beginning of the next send window. */ vegas->beg_snd_nxt = tp->snd_nxt; vegas->cntRTT = 0; vegas->minRTT = 0x7fffffff; } /* Stop taking Vegas samples for now. */ static inline void vegas_disable(struct sock *sk) { struct vegas *vegas = inet_csk_ca(sk); vegas->doing_vegas_now = 0; } void tcp_vegas_init(struct sock *sk) { struct vegas *vegas = inet_csk_ca(sk); vegas->baseRTT = 0x7fffffff; vegas_enable(sk); } EXPORT_SYMBOL_GPL(tcp_vegas_init); /* Do RTT sampling needed for Vegas. * Basically we: * o min-filter RTT samples from within an RTT to get the current * propagation delay + queuing delay (we are min-filtering to try to * avoid the effects of delayed ACKs) * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct vegas *vegas = inet_csk_ca(sk); u32 vrtt; if (sample->rtt_us < 0) return; /* Never allow zero rtt or baseRTT */ vrtt = sample->rtt_us + 1; /* Filter to find propagation delay: */ if (vrtt < vegas->baseRTT) vegas->baseRTT = vrtt; /* Find the min RTT during the last RTT to find * the current prop. delay + queuing delay: */ vegas->minRTT = min(vegas->minRTT, vrtt); vegas->cntRTT++; } EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked); void tcp_vegas_state(struct sock *sk, u8 ca_state) { if (ca_state == TCP_CA_Open) vegas_enable(sk); else vegas_disable(sk); } EXPORT_SYMBOL_GPL(tcp_vegas_state); /* * If the connection is idle and we are restarting, * then we don't want to do any Vegas calculations * until we get fresh RTT samples. So when we * restart, we reset our Vegas state to a clean * slate. After we get acks for this flight of * packets, _then_ we can make Vegas calculations * again. */ void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) tcp_vegas_init(sk); } EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) { return min(tp->snd_ssthresh, tcp_snd_cwnd(tp)); } static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) { tcp_reno_cong_avoid(sk, ack, acked); return; } if (after(ack, vegas->beg_snd_nxt)) { /* Do the Vegas once-per-RTT cwnd adjustment. */ /* Save the extent of the current window so we can use this * at the end of the next RTT. */ vegas->beg_snd_nxt = tp->snd_nxt; /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got * at least one RTT sample that wasn't from a delayed ACK. * If we only had 2 samples total, * then that means we're getting only 1 ACK per RTT, which * means they're almost certainly delayed ACKs. * If we have 3 samples, we should be OK. */ if (vegas->cntRTT <= 2) { /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ tcp_reno_cong_avoid(sk, ack, acked); } else { u32 rtt, diff; u64 target_cwnd; /* We have enough RTT samples, so, using the Vegas * algorithm, we determine if we should increase or * decrease cwnd, and by how much. */ /* Pluck out the RTT we are using for the Vegas * calculations. This is the min RTT seen during the * last RTT. Taking the min filters out the effects * of delayed ACKs, at the cost of noticing congestion * a bit later. */ rtt = vegas->minRTT; /* Calculate the cwnd we should have, if we weren't * going too fast. * * This is: * (actual rate in segments) * baseRTT */ target_cwnd = (u64)tcp_snd_cwnd(tp) * vegas->baseRTT; do_div(target_cwnd, rtt); /* Calculate the difference between the window we had, * and the window we would like to have. This quantity * is the "Diff" from the Arizona Vegas papers. */ diff = tcp_snd_cwnd(tp) * (rtt-vegas->baseRTT) / vegas->baseRTT; if (diff > gamma && tcp_in_slow_start(tp)) { /* Going too fast. Time to slow down * and switch to congestion avoidance. */ /* Set cwnd to match the actual rate * exactly: * cwnd = (actual rate) * baseRTT * Then we add 1 because the integer * truncation robs us of full link * utilization. */ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), (u32)target_cwnd + 1)); tp->snd_ssthresh = tcp_vegas_ssthresh(tp); } else if (tcp_in_slow_start(tp)) { /* Slow start. */ tcp_slow_start(tp, acked); } else { /* Congestion avoidance. */ /* Figure out where we would like cwnd * to be. */ if (diff > beta) { /* The old window was too fast, so * we slow down. */ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1); tp->snd_ssthresh = tcp_vegas_ssthresh(tp); } else if (diff < alpha) { /* We don't have enough extra packets * in the network, so speed up. */ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); } else { /* Sending just as fast as we * should be. */ } } if (tcp_snd_cwnd(tp) < 2) tcp_snd_cwnd_set(tp, 2); else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp) tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp); tp->snd_ssthresh = tcp_current_ssthresh(sk); } /* Wipe the slate clean for the next RTT. */ vegas->cntRTT = 0; vegas->minRTT = 0x7fffffff; } /* Use normal slow start */ else if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); } /* Extract info for Tcp socket info provided via netlink. */ size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { const struct vegas *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { info->vegas.tcpv_enabled = ca->doing_vegas_now; info->vegas.tcpv_rttcnt = ca->cntRTT; info->vegas.tcpv_rtt = ca->baseRTT; info->vegas.tcpv_minrtt = ca->minRTT; *attr = INET_DIAG_VEGASINFO; return sizeof(struct tcpvegas_info); } return 0; } EXPORT_SYMBOL_GPL(tcp_vegas_get_info); static struct tcp_congestion_ops tcp_vegas __read_mostly = { .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_vegas_cong_avoid, .pkts_acked = tcp_vegas_pkts_acked, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, .owner = THIS_MODULE, .name = "vegas", }; static int __init tcp_vegas_register(void) { BUILD_BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_vegas); return 0; } static void __exit tcp_vegas_unregister(void) { tcp_unregister_congestion_control(&tcp_vegas); } module_init(tcp_vegas_register); module_exit(tcp_vegas_unregister); MODULE_AUTHOR("Stephen Hemminger"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP Vegas");
5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/clnt.c * * This file contains the high-level RPC interface. * It is modeled as a finite state machine to support both synchronous * and asynchronous requests. * * - RPC header generation and argument serialization. * - Credential refresh. * - TCP connect handling. * - Retry of operation when it is suspected the operation failed because * of uid squashing on the server, or when the credentials were stale * and need to be refreshed, or when a packet was damaged in transit. * This may be have to be moved to the VFS layer. * * Copyright (C) 1992,1993 Rick Sladkey <jrs@world.std.com> * Copyright (C) 1995,1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kallsyms.h> #include <linux/mm.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/utsname.h> #include <linux/workqueue.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/un.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/bc_xprt.h> #include <trace/events/sunrpc.h> #include "sunrpc.h" #include "sysfs.h" #include "netns.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_CALL #endif static DECLARE_WAIT_QUEUE_HEAD(destroy_wait); static void call_start(struct rpc_task *task); static void call_reserve(struct rpc_task *task); static void call_reserveresult(struct rpc_task *task); static void call_allocate(struct rpc_task *task); static void call_encode(struct rpc_task *task); static void call_decode(struct rpc_task *task); static void call_bind(struct rpc_task *task); static void call_bind_status(struct rpc_task *task); static void call_transmit(struct rpc_task *task); static void call_status(struct rpc_task *task); static void call_transmit_status(struct rpc_task *task); static void call_refresh(struct rpc_task *task); static void call_refreshresult(struct rpc_task *task); static void call_connect(struct rpc_task *task); static void call_connect_status(struct rpc_task *task); static int rpc_encode_header(struct rpc_task *task, struct xdr_stream *xdr); static int rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr); static int rpc_ping(struct rpc_clnt *clnt); static int rpc_ping_noreply(struct rpc_clnt *clnt); static void rpc_check_timeout(struct rpc_task *task); static void rpc_register_client(struct rpc_clnt *clnt) { struct net *net = rpc_net_ns(clnt); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); spin_lock(&sn->rpc_client_lock); list_add(&clnt->cl_clients, &sn->all_clients); spin_unlock(&sn->rpc_client_lock); } static void rpc_unregister_client(struct rpc_clnt *clnt) { struct net *net = rpc_net_ns(clnt); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); spin_lock(&sn->rpc_client_lock); list_del(&clnt->cl_clients); spin_unlock(&sn->rpc_client_lock); } static void __rpc_clnt_remove_pipedir(struct rpc_clnt *clnt) { rpc_remove_client_dir(clnt); } static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt) { struct net *net = rpc_net_ns(clnt); struct super_block *pipefs_sb; pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { if (pipefs_sb == clnt->pipefs_sb) __rpc_clnt_remove_pipedir(clnt); rpc_put_sb_net(net); } } static int rpc_setup_pipedir_sb(struct super_block *sb, struct rpc_clnt *clnt) { static uint32_t clntid; const char *dir_name = clnt->cl_program->pipe_dir_name; char name[15]; struct dentry *dir; int err; dir = rpc_d_lookup_sb(sb, dir_name); if (dir == NULL) { pr_info("RPC: pipefs directory doesn't exist: %s\n", dir_name); return -ENOENT; } for (;;) { snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++); name[sizeof(name) - 1] = '\0'; err = rpc_create_client_dir(dir, name, clnt); if (!err) break; if (err == -EEXIST) continue; printk(KERN_INFO "RPC: Couldn't create pipefs entry" " %s/%s, error %d\n", dir_name, name, err); break; } dput(dir); return err; } static int rpc_setup_pipedir(struct super_block *pipefs_sb, struct rpc_clnt *clnt) { clnt->pipefs_sb = pipefs_sb; if (clnt->cl_program->pipe_dir_name != NULL) { int err = rpc_setup_pipedir_sb(pipefs_sb, clnt); if (err && err != -ENOENT) return err; } return 0; } static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event) { if (clnt->cl_program->pipe_dir_name == NULL) return 1; switch (event) { case RPC_PIPEFS_MOUNT: if (clnt->cl_pipedir_objects.pdh_dentry != NULL) return 1; if (refcount_read(&clnt->cl_count) == 0) return 1; break; case RPC_PIPEFS_UMOUNT: if (clnt->cl_pipedir_objects.pdh_dentry == NULL) return 1; break; } return 0; } static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event, struct super_block *sb) { switch (event) { case RPC_PIPEFS_MOUNT: return rpc_setup_pipedir_sb(sb, clnt); case RPC_PIPEFS_UMOUNT: __rpc_clnt_remove_pipedir(clnt); break; default: printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event); return -ENOTSUPP; } return 0; } static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event, struct super_block *sb) { int error = 0; for (;; clnt = clnt->cl_parent) { if (!rpc_clnt_skip_event(clnt, event)) error = __rpc_clnt_handle_event(clnt, event, sb); if (error || clnt == clnt->cl_parent) break; } return error; } static struct rpc_clnt *rpc_get_client_for_event(struct net *net, int event) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_clnt *clnt; spin_lock(&sn->rpc_client_lock); list_for_each_entry(clnt, &sn->all_clients, cl_clients) { if (rpc_clnt_skip_event(clnt, event)) continue; spin_unlock(&sn->rpc_client_lock); return clnt; } spin_unlock(&sn->rpc_client_lock); return NULL; } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct super_block *sb = ptr; struct rpc_clnt *clnt; int error = 0; while ((clnt = rpc_get_client_for_event(sb->s_fs_info, event))) { error = __rpc_pipefs_event(clnt, event, sb); if (error) break; } return error; } static struct notifier_block rpc_clients_block = { .notifier_call = rpc_pipefs_event, .priority = SUNRPC_PIPEFS_RPC_PRIO, }; int rpc_clients_notifier_register(void) { return rpc_pipefs_notifier_register(&rpc_clients_block); } void rpc_clients_notifier_unregister(void) { return rpc_pipefs_notifier_unregister(&rpc_clients_block); } static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, struct rpc_xprt *xprt, const struct rpc_timeout *timeout) { struct rpc_xprt *old; spin_lock(&clnt->cl_lock); old = rcu_dereference_protected(clnt->cl_xprt, lockdep_is_held(&clnt->cl_lock)); clnt->cl_timeout = timeout; rcu_assign_pointer(clnt->cl_xprt, xprt); spin_unlock(&clnt->cl_lock); return old; } static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename) { ssize_t copied; copied = strscpy(clnt->cl_nodename, nodename, sizeof(clnt->cl_nodename)); clnt->cl_nodelen = copied < 0 ? sizeof(clnt->cl_nodename) - 1 : copied; } static int rpc_client_register(struct rpc_clnt *clnt, rpc_authflavor_t pseudoflavor, const char *client_name) { struct rpc_auth_create_args auth_args = { .pseudoflavor = pseudoflavor, .target_name = client_name, }; struct rpc_auth *auth; struct net *net = rpc_net_ns(clnt); struct super_block *pipefs_sb; int err; rpc_clnt_debugfs_register(clnt); pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { err = rpc_setup_pipedir(pipefs_sb, clnt); if (err) goto out; } rpc_register_client(clnt); if (pipefs_sb) rpc_put_sb_net(net); auth = rpcauth_create(&auth_args, clnt); if (IS_ERR(auth)) { dprintk("RPC: Couldn't create auth handle (flavor %u)\n", pseudoflavor); err = PTR_ERR(auth); goto err_auth; } return 0; err_auth: pipefs_sb = rpc_get_sb_net(net); rpc_unregister_client(clnt); __rpc_clnt_remove_pipedir(clnt); out: if (pipefs_sb) rpc_put_sb_net(net); rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); return err; } static DEFINE_IDA(rpc_clids); void rpc_cleanup_clids(void) { ida_destroy(&rpc_clids); } static int rpc_alloc_clid(struct rpc_clnt *clnt) { int clid; clid = ida_alloc(&rpc_clids, GFP_KERNEL); if (clid < 0) return clid; clnt->cl_clid = clid; return 0; } static void rpc_free_clid(struct rpc_clnt *clnt) { ida_free(&rpc_clids, clnt->cl_clid); } static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, struct rpc_clnt *parent) { const struct rpc_program *program = args->program; const struct rpc_version *version; struct rpc_clnt *clnt = NULL; const struct rpc_timeout *timeout; const char *nodename = args->nodename; int err; err = rpciod_up(); if (err) goto out_no_rpciod; err = -EINVAL; if (args->version >= program->nrvers) goto out_err; version = program->version[args->version]; if (version == NULL) goto out_err; err = -ENOMEM; clnt = kzalloc(sizeof(*clnt), GFP_KERNEL); if (!clnt) goto out_err; clnt->cl_parent = parent ? : clnt; clnt->cl_xprtsec = args->xprtsec; err = rpc_alloc_clid(clnt); if (err) goto out_no_clid; clnt->cl_cred = get_cred(args->cred); clnt->cl_procinfo = version->procs; clnt->cl_maxproc = version->nrprocs; clnt->cl_prog = args->prognumber ? : program->number; clnt->cl_vers = version->number; clnt->cl_stats = args->stats ? : program->stats; clnt->cl_metrics = rpc_alloc_iostats(clnt); rpc_init_pipe_dir_head(&clnt->cl_pipedir_objects); err = -ENOMEM; if (clnt->cl_metrics == NULL) goto out_no_stats; clnt->cl_program = program; INIT_LIST_HEAD(&clnt->cl_tasks); spin_lock_init(&clnt->cl_lock); timeout = xprt->timeout; if (args->timeout != NULL) { memcpy(&clnt->cl_timeout_default, args->timeout, sizeof(clnt->cl_timeout_default)); timeout = &clnt->cl_timeout_default; } rpc_clnt_set_transport(clnt, xprt, timeout); xprt->main = true; xprt_iter_init(&clnt->cl_xpi, xps); xprt_switch_put(xps); clnt->cl_rtt = &clnt->cl_rtt_default; rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval); refcount_set(&clnt->cl_count, 1); if (nodename == NULL) nodename = utsname()->nodename; /* save the nodename */ rpc_clnt_set_nodename(clnt, nodename); rpc_sysfs_client_setup(clnt, xps, rpc_net_ns(clnt)); err = rpc_client_register(clnt, args->authflavor, args->client_name); if (err) goto out_no_path; if (parent) refcount_inc(&parent->cl_count); trace_rpc_clnt_new(clnt, xprt, args); return clnt; out_no_path: rpc_free_iostats(clnt->cl_metrics); out_no_stats: put_cred(clnt->cl_cred); rpc_free_clid(clnt); out_no_clid: kfree(clnt); out_err: rpciod_down(); out_no_rpciod: xprt_switch_put(xps); xprt_put(xprt); trace_rpc_clnt_new_err(program->name, args->servername, err); return ERR_PTR(err); } static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, struct rpc_xprt *xprt) { struct rpc_clnt *clnt = NULL; struct rpc_xprt_switch *xps; if (args->bc_xprt && args->bc_xprt->xpt_bc_xps) { WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC)); xps = args->bc_xprt->xpt_bc_xps; xprt_switch_get(xps); } else { xps = xprt_switch_alloc(xprt, GFP_KERNEL); if (xps == NULL) { xprt_put(xprt); return ERR_PTR(-ENOMEM); } if (xprt->bc_xprt) { xprt_switch_get(xps); xprt->bc_xprt->xpt_bc_xps = xps; } } clnt = rpc_new_client(args, xps, xprt, NULL); if (IS_ERR(clnt)) return clnt; if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { int err = rpc_ping(clnt); if (err != 0) { rpc_shutdown_client(clnt); return ERR_PTR(err); } } else if (args->flags & RPC_CLNT_CREATE_CONNECTED) { int err = rpc_ping_noreply(clnt); if (err != 0) { rpc_shutdown_client(clnt); return ERR_PTR(err); } } clnt->cl_softrtry = 1; if (args->flags & (RPC_CLNT_CREATE_HARDRTRY|RPC_CLNT_CREATE_SOFTERR)) { clnt->cl_softrtry = 0; if (args->flags & RPC_CLNT_CREATE_SOFTERR) clnt->cl_softerr = 1; } if (args->flags & RPC_CLNT_CREATE_AUTOBIND) clnt->cl_autobind = 1; if (args->flags & RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT) clnt->cl_noretranstimeo = 1; if (args->flags & RPC_CLNT_CREATE_DISCRTRY) clnt->cl_discrtry = 1; if (!(args->flags & RPC_CLNT_CREATE_QUIET)) clnt->cl_chatty = 1; if (args->flags & RPC_CLNT_CREATE_NETUNREACH_FATAL) clnt->cl_netunreach_fatal = 1; return clnt; } /** * rpc_create - create an RPC client and transport with one call * @args: rpc_clnt create argument structure * * Creates and initializes an RPC transport and an RPC client. * * It can ping the server in order to determine if it is up, and to see if * it supports this program and version. RPC_CLNT_CREATE_NOPING disables * this behavior so asynchronous tasks can also use rpc_create. */ struct rpc_clnt *rpc_create(struct rpc_create_args *args) { struct rpc_xprt *xprt; struct xprt_create xprtargs = { .net = args->net, .ident = args->protocol, .srcaddr = args->saddress, .dstaddr = args->address, .addrlen = args->addrsize, .servername = args->servername, .bc_xprt = args->bc_xprt, .xprtsec = args->xprtsec, .connect_timeout = args->connect_timeout, .reconnect_timeout = args->reconnect_timeout, }; char servername[RPC_MAXNETNAMELEN]; struct rpc_clnt *clnt; int i; if (args->bc_xprt) { WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC)); xprt = args->bc_xprt->xpt_bc_xprt; if (xprt) { xprt_get(xprt); return rpc_create_xprt(args, xprt); } } if (args->flags & RPC_CLNT_CREATE_INFINITE_SLOTS) xprtargs.flags |= XPRT_CREATE_INFINITE_SLOTS; if (args->flags & RPC_CLNT_CREATE_NO_IDLE_TIMEOUT) xprtargs.flags |= XPRT_CREATE_NO_IDLE_TIMEOUT; /* * If the caller chooses not to specify a hostname, whip * up a string representation of the passed-in address. */ if (xprtargs.servername == NULL) { struct sockaddr_un *sun = (struct sockaddr_un *)args->address; struct sockaddr_in *sin = (struct sockaddr_in *)args->address; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)args->address; servername[0] = '\0'; switch (args->address->sa_family) { case AF_LOCAL: if (sun->sun_path[0]) snprintf(servername, sizeof(servername), "%s", sun->sun_path); else snprintf(servername, sizeof(servername), "@%s", sun->sun_path+1); break; case AF_INET: snprintf(servername, sizeof(servername), "%pI4", &sin->sin_addr.s_addr); break; case AF_INET6: snprintf(servername, sizeof(servername), "%pI6", &sin6->sin6_addr); break; default: /* caller wants default server name, but * address family isn't recognized. */ return ERR_PTR(-EINVAL); } xprtargs.servername = servername; } xprt = xprt_create_transport(&xprtargs); if (IS_ERR(xprt)) return (struct rpc_clnt *)xprt; /* * By default, kernel RPC client connects from a reserved port. * CAP_NET_BIND_SERVICE will not be set for unprivileged requesters, * but it is always enabled for rpciod, which handles the connect * operation. */ xprt->resvport = 1; if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT) xprt->resvport = 0; xprt->reuseport = 0; if (args->flags & RPC_CLNT_CREATE_REUSEPORT) xprt->reuseport = 1; clnt = rpc_create_xprt(args, xprt); if (IS_ERR(clnt) || args->nconnect <= 1) return clnt; for (i = 0; i < args->nconnect - 1; i++) { if (rpc_clnt_add_xprt(clnt, &xprtargs, NULL, NULL) < 0) break; } return clnt; } EXPORT_SYMBOL_GPL(rpc_create); /* * This function clones the RPC client structure. It allows us to share the * same transport while varying parameters such as the authentication * flavour. */ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, struct rpc_clnt *clnt) { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; struct rpc_clnt *new; int err; err = -ENOMEM; rcu_read_lock(); xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); rcu_read_unlock(); if (xprt == NULL || xps == NULL) { xprt_put(xprt); xprt_switch_put(xps); goto out_err; } args->servername = xprt->servername; args->nodename = clnt->cl_nodename; new = rpc_new_client(args, xps, xprt, clnt); if (IS_ERR(new)) return new; /* Turn off autobind on clones */ new->cl_autobind = 0; new->cl_softrtry = clnt->cl_softrtry; new->cl_softerr = clnt->cl_softerr; new->cl_noretranstimeo = clnt->cl_noretranstimeo; new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; new->cl_netunreach_fatal = clnt->cl_netunreach_fatal; new->cl_principal = clnt->cl_principal; new->cl_max_connect = clnt->cl_max_connect; return new; out_err: trace_rpc_clnt_clone_err(clnt, err); return ERR_PTR(err); } /** * rpc_clone_client - Clone an RPC client structure * * @clnt: RPC client whose parameters are copied * * Returns a fresh RPC client or an ERR_PTR. */ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt) { struct rpc_create_args args = { .program = clnt->cl_program, .prognumber = clnt->cl_prog, .version = clnt->cl_vers, .authflavor = clnt->cl_auth->au_flavor, .cred = clnt->cl_cred, .stats = clnt->cl_stats, }; return __rpc_clone_client(&args, clnt); } EXPORT_SYMBOL_GPL(rpc_clone_client); /** * rpc_clone_client_set_auth - Clone an RPC client structure and set its auth * * @clnt: RPC client whose parameters are copied * @flavor: security flavor for new client * * Returns a fresh RPC client or an ERR_PTR. */ struct rpc_clnt * rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor) { struct rpc_create_args args = { .program = clnt->cl_program, .prognumber = clnt->cl_prog, .version = clnt->cl_vers, .authflavor = flavor, .cred = clnt->cl_cred, .stats = clnt->cl_stats, }; return __rpc_clone_client(&args, clnt); } EXPORT_SYMBOL_GPL(rpc_clone_client_set_auth); /** * rpc_switch_client_transport: switch the RPC transport on the fly * @clnt: pointer to a struct rpc_clnt * @args: pointer to the new transport arguments * @timeout: pointer to the new timeout parameters * * This function allows the caller to switch the RPC transport for the * rpc_clnt structure 'clnt' to allow it to connect to a mirrored NFS * server, for instance. It assumes that the caller has ensured that * there are no active RPC tasks by using some form of locking. * * Returns zero if "clnt" is now using the new xprt. Otherwise a * negative errno is returned, and "clnt" continues to use the old * xprt. */ int rpc_switch_client_transport(struct rpc_clnt *clnt, struct xprt_create *args, const struct rpc_timeout *timeout) { const struct rpc_timeout *old_timeo; rpc_authflavor_t pseudoflavor; struct rpc_xprt_switch *xps, *oldxps; struct rpc_xprt *xprt, *old; struct rpc_clnt *parent; int err; args->xprtsec = clnt->cl_xprtsec; xprt = xprt_create_transport(args); if (IS_ERR(xprt)) return PTR_ERR(xprt); xps = xprt_switch_alloc(xprt, GFP_KERNEL); if (xps == NULL) { xprt_put(xprt); return -ENOMEM; } pseudoflavor = clnt->cl_auth->au_flavor; old_timeo = clnt->cl_timeout; old = rpc_clnt_set_transport(clnt, xprt, timeout); oldxps = xprt_iter_xchg_switch(&clnt->cl_xpi, xps); rpc_unregister_client(clnt); __rpc_clnt_remove_pipedir(clnt); rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); /* * A new transport was created. "clnt" therefore * becomes the root of a new cl_parent tree. clnt's * children, if it has any, still point to the old xprt. */ parent = clnt->cl_parent; clnt->cl_parent = clnt; /* * The old rpc_auth cache cannot be re-used. GSS * contexts in particular are between a single * client and server. */ err = rpc_client_register(clnt, pseudoflavor, NULL); if (err) goto out_revert; synchronize_rcu(); if (parent != clnt) rpc_release_client(parent); xprt_switch_put(oldxps); xprt_put(old); trace_rpc_clnt_replace_xprt(clnt); return 0; out_revert: xps = xprt_iter_xchg_switch(&clnt->cl_xpi, oldxps); rpc_clnt_set_transport(clnt, old, old_timeo); clnt->cl_parent = parent; rpc_client_register(clnt, pseudoflavor, NULL); xprt_switch_put(xps); xprt_put(xprt); trace_rpc_clnt_replace_xprt_err(clnt); return err; } EXPORT_SYMBOL_GPL(rpc_switch_client_transport); static struct rpc_xprt_switch *rpc_clnt_xprt_switch_get(struct rpc_clnt *clnt) { struct rpc_xprt_switch *xps; rcu_read_lock(); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); rcu_read_unlock(); return xps; } static int _rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi, void func(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps)) { struct rpc_xprt_switch *xps; xps = rpc_clnt_xprt_switch_get(clnt); if (xps == NULL) return -EAGAIN; func(xpi, xps); xprt_switch_put(xps); return 0; } static int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi) { return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listall); } static int rpc_clnt_xprt_iter_offline_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi) { return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listoffline); } /** * rpc_clnt_iterate_for_each_xprt - Apply a function to all transports * @clnt: pointer to client * @fn: function to apply * @data: void pointer to function data * * Iterates through the list of RPC transports currently attached to the * client and applies the function fn(clnt, xprt, data). * * On error, the iteration stops, and the function returns the error value. */ int rpc_clnt_iterate_for_each_xprt(struct rpc_clnt *clnt, int (*fn)(struct rpc_clnt *, struct rpc_xprt *, void *), void *data) { struct rpc_xprt_iter xpi; int ret; ret = rpc_clnt_xprt_iter_init(clnt, &xpi); if (ret) return ret; for (;;) { struct rpc_xprt *xprt = xprt_iter_get_next(&xpi); if (!xprt) break; ret = fn(clnt, xprt, data); xprt_put(xprt); if (ret < 0) break; } xprt_iter_destroy(&xpi); return ret; } EXPORT_SYMBOL_GPL(rpc_clnt_iterate_for_each_xprt); /* * Kill all tasks for the given client. * XXX: kill their descendants as well? */ void rpc_killall_tasks(struct rpc_clnt *clnt) { struct rpc_task *rovr; if (list_empty(&clnt->cl_tasks)) return; /* * Spin lock all_tasks to prevent changes... */ trace_rpc_clnt_killall(clnt); spin_lock(&clnt->cl_lock); list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) rpc_signal_task(rovr); spin_unlock(&clnt->cl_lock); } EXPORT_SYMBOL_GPL(rpc_killall_tasks); /** * rpc_cancel_tasks - try to cancel a set of RPC tasks * @clnt: Pointer to RPC client * @error: RPC task error value to set * @fnmatch: Pointer to selector function * @data: User data * * Uses @fnmatch to define a set of RPC tasks that are to be cancelled. * The argument @error must be a negative error value. */ unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error, bool (*fnmatch)(const struct rpc_task *, const void *), const void *data) { struct rpc_task *task; unsigned long count = 0; if (list_empty(&clnt->cl_tasks)) return 0; /* * Spin lock all_tasks to prevent changes... */ spin_lock(&clnt->cl_lock); list_for_each_entry(task, &clnt->cl_tasks, tk_task) { if (!RPC_IS_ACTIVATED(task)) continue; if (!fnmatch(task, data)) continue; rpc_task_try_cancel(task, error); count++; } spin_unlock(&clnt->cl_lock); return count; } EXPORT_SYMBOL_GPL(rpc_cancel_tasks); static int rpc_clnt_disconnect_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *dummy) { if (xprt_connected(xprt)) xprt_force_disconnect(xprt); return 0; } void rpc_clnt_disconnect(struct rpc_clnt *clnt) { rpc_clnt_iterate_for_each_xprt(clnt, rpc_clnt_disconnect_xprt, NULL); } EXPORT_SYMBOL_GPL(rpc_clnt_disconnect); /* * Properly shut down an RPC client, terminating all outstanding * requests. */ void rpc_shutdown_client(struct rpc_clnt *clnt) { might_sleep(); trace_rpc_clnt_shutdown(clnt); clnt->cl_shutdown = 1; while (!list_empty(&clnt->cl_tasks)) { rpc_killall_tasks(clnt); wait_event_timeout(destroy_wait, list_empty(&clnt->cl_tasks), 1*HZ); } /* wait for tasks still in workqueue or waitqueue */ wait_event_timeout(destroy_wait, atomic_read(&clnt->cl_task_count) == 0, 1 * HZ); rpc_release_client(clnt); } EXPORT_SYMBOL_GPL(rpc_shutdown_client); /* * Free an RPC client */ static void rpc_free_client_work(struct work_struct *work) { struct rpc_clnt *clnt = container_of(work, struct rpc_clnt, cl_work); trace_rpc_clnt_free(clnt); /* These might block on processes that might allocate memory, * so they cannot be called in rpciod, so they are handled separately * here. */ rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); rpc_free_clid(clnt); rpc_clnt_remove_pipedir(clnt); xprt_put(rcu_dereference_raw(clnt->cl_xprt)); kfree(clnt); rpciod_down(); } static struct rpc_clnt * rpc_free_client(struct rpc_clnt *clnt) { struct rpc_clnt *parent = NULL; trace_rpc_clnt_release(clnt); if (clnt->cl_parent != clnt) parent = clnt->cl_parent; rpc_unregister_client(clnt); rpc_free_iostats(clnt->cl_metrics); clnt->cl_metrics = NULL; xprt_iter_destroy(&clnt->cl_xpi); put_cred(clnt->cl_cred); INIT_WORK(&clnt->cl_work, rpc_free_client_work); schedule_work(&clnt->cl_work); return parent; } /* * Free an RPC client */ static struct rpc_clnt * rpc_free_auth(struct rpc_clnt *clnt) { /* * Note: RPCSEC_GSS may need to send NULL RPC calls in order to * release remaining GSS contexts. This mechanism ensures * that it can do so safely. */ if (clnt->cl_auth != NULL) { rpcauth_release(clnt->cl_auth); clnt->cl_auth = NULL; } if (refcount_dec_and_test(&clnt->cl_count)) return rpc_free_client(clnt); return NULL; } /* * Release reference to the RPC client */ void rpc_release_client(struct rpc_clnt *clnt) { do { if (list_empty(&clnt->cl_tasks)) wake_up(&destroy_wait); if (refcount_dec_not_one(&clnt->cl_count)) break; clnt = rpc_free_auth(clnt); } while (clnt != NULL); } EXPORT_SYMBOL_GPL(rpc_release_client); /** * rpc_bind_new_program - bind a new RPC program to an existing client * @old: old rpc_client * @program: rpc program to set * @vers: rpc program version * * Clones the rpc client and sets up a new RPC program. This is mainly * of use for enabling different RPC programs to share the same transport. * The Sun NFSv2/v3 ACL protocol can do this. */ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, const struct rpc_program *program, u32 vers) { struct rpc_create_args args = { .program = program, .prognumber = program->number, .version = vers, .authflavor = old->cl_auth->au_flavor, .cred = old->cl_cred, .stats = old->cl_stats, .timeout = old->cl_timeout, }; struct rpc_clnt *clnt; int err; clnt = __rpc_clone_client(&args, old); if (IS_ERR(clnt)) goto out; err = rpc_ping(clnt); if (err != 0) { rpc_shutdown_client(clnt); clnt = ERR_PTR(err); } out: return clnt; } EXPORT_SYMBOL_GPL(rpc_bind_new_program); struct rpc_xprt * rpc_task_get_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { struct rpc_xprt_switch *xps; if (!xprt) return NULL; rcu_read_lock(); xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); atomic_long_inc(&xps->xps_queuelen); rcu_read_unlock(); atomic_long_inc(&xprt->queuelen); return xprt; } static void rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { struct rpc_xprt_switch *xps; atomic_long_dec(&xprt->queuelen); rcu_read_lock(); xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); atomic_long_dec(&xps->xps_queuelen); rcu_read_unlock(); xprt_put(xprt); } void rpc_task_release_transport(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; if (xprt) { task->tk_xprt = NULL; if (task->tk_client) rpc_task_release_xprt(task->tk_client, xprt); else xprt_put(xprt); } } EXPORT_SYMBOL_GPL(rpc_task_release_transport); void rpc_task_release_client(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; rpc_task_release_transport(task); if (clnt != NULL) { /* Remove from client task list */ spin_lock(&clnt->cl_lock); list_del(&task->tk_task); spin_unlock(&clnt->cl_lock); task->tk_client = NULL; atomic_dec(&clnt->cl_task_count); rpc_release_client(clnt); } } static struct rpc_xprt * rpc_task_get_first_xprt(struct rpc_clnt *clnt) { struct rpc_xprt *xprt; rcu_read_lock(); xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); rcu_read_unlock(); return rpc_task_get_xprt(clnt, xprt); } static struct rpc_xprt * rpc_task_get_next_xprt(struct rpc_clnt *clnt) { return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi)); } static void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) { if (task->tk_xprt) { if (!(test_bit(XPRT_OFFLINE, &task->tk_xprt->state) && (task->tk_flags & RPC_TASK_MOVEABLE))) return; xprt_release(task); xprt_put(task->tk_xprt); } if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) task->tk_xprt = rpc_task_get_first_xprt(clnt); else task->tk_xprt = rpc_task_get_next_xprt(clnt); } static void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) { rpc_task_set_transport(task, clnt); task->tk_client = clnt; refcount_inc(&clnt->cl_count); if (clnt->cl_softrtry) task->tk_flags |= RPC_TASK_SOFT; if (clnt->cl_softerr) task->tk_flags |= RPC_TASK_TIMEOUT; if (clnt->cl_noretranstimeo) task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; if (clnt->cl_netunreach_fatal) task->tk_flags |= RPC_TASK_NETUNREACH_FATAL; atomic_inc(&clnt->cl_task_count); } static void rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) { if (msg != NULL) { task->tk_msg.rpc_proc = msg->rpc_proc; task->tk_msg.rpc_argp = msg->rpc_argp; task->tk_msg.rpc_resp = msg->rpc_resp; task->tk_msg.rpc_cred = msg->rpc_cred; if (!(task->tk_flags & RPC_TASK_CRED_NOREF)) get_cred(task->tk_msg.rpc_cred); } } /* * Default callback for async RPC calls */ static void rpc_default_callback(struct rpc_task *task, void *data) { } static const struct rpc_call_ops rpc_default_ops = { .rpc_call_done = rpc_default_callback, }; /** * rpc_run_task - Allocate a new RPC task, then run rpc_execute against it * @task_setup_data: pointer to task initialisation data */ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) { struct rpc_task *task; task = rpc_new_task(task_setup_data); if (IS_ERR(task)) return task; if (!RPC_IS_ASYNC(task)) task->tk_flags |= RPC_TASK_CRED_NOREF; rpc_task_set_client(task, task_setup_data->rpc_client); rpc_task_set_rpc_message(task, task_setup_data->rpc_message); if (task->tk_action == NULL) rpc_call_start(task); atomic_inc(&task->tk_count); rpc_execute(task); return task; } EXPORT_SYMBOL_GPL(rpc_run_task); /** * rpc_call_sync - Perform a synchronous RPC call * @clnt: pointer to RPC client * @msg: RPC call parameters * @flags: RPC call flags */ int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags) { struct rpc_task *task; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, .rpc_message = msg, .callback_ops = &rpc_default_ops, .flags = flags, }; int status; WARN_ON_ONCE(flags & RPC_TASK_ASYNC); if (flags & RPC_TASK_ASYNC) { rpc_release_calldata(task_setup_data.callback_ops, task_setup_data.callback_data); return -EINVAL; } task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); status = task->tk_status; rpc_put_task(task); return status; } EXPORT_SYMBOL_GPL(rpc_call_sync); /** * rpc_call_async - Perform an asynchronous RPC call * @clnt: pointer to RPC client * @msg: RPC call parameters * @flags: RPC call flags * @tk_ops: RPC call ops * @data: user call data */ int rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags, const struct rpc_call_ops *tk_ops, void *data) { struct rpc_task *task; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, .rpc_message = msg, .callback_ops = tk_ops, .callback_data = data, .flags = flags|RPC_TASK_ASYNC, }; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); rpc_put_task(task); return 0; } EXPORT_SYMBOL_GPL(rpc_call_async); #if defined(CONFIG_SUNRPC_BACKCHANNEL) static void call_bc_encode(struct rpc_task *task); /** * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run * rpc_execute against it * @req: RPC request * @timeout: timeout values to use for this task */ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, struct rpc_timeout *timeout) { struct rpc_task *task; struct rpc_task_setup task_setup_data = { .callback_ops = &rpc_default_ops, .flags = RPC_TASK_SOFTCONN | RPC_TASK_NO_RETRANS_TIMEOUT, }; dprintk("RPC: rpc_run_bc_task req= %p\n", req); /* * Create an rpc_task to send the data */ task = rpc_new_task(&task_setup_data); if (IS_ERR(task)) { xprt_free_bc_request(req); return task; } xprt_init_bc_request(req, task, timeout); task->tk_action = call_bc_encode; atomic_inc(&task->tk_count); WARN_ON_ONCE(atomic_read(&task->tk_count) != 2); rpc_execute(task); dprintk("RPC: rpc_run_bc_task: task= %p\n", task); return task; } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** * rpc_prepare_reply_pages - Prepare to receive a reply data payload into pages * @req: RPC request to prepare * @pages: vector of struct page pointers * @base: offset in first page where receive should start, in bytes * @len: expected size of the upper layer data payload, in bytes * @hdrsize: expected size of upper layer reply header, in XDR words * */ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages, unsigned int base, unsigned int len, unsigned int hdrsize) { hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign; xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len); trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf); } EXPORT_SYMBOL_GPL(rpc_prepare_reply_pages); void rpc_call_start(struct rpc_task *task) { task->tk_action = call_start; } EXPORT_SYMBOL_GPL(rpc_call_start); /** * rpc_peeraddr - extract remote peer address from clnt's xprt * @clnt: RPC client structure * @buf: target buffer * @bufsize: length of target buffer * * Returns the number of bytes that are actually in the stored address. */ size_t rpc_peeraddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t bufsize) { size_t bytes; struct rpc_xprt *xprt; rcu_read_lock(); xprt = rcu_dereference(clnt->cl_xprt); bytes = xprt->addrlen; if (bytes > bufsize) bytes = bufsize; memcpy(buf, &xprt->addr, bytes); rcu_read_unlock(); return bytes; } EXPORT_SYMBOL_GPL(rpc_peeraddr); /** * rpc_peeraddr2str - return remote peer address in printable format * @clnt: RPC client structure * @format: address format * * NB: the lifetime of the memory referenced by the returned pointer is * the same as the rpc_xprt itself. As long as the caller uses this * pointer, it must hold the RCU read lock. */ const char *rpc_peeraddr2str(struct rpc_clnt *clnt, enum rpc_display_format_t format) { struct rpc_xprt *xprt; xprt = rcu_dereference(clnt->cl_xprt); if (xprt->address_strings[format] != NULL) return xprt->address_strings[format]; else return "unprintable"; } EXPORT_SYMBOL_GPL(rpc_peeraddr2str); static const struct sockaddr_in rpc_inaddr_loopback = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; static const struct sockaddr_in6 rpc_in6addr_loopback = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, }; /* * Try a getsockname() on a connected datagram socket. Using a * connected datagram socket prevents leaving a socket in TIME_WAIT. * This conserves the ephemeral port number space. * * Returns zero and fills in "buf" if successful; otherwise, a * negative errno is returned. */ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, struct sockaddr *buf) { struct socket *sock; int err; err = __sock_create(net, sap->sa_family, SOCK_DGRAM, IPPROTO_UDP, &sock, 1); if (err < 0) { dprintk("RPC: can't create UDP socket (%d)\n", err); goto out; } switch (sap->sa_family) { case AF_INET: err = kernel_bind(sock, (struct sockaddr *)&rpc_inaddr_loopback, sizeof(rpc_inaddr_loopback)); break; case AF_INET6: err = kernel_bind(sock, (struct sockaddr *)&rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); break; default: err = -EAFNOSUPPORT; goto out_release; } if (err < 0) { dprintk("RPC: can't bind UDP socket (%d)\n", err); goto out_release; } err = kernel_connect(sock, sap, salen, 0); if (err < 0) { dprintk("RPC: can't connect UDP socket (%d)\n", err); goto out_release; } err = kernel_getsockname(sock, buf); if (err < 0) { dprintk("RPC: getsockname failed (%d)\n", err); goto out_release; } err = 0; if (buf->sa_family == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)buf; sin6->sin6_scope_id = 0; } dprintk("RPC: %s succeeded\n", __func__); out_release: sock_release(sock); out: return err; } /* * Scraping a connected socket failed, so we don't have a useable * local address. Fallback: generate an address that will prevent * the server from calling us back. * * Returns zero and fills in "buf" if successful; otherwise, a * negative errno is returned. */ static int rpc_anyaddr(int family, struct sockaddr *buf, size_t buflen) { switch (family) { case AF_INET: if (buflen < sizeof(rpc_inaddr_loopback)) return -EINVAL; memcpy(buf, &rpc_inaddr_loopback, sizeof(rpc_inaddr_loopback)); break; case AF_INET6: if (buflen < sizeof(rpc_in6addr_loopback)) return -EINVAL; memcpy(buf, &rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); break; default: dprintk("RPC: %s: address family not supported\n", __func__); return -EAFNOSUPPORT; } dprintk("RPC: %s: succeeded\n", __func__); return 0; } /** * rpc_localaddr - discover local endpoint address for an RPC client * @clnt: RPC client structure * @buf: target buffer * @buflen: size of target buffer, in bytes * * Returns zero and fills in "buf" and "buflen" if successful; * otherwise, a negative errno is returned. * * This works even if the underlying transport is not currently connected, * or if the upper layer never previously provided a source address. * * The result of this function call is transient: multiple calls in * succession may give different results, depending on how local * networking configuration changes over time. */ int rpc_localaddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t buflen) { struct sockaddr_storage address; struct sockaddr *sap = (struct sockaddr *)&address; struct rpc_xprt *xprt; struct net *net; size_t salen; int err; rcu_read_lock(); xprt = rcu_dereference(clnt->cl_xprt); salen = xprt->addrlen; memcpy(sap, &xprt->addr, salen); net = get_net(xprt->xprt_net); rcu_read_unlock(); rpc_set_port(sap, 0); err = rpc_sockname(net, sap, salen, buf); put_net(net); if (err != 0) /* Couldn't discover local address, return ANYADDR */ return rpc_anyaddr(sap->sa_family, buf, buflen); return 0; } EXPORT_SYMBOL_GPL(rpc_localaddr); void rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize) { struct rpc_xprt *xprt; rcu_read_lock(); xprt = rcu_dereference(clnt->cl_xprt); if (xprt->ops->set_buffer_size) xprt->ops->set_buffer_size(xprt, sndsize, rcvsize); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rpc_setbufsize); /** * rpc_net_ns - Get the network namespace for this RPC client * @clnt: RPC client to query * */ struct net *rpc_net_ns(struct rpc_clnt *clnt) { struct net *ret; rcu_read_lock(); ret = rcu_dereference(clnt->cl_xprt)->xprt_net; rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rpc_net_ns); /** * rpc_max_payload - Get maximum payload size for a transport, in bytes * @clnt: RPC client to query * * For stream transports, this is one RPC record fragment (see RFC * 1831), as we don't support multi-record requests yet. For datagram * transports, this is the size of an IP packet minus the IP, UDP, and * RPC header sizes. */ size_t rpc_max_payload(struct rpc_clnt *clnt) { size_t ret; rcu_read_lock(); ret = rcu_dereference(clnt->cl_xprt)->max_payload; rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rpc_max_payload); /** * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes * @clnt: RPC client to query */ size_t rpc_max_bc_payload(struct rpc_clnt *clnt) { struct rpc_xprt *xprt; size_t ret; rcu_read_lock(); xprt = rcu_dereference(clnt->cl_xprt); ret = xprt->ops->bc_maxpayload(xprt); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rpc_max_bc_payload); unsigned int rpc_num_bc_slots(struct rpc_clnt *clnt) { struct rpc_xprt *xprt; unsigned int ret; rcu_read_lock(); xprt = rcu_dereference(clnt->cl_xprt); ret = xprt->ops->bc_num_slots(xprt); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rpc_num_bc_slots); /** * rpc_force_rebind - force transport to check that remote port is unchanged * @clnt: client to rebind * */ void rpc_force_rebind(struct rpc_clnt *clnt) { if (clnt->cl_autobind) { rcu_read_lock(); xprt_clear_bound(rcu_dereference(clnt->cl_xprt)); rcu_read_unlock(); } } EXPORT_SYMBOL_GPL(rpc_force_rebind); static int __rpc_restart_call(struct rpc_task *task, void (*action)(struct rpc_task *)) { task->tk_status = 0; task->tk_rpc_status = 0; task->tk_action = action; return 1; } /* * Restart an (async) RPC call. Usually called from within the * exit handler. */ int rpc_restart_call(struct rpc_task *task) { return __rpc_restart_call(task, call_start); } EXPORT_SYMBOL_GPL(rpc_restart_call); /* * Restart an (async) RPC call from the call_prepare state. * Usually called from within the exit handler. */ int rpc_restart_call_prepare(struct rpc_task *task) { if (task->tk_ops->rpc_call_prepare != NULL) return __rpc_restart_call(task, rpc_prepare_task); return rpc_restart_call(task); } EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); const char *rpc_proc_name(const struct rpc_task *task) { const struct rpc_procinfo *proc = task->tk_msg.rpc_proc; if (proc) { if (proc->p_name) return proc->p_name; else return "NULL"; } else return "no proc"; } static void __rpc_call_rpcerror(struct rpc_task *task, int tk_status, int rpc_status) { trace_rpc_call_rpcerror(task, tk_status, rpc_status); rpc_task_set_rpc_status(task, rpc_status); rpc_exit(task, tk_status); } static void rpc_call_rpcerror(struct rpc_task *task, int status) { __rpc_call_rpcerror(task, status, status); } /* * 0. Initial state * * Other FSM states can be visited zero or more times, but * this state is visited exactly once for each RPC. */ static void call_start(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; int idx = task->tk_msg.rpc_proc->p_statidx; trace_rpc_request(task); if (task->tk_client->cl_shutdown) { rpc_call_rpcerror(task, -EIO); return; } /* Increment call count (version might not be valid for ping) */ if (clnt->cl_program->version[clnt->cl_vers]) clnt->cl_program->version[clnt->cl_vers]->counts[idx]++; clnt->cl_stats->rpccnt++; task->tk_action = call_reserve; rpc_task_set_transport(task, clnt); } /* * 1. Reserve an RPC call slot */ static void call_reserve(struct rpc_task *task) { task->tk_status = 0; task->tk_action = call_reserveresult; xprt_reserve(task); } static void call_retry_reserve(struct rpc_task *task); /* * 1b. Grok the result of xprt_reserve() */ static void call_reserveresult(struct rpc_task *task) { int status = task->tk_status; /* * After a call to xprt_reserve(), we must have either * a request slot or else an error status. */ task->tk_status = 0; if (status >= 0) { if (task->tk_rqstp) { task->tk_action = call_refresh; /* Add to the client's list of all tasks */ spin_lock(&task->tk_client->cl_lock); if (list_empty(&task->tk_task)) list_add_tail(&task->tk_task, &task->tk_client->cl_tasks); spin_unlock(&task->tk_client->cl_lock); return; } rpc_call_rpcerror(task, -EIO); return; } switch (status) { case -ENOMEM: rpc_delay(task, HZ >> 2); fallthrough; case -EAGAIN: /* woken up; retry */ task->tk_action = call_retry_reserve; return; default: rpc_call_rpcerror(task, status); } } /* * 1c. Retry reserving an RPC call slot */ static void call_retry_reserve(struct rpc_task *task) { task->tk_status = 0; task->tk_action = call_reserveresult; xprt_retry_reserve(task); } /* * 2. Bind and/or refresh the credentials */ static void call_refresh(struct rpc_task *task) { task->tk_action = call_refreshresult; task->tk_status = 0; task->tk_client->cl_stats->rpcauthrefresh++; rpcauth_refreshcred(task); } /* * 2a. Process the results of a credential refresh */ static void call_refreshresult(struct rpc_task *task) { int status = task->tk_status; task->tk_status = 0; task->tk_action = call_refresh; switch (status) { case 0: if (rpcauth_uptodatecred(task)) { task->tk_action = call_allocate; return; } /* Use rate-limiting and a max number of retries if refresh * had status 0 but failed to update the cred. */ fallthrough; case -ETIMEDOUT: rpc_delay(task, 3*HZ); fallthrough; case -EAGAIN: status = -EACCES; if (!task->tk_cred_retry) break; task->tk_cred_retry--; trace_rpc_retry_refresh_status(task); return; case -EKEYEXPIRED: break; case -ENOMEM: rpc_delay(task, HZ >> 4); return; } trace_rpc_refresh_status(task); rpc_call_rpcerror(task, status); } /* * 2b. Allocate the buffer. For details, see sched.c:rpc_malloc. * (Note: buffer memory is freed in xprt_release). */ static void call_allocate(struct rpc_task *task) { const struct rpc_auth *auth = task->tk_rqstp->rq_cred->cr_auth; struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; const struct rpc_procinfo *proc = task->tk_msg.rpc_proc; int status; task->tk_status = 0; task->tk_action = call_encode; if (req->rq_buffer) return; /* * Calculate the size (in quads) of the RPC call * and reply headers, and convert both values * to byte sizes. */ req->rq_callsize = RPC_CALLHDRSIZE + (auth->au_cslack << 1) + proc->p_arglen; req->rq_callsize <<= 2; /* * Note: the reply buffer must at minimum allocate enough space * for the 'struct accepted_reply' from RFC5531. */ req->rq_rcvsize = RPC_REPHDRSIZE + auth->au_rslack + \ max_t(size_t, proc->p_replen, 2); req->rq_rcvsize <<= 2; status = xprt->ops->buf_alloc(task); trace_rpc_buf_alloc(task, status); if (status == 0) return; if (status != -ENOMEM) { rpc_call_rpcerror(task, status); return; } if (RPC_IS_ASYNC(task) || !fatal_signal_pending(current)) { task->tk_action = call_allocate; rpc_delay(task, HZ>>4); return; } rpc_call_rpcerror(task, -ERESTARTSYS); } static int rpc_task_need_encode(struct rpc_task *task) { return test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) == 0 && (!(task->tk_flags & RPC_TASK_SENT) || !(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) || xprt_request_need_retransmit(task)); } static void rpc_xdr_encode(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct xdr_stream xdr; xdr_buf_init(&req->rq_snd_buf, req->rq_buffer, req->rq_callsize); xdr_buf_init(&req->rq_rcv_buf, req->rq_rbuffer, req->rq_rcvsize); req->rq_reply_bytes_recvd = 0; req->rq_snd_buf.head[0].iov_len = 0; xdr_init_encode(&xdr, &req->rq_snd_buf, req->rq_snd_buf.head[0].iov_base, req); if (rpc_encode_header(task, &xdr)) return; task->tk_status = rpcauth_wrap_req(task, &xdr); } /* * 3. Encode arguments of an RPC call */ static void call_encode(struct rpc_task *task) { if (!rpc_task_need_encode(task)) goto out; /* Dequeue task from the receive queue while we're encoding */ xprt_request_dequeue_xprt(task); /* Encode here so that rpcsec_gss can use correct sequence number. */ rpc_xdr_encode(task); /* Add task to reply queue before transmission to avoid races */ if (task->tk_status == 0 && rpc_reply_expected(task)) task->tk_status = xprt_request_enqueue_receive(task); /* Did the encode result in an error condition? */ if (task->tk_status != 0) { /* Was the error nonfatal? */ switch (task->tk_status) { case -EAGAIN: case -ENOMEM: rpc_delay(task, HZ >> 4); break; case -EKEYEXPIRED: if (!task->tk_cred_retry) { rpc_call_rpcerror(task, task->tk_status); } else { task->tk_action = call_refresh; task->tk_cred_retry--; trace_rpc_retry_refresh_status(task); } break; default: rpc_call_rpcerror(task, task->tk_status); } return; } xprt_request_enqueue_transmit(task); out: task->tk_action = call_transmit; /* Check that the connection is OK */ if (!xprt_bound(task->tk_xprt)) task->tk_action = call_bind; else if (!xprt_connected(task->tk_xprt)) task->tk_action = call_connect; } /* * Helpers to check if the task was already transmitted, and * to take action when that is the case. */ static bool rpc_task_transmitted(struct rpc_task *task) { return !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); } static void rpc_task_handle_transmitted(struct rpc_task *task) { xprt_end_transmit(task); task->tk_action = call_transmit_status; } /* * 4. Get the server port number if not yet set */ static void call_bind(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; if (rpc_task_transmitted(task)) { rpc_task_handle_transmitted(task); return; } if (xprt_bound(xprt)) { task->tk_action = call_connect; return; } task->tk_action = call_bind_status; if (!xprt_prepare_transmit(task)) return; xprt->ops->rpcbind(task); } /* * 4a. Sort out bind result */ static void call_bind_status(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; int status = -EIO; if (rpc_task_transmitted(task)) { rpc_task_handle_transmitted(task); return; } if (task->tk_status >= 0) goto out_next; if (xprt_bound(xprt)) { task->tk_status = 0; goto out_next; } switch (task->tk_status) { case -ENOMEM: rpc_delay(task, HZ >> 2); goto retry_timeout; case -EACCES: trace_rpcb_prog_unavail_err(task); /* fail immediately if this is an RPC ping */ if (task->tk_msg.rpc_proc->p_proc == 0) { status = -EOPNOTSUPP; break; } rpc_delay(task, 3*HZ); goto retry_timeout; case -ENOBUFS: rpc_delay(task, HZ >> 2); goto retry_timeout; case -EAGAIN: goto retry_timeout; case -ETIMEDOUT: trace_rpcb_timeout_err(task); goto retry_timeout; case -EPFNOSUPPORT: /* server doesn't support any rpcbind version we know of */ trace_rpcb_bind_version_err(task); break; case -EPROTONOSUPPORT: trace_rpcb_bind_version_err(task); goto retry_timeout; case -ENETDOWN: case -ENETUNREACH: if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) break; fallthrough; case -ECONNREFUSED: /* connection problems */ case -ECONNRESET: case -ECONNABORTED: case -ENOTCONN: case -EHOSTDOWN: case -EHOSTUNREACH: case -EPIPE: trace_rpcb_unreachable_err(task); if (!RPC_IS_SOFTCONN(task)) { rpc_delay(task, 5*HZ); goto retry_timeout; } status = task->tk_status; break; default: trace_rpcb_unrecognized_err(task); } rpc_call_rpcerror(task, status); return; out_next: task->tk_action = call_connect; return; retry_timeout: task->tk_status = 0; task->tk_action = call_bind; rpc_check_timeout(task); } /* * 4b. Connect to the RPC server */ static void call_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; if (rpc_task_transmitted(task)) { rpc_task_handle_transmitted(task); return; } if (xprt_connected(xprt)) { task->tk_action = call_transmit; return; } task->tk_action = call_connect_status; if (task->tk_status < 0) return; if (task->tk_flags & RPC_TASK_NOCONNECT) { rpc_call_rpcerror(task, -ENOTCONN); return; } if (!xprt_prepare_transmit(task)) return; xprt_connect(task); } /* * 4c. Sort out connect result */ static void call_connect_status(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; struct rpc_clnt *clnt = task->tk_client; int status = task->tk_status; if (rpc_task_transmitted(task)) { rpc_task_handle_transmitted(task); return; } trace_rpc_connect_status(task); if (task->tk_status == 0) { clnt->cl_stats->netreconn++; goto out_next; } if (xprt_connected(xprt)) { task->tk_status = 0; goto out_next; } task->tk_status = 0; switch (status) { case -ENETDOWN: case -ENETUNREACH: if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) break; fallthrough; case -ECONNREFUSED: case -ECONNRESET: /* A positive refusal suggests a rebind is needed. */ if (clnt->cl_autobind) { rpc_force_rebind(clnt); if (RPC_IS_SOFTCONN(task)) break; goto out_retry; } fallthrough; case -ECONNABORTED: case -EHOSTUNREACH: case -EPIPE: case -EPROTO: xprt_conditional_disconnect(task->tk_rqstp->rq_xprt, task->tk_rqstp->rq_connect_cookie); if (RPC_IS_SOFTCONN(task)) break; /* retry with existing socket, after a delay */ rpc_delay(task, 3*HZ); fallthrough; case -EADDRINUSE: case -ENOTCONN: case -EAGAIN: case -ETIMEDOUT: if (!(task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) && (task->tk_flags & RPC_TASK_MOVEABLE) && test_bit(XPRT_REMOVE, &xprt->state)) { struct rpc_xprt *saved = task->tk_xprt; struct rpc_xprt_switch *xps; xps = rpc_clnt_xprt_switch_get(clnt); if (xps->xps_nxprts > 1) { long value; xprt_release(task); value = atomic_long_dec_return(&xprt->queuelen); if (value == 0) rpc_xprt_switch_remove_xprt(xps, saved, true); xprt_put(saved); task->tk_xprt = NULL; task->tk_action = call_start; } xprt_switch_put(xps); if (!task->tk_xprt) goto out; } goto out_retry; case -ENOBUFS: rpc_delay(task, HZ >> 2); goto out_retry; } rpc_call_rpcerror(task, status); return; out_next: task->tk_action = call_transmit; return; out_retry: /* Check for timeouts before looping back to call_bind */ task->tk_action = call_bind; out: rpc_check_timeout(task); } /* * 5. Transmit the RPC request, and wait for reply */ static void call_transmit(struct rpc_task *task) { if (rpc_task_transmitted(task)) { rpc_task_handle_transmitted(task); return; } task->tk_action = call_transmit_status; if (!xprt_prepare_transmit(task)) return; task->tk_status = 0; if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) { if (!xprt_connected(task->tk_xprt)) { task->tk_status = -ENOTCONN; return; } xprt_transmit(task); } xprt_end_transmit(task); } /* * 5a. Handle cleanup after a transmission */ static void call_transmit_status(struct rpc_task *task) { task->tk_action = call_status; /* * Common case: success. Force the compiler to put this * test first. */ if (rpc_task_transmitted(task)) { task->tk_status = 0; xprt_request_wait_receive(task); return; } switch (task->tk_status) { default: break; case -EBADMSG: task->tk_status = 0; task->tk_action = call_encode; break; /* * Special cases: if we've been waiting on the * socket's write_space() callback, or if the * socket just returned a connection error, * then hold onto the transport lock. */ case -ENOMEM: case -ENOBUFS: rpc_delay(task, HZ>>2); fallthrough; case -EBADSLT: case -EAGAIN: task->tk_action = call_transmit; task->tk_status = 0; break; case -EHOSTDOWN: case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -EPERM: break; case -ECONNREFUSED: if (RPC_IS_SOFTCONN(task)) { if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); rpc_call_rpcerror(task, task->tk_status); return; } fallthrough; case -ECONNRESET: case -ECONNABORTED: case -EADDRINUSE: case -ENOTCONN: case -EPIPE: task->tk_action = call_bind; task->tk_status = 0; break; } rpc_check_timeout(task); } #if defined(CONFIG_SUNRPC_BACKCHANNEL) static void call_bc_transmit(struct rpc_task *task); static void call_bc_transmit_status(struct rpc_task *task); static void call_bc_encode(struct rpc_task *task) { xprt_request_enqueue_transmit(task); task->tk_action = call_bc_transmit; } /* * 5b. Send the backchannel RPC reply. On error, drop the reply. In * addition, disconnect on connectivity errors. */ static void call_bc_transmit(struct rpc_task *task) { task->tk_action = call_bc_transmit_status; if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) { if (!xprt_prepare_transmit(task)) return; task->tk_status = 0; xprt_transmit(task); } xprt_end_transmit(task); } static void call_bc_transmit_status(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; if (rpc_task_transmitted(task)) task->tk_status = 0; switch (task->tk_status) { case 0: /* Success */ case -ENETDOWN: case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -ECONNRESET: case -ECONNREFUSED: case -EADDRINUSE: case -ENOTCONN: case -EPIPE: break; case -ENOMEM: case -ENOBUFS: rpc_delay(task, HZ>>2); fallthrough; case -EBADSLT: case -EAGAIN: task->tk_status = 0; task->tk_action = call_bc_transmit; return; case -ETIMEDOUT: /* * Problem reaching the server. Disconnect and let the * forechannel reestablish the connection. The server will * have to retransmit the backchannel request and we'll * reprocess it. Since these ops are idempotent, there's no * need to cache our reply at this time. */ printk(KERN_NOTICE "RPC: Could not send backchannel reply " "error: %d\n", task->tk_status); xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); break; default: /* * We were unable to reply and will have to drop the * request. The server should reconnect and retransmit. */ printk(KERN_NOTICE "RPC: Could not send backchannel reply " "error: %d\n", task->tk_status); break; } task->tk_action = rpc_exit_task; } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /* * 6. Sort out the RPC call status */ static void call_status(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; int status; if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); status = task->tk_status; if (status >= 0) { task->tk_action = call_decode; return; } trace_rpc_call_status(task); task->tk_status = 0; switch(status) { case -ENETDOWN: case -ENETUNREACH: if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) goto out_exit; fallthrough; case -EHOSTDOWN: case -EHOSTUNREACH: case -EPERM: if (RPC_IS_SOFTCONN(task)) goto out_exit; /* * Delay any retries for 3 seconds, then handle as if it * were a timeout. */ rpc_delay(task, 3*HZ); fallthrough; case -ETIMEDOUT: break; case -ECONNREFUSED: case -ECONNRESET: case -ECONNABORTED: case -ENOTCONN: rpc_force_rebind(clnt); break; case -EADDRINUSE: rpc_delay(task, 3*HZ); fallthrough; case -EPIPE: case -EAGAIN: break; case -ENFILE: case -ENOBUFS: case -ENOMEM: rpc_delay(task, HZ>>2); break; case -EIO: /* shutdown or soft timeout */ goto out_exit; default: if (clnt->cl_chatty) printk("%s: RPC call returned error %d\n", clnt->cl_program->name, -status); goto out_exit; } task->tk_action = call_encode; rpc_check_timeout(task); return; out_exit: rpc_call_rpcerror(task, status); } static bool rpc_check_connected(const struct rpc_rqst *req) { /* No allocated request or transport? return true */ if (!req || !req->rq_xprt) return true; return xprt_connected(req->rq_xprt); } static void rpc_check_timeout(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; if (RPC_SIGNALLED(task)) return; if (xprt_adjust_timeout(task->tk_rqstp) == 0) return; trace_rpc_timeout_status(task); task->tk_timeouts++; if (RPC_IS_SOFTCONN(task) && !rpc_check_connected(task->tk_rqstp)) { rpc_call_rpcerror(task, -ETIMEDOUT); return; } if (RPC_IS_SOFT(task)) { /* * Once a "no retrans timeout" soft tasks (a.k.a NFSv4) has * been sent, it should time out only if the transport * connection gets terminally broken. */ if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) && rpc_check_connected(task->tk_rqstp)) return; if (clnt->cl_chatty) { pr_notice_ratelimited( "%s: server %s not responding, timed out\n", clnt->cl_program->name, task->tk_xprt->servername); } if (task->tk_flags & RPC_TASK_TIMEOUT) rpc_call_rpcerror(task, -ETIMEDOUT); else __rpc_call_rpcerror(task, -EIO, -ETIMEDOUT); return; } if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) { task->tk_flags |= RPC_CALL_MAJORSEEN; if (clnt->cl_chatty) { pr_notice_ratelimited( "%s: server %s not responding, still trying\n", clnt->cl_program->name, task->tk_xprt->servername); } } rpc_force_rebind(clnt); /* * Did our request time out due to an RPCSEC_GSS out-of-sequence * event? RFC2203 requires the server to drop all such requests. */ rpcauth_invalcred(task); } /* * 7. Decode the RPC reply */ static void call_decode(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct xdr_stream xdr; int err; if (!task->tk_msg.rpc_proc->p_decode) { task->tk_action = rpc_exit_task; return; } if (task->tk_flags & RPC_CALL_MAJORSEEN) { if (clnt->cl_chatty) { pr_notice_ratelimited("%s: server %s OK\n", clnt->cl_program->name, task->tk_xprt->servername); } task->tk_flags &= ~RPC_CALL_MAJORSEEN; } /* * Did we ever call xprt_complete_rqst()? If not, we should assume * the message is incomplete. */ err = -EAGAIN; if (!req->rq_reply_bytes_recvd) goto out; /* Ensure that we see all writes made by xprt_complete_rqst() * before it changed req->rq_reply_bytes_recvd. */ smp_rmb(); req->rq_rcv_buf.len = req->rq_private_buf.len; trace_rpc_xdr_recvfrom(task, &req->rq_rcv_buf); /* Check that the softirq receive buffer is valid */ WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf, sizeof(req->rq_rcv_buf)) != 0); xdr_init_decode(&xdr, &req->rq_rcv_buf, req->rq_rcv_buf.head[0].iov_base, req); err = rpc_decode_header(task, &xdr); out: switch (err) { case 0: task->tk_actio