Total coverage: 268220 (18%)of 1563737
24 9 7 8 8 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_FUTEX_H #define _ASM_X86_FUTEX_H #ifdef __KERNEL__ #include <linux/futex.h> #include <linux/uaccess.h> #include <asm/asm.h> #include <asm/errno.h> #include <asm/processor.h> #include <asm/smap.h> #define unsafe_atomic_op1(insn, oval, uaddr, oparg, label) \ do { \ int oldval = 0, ret; \ asm volatile("1:\t" insn "\n" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %1) \ : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \ : "0" (oparg), "1" (0)); \ if (ret) \ goto label; \ *oval = oldval; \ } while(0) #define unsafe_atomic_op2(insn, oval, uaddr, oparg, label) \ do { \ int oldval = 0, ret, tem; \ asm volatile("1:\tmovl %2, %0\n" \ "2:\tmovl\t%0, %3\n" \ "\t" insn "\n" \ "3:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \ "\tjnz\t2b\n" \ "4:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 4b, EX_TYPE_EFAULT_REG, %1) \ _ASM_EXTABLE_TYPE_REG(3b, 4b, EX_TYPE_EFAULT_REG, %1) \ : "=&a" (oldval), "=&r" (ret), \ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "1" (0)); \ if (ret) \ goto label; \ *oval = oldval; \ } while(0) static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { if (can_do_masked_user_access()) uaddr = masked_user_access_begin(uaddr); else if (!user_access_begin(uaddr, sizeof(u32))) return -EFAULT; switch (op) { case FUTEX_OP_SET: unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault); break; case FUTEX_OP_ADD: unsafe_atomic_op1(LOCK_PREFIX "xaddl %0, %2", oval, uaddr, oparg, Efault); break; case FUTEX_OP_OR: unsafe_atomic_op2("orl %4, %3", oval, uaddr, oparg, Efault); break; case FUTEX_OP_ANDN: unsafe_atomic_op2("andl %4, %3", oval, uaddr, ~oparg, Efault); break; case FUTEX_OP_XOR: unsafe_atomic_op2("xorl %4, %3", oval, uaddr, oparg, Efault); break; default: user_access_end(); return -ENOSYS; } user_access_end(); return 0; Efault: user_access_end(); return -EFAULT; } static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { int ret = 0; if (can_do_masked_user_access()) uaddr = masked_user_access_begin(uaddr); else if (!user_access_begin(uaddr, sizeof(u32))) return -EFAULT; asm volatile("\n" "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" "2:\n" _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0) \ : "+r" (ret), "=a" (oldval), "+m" (*uaddr) : "r" (newval), "1" (oldval) : "memory" ); user_access_end(); *uval = oldval; return ret; } #endif #endif /* _ASM_X86_FUTEX_H */
1853 1858 127 1729 1152 1072 560 2 558 658 1 655 13542 467 13539 18827 10700 14854 16116 1652 1271 18647 18481 1220 99 97 96 3 98 74 98 30309 29643 11387 30273 685 37 14735 19646 18541 13381 30240 1140 11756 30284 2254 3051 30284 48 698 37 48 686 59 30321 3168 6666 30314 75 76 30437 12325 30302 3 59 3 1 1 8778 31604 31342 8779 2 119 3 119 31537 31591 27720 19328 20 20 52 52 31586 50 31722 374 1 1 1 374 375 76 76 20 20 14 20 20 20 20 20 8 2267 2260 9 105 27 2 41 61 105 69 1 69 49 68 49 69 69 29 29 38 37 1 24 31 2 32 32 4 4 28 63 53 63 63 652 658 658 869 872 870 716 874 713 870 715 34 624 17 208 302 736 149 32 27 6 901 33 871 907 506 511 1408 6173 540 529 541 542 542 4 99 99 99 99 99 99 99 99 99 99 2 2 2 2 2 2 2 2 8756 1567 68 32 37 540 6173 75 3 12 99 7 2262 2 3 20 298 33104 790 4 202 33262 32866 137 672 1 700 13320 32719 794 32770 204 32934 25665 29179 13630 31598 8758 898 2019 5995 8496 25643 130 186 17962 795 204 32975 33116 28575 30395 795 204 13634 1 696 13678 388 13631 31591 8754 8780 898 8493 25610 541 82 51 182 4 11631 16845 25571 30949 13045 13386 13 27785 1174 1171 10 13833 405 405 7 366 2 2 2 3 2 2 2 9 5 12 392 933 825 931 44 2 881 743 747 10 20 733 11 11 728 3 725 12 718 656 55 6 2 2 3 656 44 709 935 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 // SPDX-License-Identifier: GPL-2.0-only /* * linux/lib/vsprintf.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* vsprintf.c -- Lars Wirzenius & Linus Torvalds. */ /* * Wirzenius wrote this portably, Torvalds fucked it up :-) */ /* * Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel@datastacks.com> * - changed to provide snprintf and vsnprintf functions * So Feb 1 16:51:32 CET 2004 Juergen Quade <quade@hsnr.de> * - scnprintf and vscnprintf */ #include <linux/stdarg.h> #include <linux/build_bug.h> #include <linux/clk.h> #include <linux/clk-provider.h> #include <linux/errname.h> #include <linux/module.h> /* for KSYM_SYMBOL_LEN */ #include <linux/types.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/kernel.h> #include <linux/kallsyms.h> #include <linux/math64.h> #include <linux/uaccess.h> #include <linux/ioport.h> #include <linux/dcache.h> #include <linux/cred.h> #include <linux/rtc.h> #include <linux/sprintf.h> #include <linux/time.h> #include <linux/uuid.h> #include <linux/of.h> #include <net/addrconf.h> #include <linux/siphash.h> #include <linux/compiler.h> #include <linux/property.h> #include <linux/notifier.h> #ifdef CONFIG_BLOCK #include <linux/blkdev.h> #endif #include "../mm/internal.h" /* For the trace_print_flags arrays */ #include <asm/page.h> /* for PAGE_SIZE */ #include <asm/byteorder.h> /* cpu_to_le16 */ #include <linux/unaligned.h> #include <linux/string_helpers.h> #include "kstrtox.h" /* Disable pointer hashing if requested */ bool no_hash_pointers __ro_after_init; EXPORT_SYMBOL_GPL(no_hash_pointers); noinline static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars) { const char *cp; unsigned long long result = 0ULL; size_t prefix_chars; unsigned int rv; cp = _parse_integer_fixup_radix(startp, &base); prefix_chars = cp - startp; if (prefix_chars < max_chars) { rv = _parse_integer_limit(cp, base, &result, max_chars - prefix_chars); /* FIXME */ cp += (rv & ~KSTRTOX_OVERFLOW); } else { /* Field too short for prefix + digit, skip over without converting */ cp = startp + max_chars; } if (endp) *endp = (char *)cp; return result; } /** * simple_strtoull - convert a string to an unsigned long long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoull instead. */ noinline unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) { return simple_strntoull(cp, endp, base, INT_MAX); } EXPORT_SYMBOL(simple_strtoull); /** * simple_strtoul - convert a string to an unsigned long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoul instead. */ unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base) { return simple_strtoull(cp, endp, base); } EXPORT_SYMBOL(simple_strtoul); /** * simple_strtol - convert a string to a signed long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtol instead. */ long simple_strtol(const char *cp, char **endp, unsigned int base) { if (*cp == '-') return -simple_strtoul(cp + 1, endp, base); return simple_strtoul(cp, endp, base); } EXPORT_SYMBOL(simple_strtol); noinline static long long simple_strntoll(const char *cp, char **endp, unsigned int base, size_t max_chars) { /* * simple_strntoull() safely handles receiving max_chars==0 in the * case cp[0] == '-' && max_chars == 1. * If max_chars == 0 we can drop through and pass it to simple_strntoull() * and the content of *cp is irrelevant. */ if (*cp == '-' && max_chars > 0) return -simple_strntoull(cp + 1, endp, base, max_chars - 1); return simple_strntoull(cp, endp, base, max_chars); } /** * simple_strtoll - convert a string to a signed long long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoll instead. */ long long simple_strtoll(const char *cp, char **endp, unsigned int base) { return simple_strntoll(cp, endp, base, INT_MAX); } EXPORT_SYMBOL(simple_strtoll); static noinline_for_stack int skip_atoi(const char **s) { int i = 0; do { i = i*10 + *((*s)++) - '0'; } while (isdigit(**s)); return i; } /* * Decimal conversion is by far the most typical, and is used for * /proc and /sys data. This directly impacts e.g. top performance * with many processes running. We optimize it for speed by emitting * two characters at a time, using a 200 byte lookup table. This * roughly halves the number of multiplications compared to computing * the digits one at a time. Implementation strongly inspired by the * previous version, which in turn used ideas described at * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission * from the author, Douglas W. Jones). * * It turns out there is precisely one 26 bit fixed-point * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32 * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual * range happens to be somewhat larger (x <= 1073741898), but that's * irrelevant for our purpose. * * For dividing a number in the range [10^4, 10^6-1] by 100, we still * need a 32x32->64 bit multiply, so we simply use the same constant. * * For dividing a number in the range [100, 10^4-1] by 100, there are * several options. The simplest is (x * 0x147b) >> 19, which is valid * for all x <= 43698. */ static const u16 decpair[100] = { #define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030) _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9), _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19), _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29), _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39), _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49), _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59), _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69), _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79), _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89), _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99), #undef _ }; /* * This will print a single '0' even if r == 0, since we would * immediately jump to out_r where two 0s would be written but only * one of them accounted for in buf. This is needed by ip4_string * below. All other callers pass a non-zero value of r. */ static noinline_for_stack char *put_dec_trunc8(char *buf, unsigned r) { unsigned q; /* 1 <= r < 10^8 */ if (r < 100) goto out_r; /* 100 <= r < 10^8 */ q = (r * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 1 <= q < 10^6 */ if (q < 100) goto out_q; /* 100 <= q < 10^6 */ r = (q * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[q - 100*r]; buf += 2; /* 1 <= r < 10^4 */ if (r < 100) goto out_r; /* 100 <= r < 10^4 */ q = (r * 0x147b) >> 19; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; out_q: /* 1 <= q < 100 */ r = q; out_r: /* 1 <= r < 100 */ *((u16 *)buf) = decpair[r]; buf += r < 10 ? 1 : 2; return buf; } #if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64 static noinline_for_stack char *put_dec_full8(char *buf, unsigned r) { unsigned q; /* 0 <= r < 10^8 */ q = (r * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 0 <= q < 10^6 */ r = (q * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[q - 100*r]; buf += 2; /* 0 <= r < 10^4 */ q = (r * 0x147b) >> 19; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 0 <= q < 100 */ *((u16 *)buf) = decpair[q]; buf += 2; return buf; } static noinline_for_stack char *put_dec(char *buf, unsigned long long n) { if (n >= 100*1000*1000) buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); /* 1 <= n <= 1.6e11 */ if (n >= 100*1000*1000) buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); /* 1 <= n < 1e8 */ return put_dec_trunc8(buf, n); } #elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64 static void put_dec_full4(char *buf, unsigned r) { unsigned q; /* 0 <= r < 10^4 */ q = (r * 0x147b) >> 19; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 0 <= q < 100 */ *((u16 *)buf) = decpair[q]; } /* * Call put_dec_full4 on x % 10000, return x / 10000. * The approximation x/10000 == (x * 0x346DC5D7) >> 43 * holds for all x < 1,128,869,999. The largest value this * helper will ever be asked to convert is 1,125,520,955. * (second call in the put_dec code, assuming n is all-ones). */ static noinline_for_stack unsigned put_dec_helper4(char *buf, unsigned x) { uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; put_dec_full4(buf, x - q * 10000); return q; } /* Based on code by Douglas W. Jones found at * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour> * (with permission from the author). * Performs no 64-bit division and hence should be fast on 32-bit machines. */ static char *put_dec(char *buf, unsigned long long n) { uint32_t d3, d2, d1, q, h; if (n < 100*1000*1000) return put_dec_trunc8(buf, n); d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */ h = (n >> 32); d2 = (h ) & 0xffff; d3 = (h >> 16); /* implicit "& 0xffff" */ /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0 = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */ q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); q = put_dec_helper4(buf, q); q += 7671 * d3 + 9496 * d2 + 6 * d1; q = put_dec_helper4(buf+4, q); q += 4749 * d3 + 42 * d2; q = put_dec_helper4(buf+8, q); q += 281 * d3; buf += 12; if (q) buf = put_dec_trunc8(buf, q); else while (buf[-1] == '0') --buf; return buf; } #endif /* * Convert passed number to decimal string. * Returns the length of string. On buffer overflow, returns 0. * * If speed is not important, use snprintf(). It's easy to read the code. */ int num_to_str(char *buf, int size, unsigned long long num, unsigned int width) { /* put_dec requires 2-byte alignment of the buffer. */ char tmp[sizeof(num) * 3] __aligned(2); int idx, len; /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ if (num <= 9) { tmp[0] = '0' + num; len = 1; } else { len = put_dec(tmp, num) - tmp; } if (len > size || width > size) return 0; if (width > len) { width = width - len; for (idx = 0; idx < width; idx++) buf[idx] = ' '; } else { width = 0; } for (idx = 0; idx < len; ++idx) buf[idx + width] = tmp[len - idx - 1]; return len + width; } #define SIGN 1 /* unsigned/signed, must be 1 */ #define LEFT 2 /* left justified */ #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ #define ZEROPAD 16 /* pad with zero, must be 16 == '0' - ' ' */ #define SMALL 32 /* use lowercase in hex (must be 32 == 0x20) */ #define SPECIAL 64 /* prefix hex with "0x", octal with "0" */ static_assert(SIGN == 1); static_assert(ZEROPAD == ('0' - ' ')); static_assert(SMALL == ('a' ^ 'A')); enum format_type { FORMAT_TYPE_NONE, /* Just a string part */ FORMAT_TYPE_WIDTH, FORMAT_TYPE_PRECISION, FORMAT_TYPE_CHAR, FORMAT_TYPE_STR, FORMAT_TYPE_PTR, FORMAT_TYPE_PERCENT_CHAR, FORMAT_TYPE_INVALID, FORMAT_TYPE_LONG_LONG, FORMAT_TYPE_ULONG, FORMAT_TYPE_LONG, FORMAT_TYPE_UBYTE, FORMAT_TYPE_BYTE, FORMAT_TYPE_USHORT, FORMAT_TYPE_SHORT, FORMAT_TYPE_UINT, FORMAT_TYPE_INT, FORMAT_TYPE_SIZE_T, FORMAT_TYPE_PTRDIFF }; struct printf_spec { unsigned int type:8; /* format_type enum */ signed int field_width:24; /* width of output field */ unsigned int flags:8; /* flags to number() */ unsigned int base:8; /* number base, 8, 10 or 16 only */ signed int precision:16; /* # of digits/chars */ } __packed; static_assert(sizeof(struct printf_spec) == 8); #define FIELD_WIDTH_MAX ((1 << 23) - 1) #define PRECISION_MAX ((1 << 15) - 1) static noinline_for_stack char *number(char *buf, char *end, unsigned long long num, struct printf_spec spec) { /* put_dec requires 2-byte alignment of the buffer. */ char tmp[3 * sizeof(num)] __aligned(2); char sign; char locase; int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); int i; bool is_zero = num == 0LL; int field_width = spec.field_width; int precision = spec.precision; /* locase = 0 or 0x20. ORing digits or letters with 'locase' * produces same digits or (maybe lowercased) letters */ locase = (spec.flags & SMALL); if (spec.flags & LEFT) spec.flags &= ~ZEROPAD; sign = 0; if (spec.flags & SIGN) { if ((signed long long)num < 0) { sign = '-'; num = -(signed long long)num; field_width--; } else if (spec.flags & PLUS) { sign = '+'; field_width--; } else if (spec.flags & SPACE) { sign = ' '; field_width--; } } if (need_pfx) { if (spec.base == 16) field_width -= 2; else if (!is_zero) field_width--; } /* generate full string in tmp[], in reverse order */ i = 0; if (num < spec.base) tmp[i++] = hex_asc_upper[num] | locase; else if (spec.base != 10) { /* 8 or 16 */ int mask = spec.base - 1; int shift = 3; if (spec.base == 16) shift = 4; do { tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase); num >>= shift; } while (num); } else { /* base 10 */ i = put_dec(tmp, num) - tmp; } /* printing 100 using %2d gives "100", not "00" */ if (i > precision) precision = i; /* leading space padding */ field_width -= precision; if (!(spec.flags & (ZEROPAD | LEFT))) { while (--field_width >= 0) { if (buf < end) *buf = ' '; ++buf; } } /* sign */ if (sign) { if (buf < end) *buf = sign; ++buf; } /* "0x" / "0" prefix */ if (need_pfx) { if (spec.base == 16 || !is_zero) { if (buf < end) *buf = '0'; ++buf; } if (spec.base == 16) { if (buf < end) *buf = ('X' | locase); ++buf; } } /* zero or space padding */ if (!(spec.flags & LEFT)) { char c = ' ' + (spec.flags & ZEROPAD); while (--field_width >= 0) { if (buf < end) *buf = c; ++buf; } } /* hmm even more zero padding? */ while (i <= --precision) { if (buf < end) *buf = '0'; ++buf; } /* actual digits of result */ while (--i >= 0) { if (buf < end) *buf = tmp[i]; ++buf; } /* trailing space padding */ while (--field_width >= 0) { if (buf < end) *buf = ' '; ++buf; } return buf; } static noinline_for_stack char *special_hex_number(char *buf, char *end, unsigned long long num, int size) { struct printf_spec spec; spec.type = FORMAT_TYPE_PTR; spec.field_width = 2 + 2 * size; /* 0x + hex */ spec.flags = SPECIAL | SMALL | ZEROPAD; spec.base = 16; spec.precision = -1; return number(buf, end, num, spec); } static void move_right(char *buf, char *end, unsigned len, unsigned spaces) { size_t size; if (buf >= end) /* nowhere to put anything */ return; size = end - buf; if (size <= spaces) { memset(buf, ' ', size); return; } if (len) { if (len > size - spaces) len = size - spaces; memmove(buf + spaces, buf, len); } memset(buf, ' ', spaces); } /* * Handle field width padding for a string. * @buf: current buffer position * @n: length of string * @end: end of output buffer * @spec: for field width and flags * Returns: new buffer position after padding. */ static noinline_for_stack char *widen_string(char *buf, int n, char *end, struct printf_spec spec) { unsigned spaces; if (likely(n >= spec.field_width)) return buf; /* we want to pad the sucker */ spaces = spec.field_width - n; if (!(spec.flags & LEFT)) { move_right(buf - n, end, n, spaces); return buf + spaces; } while (spaces--) { if (buf < end) *buf = ' '; ++buf; } return buf; } /* Handle string from a well known address. */ static char *string_nocheck(char *buf, char *end, const char *s, struct printf_spec spec) { int len = 0; int lim = spec.precision; while (lim--) { char c = *s++; if (!c) break; if (buf < end) *buf = c; ++buf; ++len; } return widen_string(buf, len, end, spec); } static char *err_ptr(char *buf, char *end, void *ptr, struct printf_spec spec) { int err = PTR_ERR(ptr); const char *sym = errname(err); if (sym) return string_nocheck(buf, end, sym, spec); /* * Somebody passed ERR_PTR(-1234) or some other non-existing * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to * printing it as its decimal representation. */ spec.flags |= SIGN; spec.base = 10; return number(buf, end, err, spec); } /* Be careful: error messages must fit into the given buffer. */ static char *error_string(char *buf, char *end, const char *s, struct printf_spec spec) { /* * Hard limit to avoid a completely insane messages. It actually * works pretty well because most error messages are in * the many pointer format modifiers. */ if (spec.precision == -1) spec.precision = 2 * sizeof(void *); return string_nocheck(buf, end, s, spec); } /* * Do not call any complex external code here. Nested printk()/vsprintf() * might cause infinite loops. Failures might break printk() and would * be hard to debug. */ static const char *check_pointer_msg(const void *ptr) { if (!ptr) return "(null)"; if ((unsigned long)ptr < PAGE_SIZE || IS_ERR_VALUE(ptr)) return "(efault)"; return NULL; } static int check_pointer(char **buf, char *end, const void *ptr, struct printf_spec spec) { const char *err_msg; err_msg = check_pointer_msg(ptr); if (err_msg) { *buf = error_string(*buf, end, err_msg, spec); return -EFAULT; } return 0; } static noinline_for_stack char *string(char *buf, char *end, const char *s, struct printf_spec spec) { if (check_pointer(&buf, end, s, spec)) return buf; return string_nocheck(buf, end, s, spec); } static char *pointer_string(char *buf, char *end, const void *ptr, struct printf_spec spec) { spec.base = 16; spec.flags |= SMALL; if (spec.field_width == -1) { spec.field_width = 2 * sizeof(ptr); spec.flags |= ZEROPAD; } return number(buf, end, (unsigned long int)ptr, spec); } /* Make pointers available for printing early in the boot sequence. */ static int debug_boot_weak_hash __ro_after_init; static int __init debug_boot_weak_hash_enable(char *str) { debug_boot_weak_hash = 1; pr_info("debug_boot_weak_hash enabled\n"); return 0; } early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable); static bool filled_random_ptr_key __read_mostly; static siphash_key_t ptr_key __read_mostly; static int fill_ptr_key(struct notifier_block *nb, unsigned long action, void *data) { get_random_bytes(&ptr_key, sizeof(ptr_key)); /* Pairs with smp_rmb() before reading ptr_key. */ smp_wmb(); WRITE_ONCE(filled_random_ptr_key, true); return NOTIFY_DONE; } static int __init vsprintf_init_hashval(void) { static struct notifier_block fill_ptr_key_nb = { .notifier_call = fill_ptr_key }; execute_with_initialized_rng(&fill_ptr_key_nb); return 0; } subsys_initcall(vsprintf_init_hashval) /* Maps a pointer to a 32 bit unique identifier. */ static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out) { unsigned long hashval; if (!READ_ONCE(filled_random_ptr_key)) return -EBUSY; /* Pairs with smp_wmb() after writing ptr_key. */ smp_rmb(); #ifdef CONFIG_64BIT hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); /* * Mask off the first 32 bits, this makes explicit that we have * modified the address (and 32 bits is plenty for a unique ID). */ hashval = hashval & 0xffffffff; #else hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key); #endif *hashval_out = hashval; return 0; } int ptr_to_hashval(const void *ptr, unsigned long *hashval_out) { return __ptr_to_hashval(ptr, hashval_out); } static char *ptr_to_id(char *buf, char *end, const void *ptr, struct printf_spec spec) { const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)"; unsigned long hashval; int ret; /* * Print the real pointer value for NULL and error pointers, * as they are not actual addresses. */ if (IS_ERR_OR_NULL(ptr)) return pointer_string(buf, end, ptr, spec); /* When debugging early boot use non-cryptographically secure hash. */ if (unlikely(debug_boot_weak_hash)) { hashval = hash_long((unsigned long)ptr, 32); return pointer_string(buf, end, (const void *)hashval, spec); } ret = __ptr_to_hashval(ptr, &hashval); if (ret) { spec.field_width = 2 * sizeof(ptr); /* string length must be less than default_width */ return error_string(buf, end, str, spec); } return pointer_string(buf, end, (const void *)hashval, spec); } static char *default_pointer(char *buf, char *end, const void *ptr, struct printf_spec spec) { /* * default is to _not_ leak addresses, so hash before printing, * unless no_hash_pointers is specified on the command line. */ if (unlikely(no_hash_pointers)) return pointer_string(buf, end, ptr, spec); return ptr_to_id(buf, end, ptr, spec); } int kptr_restrict __read_mostly; static noinline_for_stack char *restricted_pointer(char *buf, char *end, const void *ptr, struct printf_spec spec) { switch (kptr_restrict) { case 0: /* Handle as %p, hash and do _not_ leak addresses. */ return default_pointer(buf, end, ptr, spec); case 1: { const struct cred *cred; /* * kptr_restrict==1 cannot be used in IRQ context * because its test for CAP_SYSLOG would be meaningless. */ if (in_hardirq() || in_serving_softirq() || in_nmi()) { if (spec.field_width == -1) spec.field_width = 2 * sizeof(ptr); return error_string(buf, end, "pK-error", spec); } /* * Only print the real pointer value if the current * process has CAP_SYSLOG and is running with the * same credentials it started with. This is because * access to files is checked at open() time, but %pK * checks permission at read() time. We don't want to * leak pointer values if a binary opens a file using * %pK and then elevates privileges before reading it. */ cred = current_cred(); if (!has_capability_noaudit(current, CAP_SYSLOG) || !uid_eq(cred->euid, cred->uid) || !gid_eq(cred->egid, cred->gid)) ptr = NULL; break; } case 2: default: /* Always print 0's for %pK */ ptr = NULL; break; } return pointer_string(buf, end, ptr, spec); } static noinline_for_stack char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec, const char *fmt) { const char *array[4], *s; const struct dentry *p; int depth; int i, n; switch (fmt[1]) { case '2': case '3': case '4': depth = fmt[1] - '0'; break; default: depth = 1; } rcu_read_lock(); for (i = 0; i < depth; i++, d = p) { if (check_pointer(&buf, end, d, spec)) { rcu_read_unlock(); return buf; } p = READ_ONCE(d->d_parent); array[i] = READ_ONCE(d->d_name.name); if (p == d) { if (i) array[i] = ""; i++; break; } } s = array[--i]; for (n = 0; n != spec.precision; n++, buf++) { char c = *s++; if (!c) { if (!i) break; c = '/'; s = array[--i]; } if (buf < end) *buf = c; } rcu_read_unlock(); return widen_string(buf, n, end, spec); } static noinline_for_stack char *file_dentry_name(char *buf, char *end, const struct file *f, struct printf_spec spec, const char *fmt) { if (check_pointer(&buf, end, f, spec)) return buf; return dentry_name(buf, end, f->f_path.dentry, spec, fmt); } #ifdef CONFIG_BLOCK static noinline_for_stack char *bdev_name(char *buf, char *end, struct block_device *bdev, struct printf_spec spec, const char *fmt) { struct gendisk *hd; if (check_pointer(&buf, end, bdev, spec)) return buf; hd = bdev->bd_disk; buf = string(buf, end, hd->disk_name, spec); if (bdev_is_partition(bdev)) { if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) { if (buf < end) *buf = 'p'; buf++; } buf = number(buf, end, bdev_partno(bdev), spec); } return buf; } #endif static noinline_for_stack char *symbol_string(char *buf, char *end, void *ptr, struct printf_spec spec, const char *fmt) { unsigned long value; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; #endif if (fmt[1] == 'R') ptr = __builtin_extract_return_addr(ptr); value = (unsigned long)ptr; #ifdef CONFIG_KALLSYMS if (*fmt == 'B' && fmt[1] == 'b') sprint_backtrace_build_id(sym, value); else if (*fmt == 'B') sprint_backtrace(sym, value); else if (*fmt == 'S' && (fmt[1] == 'b' || (fmt[1] == 'R' && fmt[2] == 'b'))) sprint_symbol_build_id(sym, value); else if (*fmt != 's') sprint_symbol(sym, value); else sprint_symbol_no_offset(sym, value); return string_nocheck(buf, end, sym, spec); #else return special_hex_number(buf, end, value, sizeof(void *)); #endif } static const struct printf_spec default_str_spec = { .field_width = -1, .precision = -1, }; static const struct printf_spec default_flag_spec = { .base = 16, .precision = -1, .flags = SPECIAL | SMALL, }; static const struct printf_spec default_dec_spec = { .base = 10, .precision = -1, }; static const struct printf_spec default_dec02_spec = { .base = 10, .field_width = 2, .precision = -1, .flags = ZEROPAD, }; static const struct printf_spec default_dec04_spec = { .base = 10, .field_width = 4, .precision = -1, .flags = ZEROPAD, }; static noinline_for_stack char *hex_range(char *buf, char *end, u64 start_val, u64 end_val, struct printf_spec spec) { buf = number(buf, end, start_val, spec); if (start_val == end_val) return buf; if (buf < end) *buf = '-'; ++buf; return number(buf, end, end_val, spec); } static noinline_for_stack char *resource_string(char *buf, char *end, struct resource *res, struct printf_spec spec, const char *fmt) { #ifndef IO_RSRC_PRINTK_SIZE #define IO_RSRC_PRINTK_SIZE 6 #endif #ifndef MEM_RSRC_PRINTK_SIZE #define MEM_RSRC_PRINTK_SIZE 10 #endif static const struct printf_spec io_spec = { .base = 16, .field_width = IO_RSRC_PRINTK_SIZE, .precision = -1, .flags = SPECIAL | SMALL | ZEROPAD, }; static const struct printf_spec mem_spec = { .base = 16, .field_width = MEM_RSRC_PRINTK_SIZE, .precision = -1, .flags = SPECIAL | SMALL | ZEROPAD, }; static const struct printf_spec bus_spec = { .base = 16, .field_width = 2, .precision = -1, .flags = SMALL | ZEROPAD, }; static const struct printf_spec str_spec = { .field_width = -1, .precision = 10, .flags = LEFT, }; /* 32-bit res (sizeof==4): 10 chars in dec, 10 in hex ("0x" + 8) * 64-bit res (sizeof==8): 20 chars in dec, 18 in hex ("0x" + 16) */ #define RSRC_BUF_SIZE ((2 * sizeof(resource_size_t)) + 4) #define FLAG_BUF_SIZE (2 * sizeof(res->flags)) #define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]") #define RAW_BUF_SIZE sizeof("[mem - flags 0x]") char sym[MAX(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE, 2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)]; char *p = sym, *pend = sym + sizeof(sym); int decode = (fmt[0] == 'R') ? 1 : 0; const struct printf_spec *specp; if (check_pointer(&buf, end, res, spec)) return buf; *p++ = '['; if (res->flags & IORESOURCE_IO) { p = string_nocheck(p, pend, "io ", str_spec); specp = &io_spec; } else if (res->flags & IORESOURCE_MEM) { p = string_nocheck(p, pend, "mem ", str_spec); specp = &mem_spec; } else if (res->flags & IORESOURCE_IRQ) { p = string_nocheck(p, pend, "irq ", str_spec); specp = &default_dec_spec; } else if (res->flags & IORESOURCE_DMA) { p = string_nocheck(p, pend, "dma ", str_spec); specp = &default_dec_spec; } else if (res->flags & IORESOURCE_BUS) { p = string_nocheck(p, pend, "bus ", str_spec); specp = &bus_spec; } else { p = string_nocheck(p, pend, "??? ", str_spec); specp = &mem_spec; decode = 0; } if (decode && res->flags & IORESOURCE_UNSET) { p = string_nocheck(p, pend, "size ", str_spec); p = number(p, pend, resource_size(res), *specp); } else { p = hex_range(p, pend, res->start, res->end, *specp); } if (decode) { if (res->flags & IORESOURCE_MEM_64) p = string_nocheck(p, pend, " 64bit", str_spec); if (res->flags & IORESOURCE_PREFETCH) p = string_nocheck(p, pend, " pref", str_spec); if (res->flags & IORESOURCE_WINDOW) p = string_nocheck(p, pend, " window", str_spec); if (res->flags & IORESOURCE_DISABLED) p = string_nocheck(p, pend, " disabled", str_spec); } else { p = string_nocheck(p, pend, " flags ", str_spec); p = number(p, pend, res->flags, default_flag_spec); } *p++ = ']'; *p = '\0'; return string_nocheck(buf, end, sym, spec); } static noinline_for_stack char *range_string(char *buf, char *end, const struct range *range, struct printf_spec spec, const char *fmt) { char sym[sizeof("[range 0x0123456789abcdef-0x0123456789abcdef]")]; char *p = sym, *pend = sym + sizeof(sym); struct printf_spec range_spec = { .field_width = 2 + 2 * sizeof(range->start), /* 0x + 2 * 8 */ .flags = SPECIAL | SMALL | ZEROPAD, .base = 16, .precision = -1, }; if (check_pointer(&buf, end, range, spec)) return buf; p = string_nocheck(p, pend, "[range ", default_str_spec); p = hex_range(p, pend, range->start, range->end, range_spec); *p++ = ']'; *p = '\0'; return string_nocheck(buf, end, sym, spec); } static noinline_for_stack char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { int i, len = 1; /* if we pass '%ph[CDN]', field width remains negative value, fallback to the default */ char separator; if (spec.field_width == 0) /* nothing to print */ return buf; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'C': separator = ':'; break; case 'D': separator = '-'; break; case 'N': separator = 0; break; default: separator = ' '; break; } if (spec.field_width > 0) len = min_t(int, spec.field_width, 64); for (i = 0; i < len; ++i) { if (buf < end) *buf = hex_asc_hi(addr[i]); ++buf; if (buf < end) *buf = hex_asc_lo(addr[i]); ++buf; if (separator && i != len - 1) { if (buf < end) *buf = separator; ++buf; } } return buf; } static noinline_for_stack char *bitmap_string(char *buf, char *end, const unsigned long *bitmap, struct printf_spec spec, const char *fmt) { const int CHUNKSZ = 32; int nr_bits = max_t(int, spec.field_width, 0); int i, chunksz; bool first = true; if (check_pointer(&buf, end, bitmap, spec)) return buf; /* reused to print numbers */ spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 }; chunksz = nr_bits & (CHUNKSZ - 1); if (chunksz == 0) chunksz = CHUNKSZ; i = ALIGN(nr_bits, CHUNKSZ) - CHUNKSZ; for (; i >= 0; i -= CHUNKSZ) { u32 chunkmask, val; int word, bit; chunkmask = ((1ULL << chunksz) - 1); word = i / BITS_PER_LONG; bit = i % BITS_PER_LONG; val = (bitmap[word] >> bit) & chunkmask; if (!first) { if (buf < end) *buf = ','; buf++; } first = false; spec.field_width = DIV_ROUND_UP(chunksz, 4); buf = number(buf, end, val, spec); chunksz = CHUNKSZ; } return buf; } static noinline_for_stack char *bitmap_list_string(char *buf, char *end, const unsigned long *bitmap, struct printf_spec spec, const char *fmt) { int nr_bits = max_t(int, spec.field_width, 0); bool first = true; int rbot, rtop; if (check_pointer(&buf, end, bitmap, spec)) return buf; for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) { if (!first) { if (buf < end) *buf = ','; buf++; } first = false; buf = number(buf, end, rbot, default_dec_spec); if (rtop == rbot + 1) continue; if (buf < end) *buf = '-'; buf = number(++buf, end, rtop - 1, default_dec_spec); } return buf; } static noinline_for_stack char *mac_address_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")]; char *p = mac_addr; int i; char separator; bool reversed = false; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'F': separator = '-'; break; case 'R': reversed = true; fallthrough; default: separator = ':'; break; } for (i = 0; i < 6; i++) { if (reversed) p = hex_byte_pack(p, addr[5 - i]); else p = hex_byte_pack(p, addr[i]); if (fmt[0] == 'M' && i != 5) *p++ = separator; } *p = '\0'; return string_nocheck(buf, end, mac_addr, spec); } static noinline_for_stack char *ip4_string(char *p, const u8 *addr, const char *fmt) { int i; bool leading_zeros = (fmt[0] == 'i'); int index; int step; switch (fmt[2]) { case 'h': #ifdef __BIG_ENDIAN index = 0; step = 1; #else index = 3; step = -1; #endif break; case 'l': index = 3; step = -1; break; case 'n': case 'b': default: index = 0; step = 1; break; } for (i = 0; i < 4; i++) { char temp[4] __aligned(2); /* hold each IP quad in reverse order */ int digits = put_dec_trunc8(temp, addr[index]) - temp; if (leading_zeros) { if (digits < 3) *p++ = '0'; if (digits < 2) *p++ = '0'; } /* reverse the digits in the quad */ while (digits--) *p++ = temp[digits]; if (i < 3) *p++ = '.'; index += step; } *p = '\0'; return p; } static noinline_for_stack char *ip6_compressed_string(char *p, const char *addr) { int i, j, range; unsigned char zerolength[8]; int longest = 1; int colonpos = -1; u16 word; u8 hi, lo; bool needcolon = false; bool useIPv4; struct in6_addr in6; memcpy(&in6, addr, sizeof(struct in6_addr)); useIPv4 = ipv6_addr_v4mapped(&in6) || ipv6_addr_is_isatap(&in6); memset(zerolength, 0, sizeof(zerolength)); if (useIPv4) range = 6; else range = 8; /* find position of longest 0 run */ for (i = 0; i < range; i++) { for (j = i; j < range; j++) { if (in6.s6_addr16[j] != 0) break; zerolength[i]++; } } for (i = 0; i < range; i++) { if (zerolength[i] > longest) { longest = zerolength[i]; colonpos = i; } } if (longest == 1) /* don't compress a single 0 */ colonpos = -1; /* emit address */ for (i = 0; i < range; i++) { if (i == colonpos) { if (needcolon || i == 0) *p++ = ':'; *p++ = ':'; needcolon = false; i += longest - 1; continue; } if (needcolon) { *p++ = ':'; needcolon = false; } /* hex u16 without leading 0s */ word = ntohs(in6.s6_addr16[i]); hi = word >> 8; lo = word & 0xff; if (hi) { if (hi > 0x0f) p = hex_byte_pack(p, hi); else *p++ = hex_asc_lo(hi); p = hex_byte_pack(p, lo); } else if (lo > 0x0f) p = hex_byte_pack(p, lo); else *p++ = hex_asc_lo(lo); needcolon = true; } if (useIPv4) { if (needcolon) *p++ = ':'; p = ip4_string(p, &in6.s6_addr[12], "I4"); } *p = '\0'; return p; } static noinline_for_stack char *ip6_string(char *p, const char *addr, const char *fmt) { int i; for (i = 0; i < 8; i++) { p = hex_byte_pack(p, *addr++); p = hex_byte_pack(p, *addr++); if (fmt[0] == 'I' && i != 7) *p++ = ':'; } *p = '\0'; return p; } static noinline_for_stack char *ip6_addr_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")]; if (fmt[0] == 'I' && fmt[2] == 'c') ip6_compressed_string(ip6_addr, addr); else ip6_string(ip6_addr, addr, fmt); return string_nocheck(buf, end, ip6_addr, spec); } static noinline_for_stack char *ip4_addr_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char ip4_addr[sizeof("255.255.255.255")]; ip4_string(ip4_addr, addr, fmt); return string_nocheck(buf, end, ip4_addr, spec); } static noinline_for_stack char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, struct printf_spec spec, const char *fmt) { bool have_p = false, have_s = false, have_f = false, have_c = false; char ip6_addr[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") + sizeof(":12345") + sizeof("/123456789") + sizeof("%1234567890")]; char *p = ip6_addr, *pend = ip6_addr + sizeof(ip6_addr); const u8 *addr = (const u8 *) &sa->sin6_addr; char fmt6[2] = { fmt[0], '6' }; u8 off = 0; fmt++; while (isalpha(*++fmt)) { switch (*fmt) { case 'p': have_p = true; break; case 'f': have_f = true; break; case 's': have_s = true; break; case 'c': have_c = true; break; } } if (have_p || have_s || have_f) { *p = '['; off = 1; } if (fmt6[0] == 'I' && have_c) p = ip6_compressed_string(ip6_addr + off, addr); else p = ip6_string(ip6_addr + off, addr, fmt6); if (have_p || have_s || have_f) *p++ = ']'; if (have_p) { *p++ = ':'; p = number(p, pend, ntohs(sa->sin6_port), spec); } if (have_f) { *p++ = '/'; p = number(p, pend, ntohl(sa->sin6_flowinfo & IPV6_FLOWINFO_MASK), spec); } if (have_s) { *p++ = '%'; p = number(p, pend, sa->sin6_scope_id, spec); } *p = '\0'; return string_nocheck(buf, end, ip6_addr, spec); } static noinline_for_stack char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, struct printf_spec spec, const char *fmt) { bool have_p = false; char *p, ip4_addr[sizeof("255.255.255.255") + sizeof(":12345")]; char *pend = ip4_addr + sizeof(ip4_addr); const u8 *addr = (const u8 *) &sa->sin_addr.s_addr; char fmt4[3] = { fmt[0], '4', 0 }; fmt++; while (isalpha(*++fmt)) { switch (*fmt) { case 'p': have_p = true; break; case 'h': case 'l': case 'n': case 'b': fmt4[2] = *fmt; break; } } p = ip4_string(ip4_addr, addr, fmt4); if (have_p) { *p++ = ':'; p = number(p, pend, ntohs(sa->sin_port), spec); } *p = '\0'; return string_nocheck(buf, end, ip4_addr, spec); } static noinline_for_stack char *ip_addr_string(char *buf, char *end, const void *ptr, struct printf_spec spec, const char *fmt) { char *err_fmt_msg; if (check_pointer(&buf, end, ptr, spec)) return buf; switch (fmt[1]) { case '6': return ip6_addr_string(buf, end, ptr, spec, fmt); case '4': return ip4_addr_string(buf, end, ptr, spec, fmt); case 'S': { const union { struct sockaddr raw; struct sockaddr_in v4; struct sockaddr_in6 v6; } *sa = ptr; switch (sa->raw.sa_family) { case AF_INET: return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt); case AF_INET6: return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt); default: return error_string(buf, end, "(einval)", spec); }} } err_fmt_msg = fmt[0] == 'i' ? "(%pi?)" : "(%pI?)"; return error_string(buf, end, err_fmt_msg, spec); } static noinline_for_stack char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { bool found = true; int count = 1; unsigned int flags = 0; int len; if (spec.field_width == 0) return buf; /* nothing to print */ if (check_pointer(&buf, end, addr, spec)) return buf; do { switch (fmt[count++]) { case 'a': flags |= ESCAPE_ANY; break; case 'c': flags |= ESCAPE_SPECIAL; break; case 'h': flags |= ESCAPE_HEX; break; case 'n': flags |= ESCAPE_NULL; break; case 'o': flags |= ESCAPE_OCTAL; break; case 'p': flags |= ESCAPE_NP; break; case 's': flags |= ESCAPE_SPACE; break; default: found = false; break; } } while (found); if (!flags) flags = ESCAPE_ANY_NP; len = spec.field_width < 0 ? 1 : spec.field_width; /* * string_escape_mem() writes as many characters as it can to * the given buffer, and returns the total size of the output * had the buffer been big enough. */ buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL); return buf; } static char *va_format(char *buf, char *end, struct va_format *va_fmt, struct printf_spec spec, const char *fmt) { va_list va; if (check_pointer(&buf, end, va_fmt, spec)) return buf; va_copy(va, *va_fmt->va); buf += vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va); va_end(va); return buf; } static noinline_for_stack char *uuid_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char uuid[UUID_STRING_LEN + 1]; char *p = uuid; int i; const u8 *index = uuid_index; bool uc = false; if (check_pointer(&buf, end, addr, spec)) return buf; switch (*(++fmt)) { case 'L': uc = true; fallthrough; case 'l': index = guid_index; break; case 'B': uc = true; break; } for (i = 0; i < 16; i++) { if (uc) p = hex_byte_pack_upper(p, addr[index[i]]); else p = hex_byte_pack(p, addr[index[i]]); switch (i) { case 3: case 5: case 7: case 9: *p++ = '-'; break; } } *p = 0; return string_nocheck(buf, end, uuid, spec); } static noinline_for_stack char *netdev_bits(char *buf, char *end, const void *addr, struct printf_spec spec, const char *fmt) { unsigned long long num; int size; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'F': num = *(const netdev_features_t *)addr; size = sizeof(netdev_features_t); break; default: return error_string(buf, end, "(%pN?)", spec); } return special_hex_number(buf, end, num, size); } static noinline_for_stack char *fourcc_string(char *buf, char *end, const u32 *fourcc, struct printf_spec spec, const char *fmt) { char output[sizeof("0123 little-endian (0x01234567)")]; char *p = output; unsigned int i; u32 orig, val; if (fmt[1] != 'c' || fmt[2] != 'c') return error_string(buf, end, "(%p4?)", spec); if (check_pointer(&buf, end, fourcc, spec)) return buf; orig = get_unaligned(fourcc); val = orig & ~BIT(31); for (i = 0; i < sizeof(u32); i++) { unsigned char c = val >> (i * 8); /* Print non-control ASCII characters as-is, dot otherwise */ *p++ = isascii(c) && isprint(c) ? c : '.'; } *p++ = ' '; strcpy(p, orig & BIT(31) ? "big-endian" : "little-endian"); p += strlen(p); *p++ = ' '; *p++ = '('; p = special_hex_number(p, output + sizeof(output) - 2, orig, sizeof(u32)); *p++ = ')'; *p = '\0'; return string(buf, end, output, spec); } static noinline_for_stack char *address_val(char *buf, char *end, const void *addr, struct printf_spec spec, const char *fmt) { unsigned long long num; int size; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'd': num = *(const dma_addr_t *)addr; size = sizeof(dma_addr_t); break; case 'p': default: num = *(const phys_addr_t *)addr; size = sizeof(phys_addr_t); break; } return special_hex_number(buf, end, num, size); } static noinline_for_stack char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r) { int year = tm->tm_year + (r ? 0 : 1900); int mon = tm->tm_mon + (r ? 0 : 1); buf = number(buf, end, year, default_dec04_spec); if (buf < end) *buf = '-'; buf++; buf = number(buf, end, mon, default_dec02_spec); if (buf < end) *buf = '-'; buf++; return number(buf, end, tm->tm_mday, default_dec02_spec); } static noinline_for_stack char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r) { buf = number(buf, end, tm->tm_hour, default_dec02_spec); if (buf < end) *buf = ':'; buf++; buf = number(buf, end, tm->tm_min, default_dec02_spec); if (buf < end) *buf = ':'; buf++; return number(buf, end, tm->tm_sec, default_dec02_spec); } static noinline_for_stack char *rtc_str(char *buf, char *end, const struct rtc_time *tm, struct printf_spec spec, const char *fmt) { bool have_t = true, have_d = true; bool raw = false, iso8601_separator = true; bool found = true; int count = 2; if (check_pointer(&buf, end, tm, spec)) return buf; switch (fmt[count]) { case 'd': have_t = false; count++; break; case 't': have_d = false; count++; break; } do { switch (fmt[count++]) { case 'r': raw = true; break; case 's': iso8601_separator = false; break; default: found = false; break; } } while (found); if (have_d) buf = date_str(buf, end, tm, raw); if (have_d && have_t) { if (buf < end) *buf = iso8601_separator ? 'T' : ' '; buf++; } if (have_t) buf = time_str(buf, end, tm, raw); return buf; } static noinline_for_stack char *time64_str(char *buf, char *end, const time64_t time, struct printf_spec spec, const char *fmt) { struct rtc_time rtc_time; struct tm tm; time64_to_tm(time, 0, &tm); rtc_time.tm_sec = tm.tm_sec; rtc_time.tm_min = tm.tm_min; rtc_time.tm_hour = tm.tm_hour; rtc_time.tm_mday = tm.tm_mday; rtc_time.tm_mon = tm.tm_mon; rtc_time.tm_year = tm.tm_year; rtc_time.tm_wday = tm.tm_wday; rtc_time.tm_yday = tm.tm_yday; rtc_time.tm_isdst = 0; return rtc_str(buf, end, &rtc_time, spec, fmt); } static noinline_for_stack char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec, const char *fmt) { switch (fmt[1]) { case 'R': return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt); case 'T': return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt); default: return error_string(buf, end, "(%pt?)", spec); } } static noinline_for_stack char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, const char *fmt) { if (!IS_ENABLED(CONFIG_HAVE_CLK)) return error_string(buf, end, "(%pC?)", spec); if (check_pointer(&buf, end, clk, spec)) return buf; switch (fmt[1]) { case 'n': default: #ifdef CONFIG_COMMON_CLK return string(buf, end, __clk_get_name(clk), spec); #else return ptr_to_id(buf, end, clk, spec); #endif } } static char *format_flags(char *buf, char *end, unsigned long flags, const struct trace_print_flags *names) { unsigned long mask; for ( ; flags && names->name; names++) { mask = names->mask; if ((flags & mask) != mask) continue; buf = string(buf, end, names->name, default_str_spec); flags &= ~mask; if (flags) { if (buf < end) *buf = '|'; buf++; } } if (flags) buf = number(buf, end, flags, default_flag_spec); return buf; } struct page_flags_fields { int width; int shift; int mask; const struct printf_spec *spec; const char *name; }; static const struct page_flags_fields pff[] = { {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, &default_dec_spec, "section"}, {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, &default_dec_spec, "node"}, {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, &default_dec_spec, "zone"}, {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, &default_flag_spec, "lastcpupid"}, {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, &default_flag_spec, "kasantag"}, }; static char *format_page_flags(char *buf, char *end, unsigned long flags) { unsigned long main_flags = flags & PAGEFLAGS_MASK; bool append = false; int i; buf = number(buf, end, flags, default_flag_spec); if (buf < end) *buf = '('; buf++; /* Page flags from the main area. */ if (main_flags) { buf = format_flags(buf, end, main_flags, pageflag_names); append = true; } /* Page flags from the fields area */ for (i = 0; i < ARRAY_SIZE(pff); i++) { /* Skip undefined fields. */ if (!pff[i].width) continue; /* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */ if (append) { if (buf < end) *buf = '|'; buf++; } buf = string(buf, end, pff[i].name, default_str_spec); if (buf < end) *buf = '='; buf++; buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask, *pff[i].spec); append = true; } if (buf < end) *buf = ')'; buf++; return buf; } static noinline_for_stack char *flags_string(char *buf, char *end, void *flags_ptr, struct printf_spec spec, const char *fmt) { unsigned long flags; const struct trace_print_flags *names; if (check_pointer(&buf, end, flags_ptr, spec)) return buf; switch (fmt[1]) { case 'p': return format_page_flags(buf, end, *(unsigned long *)flags_ptr); case 'v': flags = *(unsigned long *)flags_ptr; names = vmaflag_names; break; case 'g': flags = (__force unsigned long)(*(gfp_t *)flags_ptr); names = gfpflag_names; break; default: return error_string(buf, end, "(%pG?)", spec); } return format_flags(buf, end, flags, names); } static noinline_for_stack char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf, char *end) { int depth; /* Loop starting from the root node to the current node. */ for (depth = fwnode_count_parents(fwnode); depth >= 0; depth--) { /* * Only get a reference for other nodes (i.e. parent nodes). * fwnode refcount may be 0 here. */ struct fwnode_handle *__fwnode = depth ? fwnode_get_nth_parent(fwnode, depth) : fwnode; buf = string(buf, end, fwnode_get_name_prefix(__fwnode), default_str_spec); buf = string(buf, end, fwnode_get_name(__fwnode), default_str_spec); if (depth) fwnode_handle_put(__fwnode); } return buf; } static noinline_for_stack char *device_node_string(char *buf, char *end, struct device_node *dn, struct printf_spec spec, const char *fmt) { char tbuf[sizeof("xxxx") + 1]; const char *p; int ret; char *buf_start = buf; struct property *prop; bool has_mult, pass; struct printf_spec str_spec = spec; str_spec.field_width = -1; if (fmt[0] != 'F') return error_string(buf, end, "(%pO?)", spec); if (!IS_ENABLED(CONFIG_OF)) return error_string(buf, end, "(%pOF?)", spec); if (check_pointer(&buf, end, dn, spec)) return buf; /* simple case without anything any more format specifiers */ fmt++; if (fmt[0] == '\0' || strcspn(fmt,"fnpPFcC") > 0) fmt = "f"; for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) { int precision; if (pass) { if (buf < end) *buf = ':'; buf++; } switch (*fmt) { case 'f': /* full_name */ buf = fwnode_full_name_string(of_fwnode_handle(dn), buf, end); break; case 'n': /* name */ p = fwnode_get_name(of_fwnode_handle(dn)); precision = str_spec.precision; str_spec.precision = strchrnul(p, '@') - p; buf = string(buf, end, p, str_spec); str_spec.precision = precision; break; case 'p': /* phandle */ buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec); break; case 'P': /* path-spec */ p = fwnode_get_name(of_fwnode_handle(dn)); if (!p[1]) p = "/"; buf = string(buf, end, p, str_spec); break; case 'F': /* flags */ tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-'; tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-'; tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-'; tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-'; tbuf[4] = 0; buf = string_nocheck(buf, end, tbuf, str_spec); break; case 'c': /* major compatible string */ ret = of_property_read_string(dn, "compatible", &p); if (!ret) buf = string(buf, end, p, str_spec); break; case 'C': /* full compatible string */ has_mult = false; of_property_for_each_string(dn, "compatible", prop, p) { if (has_mult) buf = string_nocheck(buf, end, ",", str_spec); buf = string_nocheck(buf, end, "\"", str_spec); buf = string(buf, end, p, str_spec); buf = string_nocheck(buf, end, "\"", str_spec); has_mult = true; } break; default: break; } } return widen_string(buf, buf - buf_start, end, spec); } static noinline_for_stack char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode, struct printf_spec spec, const char *fmt) { struct printf_spec str_spec = spec; char *buf_start = buf; str_spec.field_width = -1; if (*fmt != 'w') return error_string(buf, end, "(%pf?)", spec); if (check_pointer(&buf, end, fwnode, spec)) return buf; fmt++; switch (*fmt) { case 'P': /* name */ buf = string(buf, end, fwnode_get_name(fwnode), str_spec); break; case 'f': /* full_name */ default: buf = fwnode_full_name_string(fwnode, buf, end); break; } return widen_string(buf, buf - buf_start, end, spec); } static noinline_for_stack char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr, struct printf_spec spec) { if (*fmt == 'r' && fmt[1] == 'a') return range_string(buf, end, ptr, spec, fmt); return resource_string(buf, end, ptr, spec, fmt); } int __init no_hash_pointers_enable(char *str) { if (no_hash_pointers) return 0; no_hash_pointers = true; pr_warn("**********************************************************\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("** **\n"); pr_warn("** This system shows unhashed kernel memory addresses **\n"); pr_warn("** via the console, logs, and other interfaces. This **\n"); pr_warn("** might reduce the security of your system. **\n"); pr_warn("** **\n"); pr_warn("** If you see this message and you are not debugging **\n"); pr_warn("** the kernel, report this immediately to your system **\n"); pr_warn("** administrator! **\n"); pr_warn("** **\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("**********************************************************\n"); return 0; } early_param("no_hash_pointers", no_hash_pointers_enable); /* Used for Rust formatting ('%pA'). */ char *rust_fmt_argument(char *buf, char *end, void *ptr); /* * Show a '%p' thing. A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format * specifiers. * * Please update scripts/checkpatch.pl when adding/removing conversion * characters. (Search for "check for vsprintf extension"). * * Right now we handle: * * - 'S' For symbolic direct pointers (or function descriptors) with offset * - 's' For symbolic direct pointers (or function descriptors) without offset * - '[Ss]R' as above with __builtin_extract_return_addr() translation * - 'S[R]b' as above with module build ID (for use in backtraces) * - '[Ff]' %pf and %pF were obsoleted and later removed in favor of * %ps and %pS. Be careful when re-using these specifiers. * - 'B' For backtraced symbolic direct pointers with offset * - 'Bb' as above with module build ID (for use in backtraces) * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] * - 'ra' For struct ranges, e.g., [range 0x0000000000000000 - 0x00000000000000ff] * - 'b[l]' For a bitmap, the number of bits is determined by the field * width which must be explicitly specified either as part of the * format string '%32b[l]' or through '%*b[l]', [l] selects * range-list format instead of hex format * - 'M' For a 6-byte MAC address, it prints the address in the * usual colon-separated hex notation * - 'm' For a 6-byte MAC address, it prints the hex address without colons * - 'MF' For a 6-byte MAC FDDI address, it prints the address * with a dash-separated hex notation * - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth) * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way * IPv4 uses dot-separated decimal without leading 0's (1.2.3.4) * IPv6 uses colon separated network-order 16 bit hex with leading 0's * [S][pfs] * Generic IPv4/IPv6 address (struct sockaddr *) that falls back to * [4] or [6] and is able to print port [p], flowinfo [f], scope [s] * - 'i' [46] for 'raw' IPv4/IPv6 addresses * IPv6 omits the colons (01020304...0f) * IPv4 uses dot-separated decimal with leading 0's (010.123.045.006) * [S][pfs] * Generic IPv4/IPv6 address (struct sockaddr *) that falls back to * [4] or [6] and is able to print port [p], flowinfo [f], scope [s] * - '[Ii][4S][hnbl]' IPv4 addresses in host, network, big or little endian order * - 'I[6S]c' for IPv6 addresses printed as specified by * https://tools.ietf.org/html/rfc5952 * - 'E[achnops]' For an escaped buffer, where rules are defined by combination * of the following flags (see string_escape_mem() for the * details): * a - ESCAPE_ANY * c - ESCAPE_SPECIAL * h - ESCAPE_HEX * n - ESCAPE_NULL * o - ESCAPE_OCTAL * p - ESCAPE_NP * s - ESCAPE_SPACE * By default ESCAPE_ANY_NP is used. * - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form * "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" * Options for %pU are: * b big endian lower case hex (default) * B big endian UPPER case hex * l little endian lower case hex * L little endian UPPER case hex * big endian output byte order is: * [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15] * little endian output byte order is: * [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15] * - 'V' For a struct va_format which contains a format string * and va_list *, * call vsnprintf(->format, *->va_list). * Implements a "recursive vsnprintf". * Do not use this feature without some mechanism to verify the * correctness of the format string and va_list arguments. * - 'K' For a kernel pointer that should be hidden from unprivileged users. * Use only for procfs, sysfs and similar files, not printk(); please * read the documentation (path below) first. * - 'NF' For a netdev_features_t * - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value. * - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with * a certain separator (' ' by default): * C colon * D dash * N no separator * The maximum supported length is 64 bytes of the input. Consider * to use print_hex_dump() for the larger input. * - 'a[pd]' For address types [p] phys_addr_t, [d] dma_addr_t and derivatives * (default assumed to be phys_addr_t, passed by reference) * - 'd[234]' For a dentry name (optionally 2-4 last components) * - 'D[234]' Same as 'd' but for a struct file * - 'g' For block_device name (gendisk + partition number) * - 't[RT][dt][r][s]' For time and date as represented by: * R struct rtc_time * T time64_t * - 'C' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock * - 'G' For flags to be printed as a collection of symbolic strings that would * construct the specific value. Supported flags given by option: * p page flags (see struct page) given as pointer to unsigned long * g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t * v vma flags (VM_*) given as pointer to unsigned long * - 'OF[fnpPcCF]' For a device tree object * Without any optional arguments prints the full_name * f device node full_name * n device node name * p device node phandle * P device node path spec (name + @unit) * F device node flags * c major compatible string * C full compatible string * - 'fw[fP]' For a firmware node (struct fwnode_handle) pointer * Without an option prints the full name of the node * f full name * P node name, including a possible unit address * - 'x' For printing the address unmodified. Equivalent to "%lx". * Please read the documentation (path below) before using! * - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of * bpf_trace_printk() where [ku] prefix specifies either kernel (k) * or user (u) memory to probe, and: * s a string, equivalent to "%s" on direct vsnprintf() use * * ** When making changes please also update: * Documentation/core-api/printk-formats.rst * * Note: The default behaviour (unadorned %p) is to hash the address, * rendering it useful as a unique identifier. * * There is also a '%pA' format specifier, but it is only intended to be used * from Rust code to format core::fmt::Arguments. Do *not* use it from C. * See rust/kernel/print.rs for details. */ static noinline_for_stack char *pointer(const char *fmt, char *buf, char *end, void *ptr, struct printf_spec spec) { switch (*fmt) { case 'S': case 's': ptr = dereference_symbol_descriptor(ptr); fallthrough; case 'B': return symbol_string(buf, end, ptr, spec, fmt); case 'R': case 'r': return resource_or_range(fmt, buf, end, ptr, spec); case 'h': return hex_string(buf, end, ptr, spec, fmt); case 'b': switch (fmt[1]) { case 'l': return bitmap_list_string(buf, end, ptr, spec, fmt); default: return bitmap_string(buf, end, ptr, spec, fmt); } case 'M': /* Colon separated: 00:01:02:03:04:05 */ case 'm': /* Contiguous: 000102030405 */ /* [mM]F (FDDI) */ /* [mM]R (Reverse order; Bluetooth) */ return mac_address_string(buf, end, ptr, spec, fmt); case 'I': /* Formatted IP supported * 4: 1.2.3.4 * 6: 0001:0203:...:0708 * 6c: 1::708 or 1::1.2.3.4 */ case 'i': /* Contiguous: * 4: 001.002.003.004 * 6: 000102...0f */ return ip_addr_string(buf, end, ptr, spec, fmt); case 'E': return escaped_string(buf, end, ptr, spec, fmt); case 'U': return uuid_string(buf, end, ptr, spec, fmt); case 'V': return va_format(buf, end, ptr, spec, fmt); case 'K': return restricted_pointer(buf, end, ptr, spec); case 'N': return netdev_bits(buf, end, ptr, spec, fmt); case '4': return fourcc_string(buf, end, ptr, spec, fmt); case 'a': return address_val(buf, end, ptr, spec, fmt); case 'd': return dentry_name(buf, end, ptr, spec, fmt); case 't': return time_and_date(buf, end, ptr, spec, fmt); case 'C': return clock(buf, end, ptr, spec, fmt); case 'D': return file_dentry_name(buf, end, ptr, spec, fmt); #ifdef CONFIG_BLOCK case 'g': return bdev_name(buf, end, ptr, spec, fmt); #endif case 'G': return flags_string(buf, end, ptr, spec, fmt); case 'O': return device_node_string(buf, end, ptr, spec, fmt + 1); case 'f': return fwnode_string(buf, end, ptr, spec, fmt + 1); case 'A': if (!IS_ENABLED(CONFIG_RUST)) { WARN_ONCE(1, "Please remove %%pA from non-Rust code\n"); return error_string(buf, end, "(%pA?)", spec); } return rust_fmt_argument(buf, end, ptr); case 'x': return pointer_string(buf, end, ptr, spec); case 'e': /* %pe with a non-ERR_PTR gets treated as plain %p */ if (!IS_ERR(ptr)) return default_pointer(buf, end, ptr, spec); return err_ptr(buf, end, ptr, spec); case 'u': case 'k': switch (fmt[1]) { case 's': return string(buf, end, ptr, spec); default: return error_string(buf, end, "(einval)", spec); } default: return default_pointer(buf, end, ptr, spec); } } /* * Helper function to decode printf style format. * Each call decode a token from the format and return the * number of characters read (or likely the delta where it wants * to go on the next call). * The decoded token is returned through the parameters * * 'h', 'l', or 'L' for integer fields * 'z' support added 23/7/1999 S.H. * 'z' changed to 'Z' --davidm 1/25/99 * 'Z' changed to 'z' --adobriyan 2017-01-25 * 't' added for ptrdiff_t * * @fmt: the format string * @type of the token returned * @flags: various flags such as +, -, # tokens.. * @field_width: overwritten width * @base: base of the number (octal, hex, ...) * @precision: precision of a number * @qualifier: qualifier of a number (long, size_t, ...) */ static noinline_for_stack int format_decode(const char *fmt, struct printf_spec *spec) { const char *start = fmt; char qualifier; /* we finished early by reading the field width */ if (spec->type == FORMAT_TYPE_WIDTH) { if (spec->field_width < 0) { spec->field_width = -spec->field_width; spec->flags |= LEFT; } spec->type = FORMAT_TYPE_NONE; goto precision; } /* we finished early by reading the precision */ if (spec->type == FORMAT_TYPE_PRECISION) { if (spec->precision < 0) spec->precision = 0; spec->type = FORMAT_TYPE_NONE; goto qualifier; } /* By default */ spec->type = FORMAT_TYPE_NONE; for (; *fmt ; ++fmt) { if (*fmt == '%') break; } /* Return the current non-format string */ if (fmt != start || !*fmt) return fmt - start; /* Process flags */ spec->flags = 0; while (1) { /* this also skips first '%' */ bool found = true; ++fmt; switch (*fmt) { case '-': spec->flags |= LEFT; break; case '+': spec->flags |= PLUS; break; case ' ': spec->flags |= SPACE; break; case '#': spec->flags |= SPECIAL; break; case '0': spec->flags |= ZEROPAD; break; default: found = false; } if (!found) break; } /* get field width */ spec->field_width = -1; if (isdigit(*fmt)) spec->field_width = skip_atoi(&fmt); else if (*fmt == '*') { /* it's the next argument */ spec->type = FORMAT_TYPE_WIDTH; return ++fmt - start; } precision: /* get the precision */ spec->precision = -1; if (*fmt == '.') { ++fmt; if (isdigit(*fmt)) { spec->precision = skip_atoi(&fmt); if (spec->precision < 0) spec->precision = 0; } else if (*fmt == '*') { /* it's the next argument */ spec->type = FORMAT_TYPE_PRECISION; return ++fmt - start; } } qualifier: /* get the conversion qualifier */ qualifier = 0; if (*fmt == 'h' || _tolower(*fmt) == 'l' || *fmt == 'z' || *fmt == 't') { qualifier = *fmt++; if (unlikely(qualifier == *fmt)) { if (qualifier == 'l') { qualifier = 'L'; ++fmt; } else if (qualifier == 'h') { qualifier = 'H'; ++fmt; } } } /* default base */ spec->base = 10; switch (*fmt) { case 'c': spec->type = FORMAT_TYPE_CHAR; return ++fmt - start; case 's': spec->type = FORMAT_TYPE_STR; return ++fmt - start; case 'p': spec->type = FORMAT_TYPE_PTR; return ++fmt - start; case '%': spec->type = FORMAT_TYPE_PERCENT_CHAR; return ++fmt - start; /* integer number formats - set up the flags and "break" */ case 'o': spec->base = 8; break; case 'x': spec->flags |= SMALL; fallthrough; case 'X': spec->base = 16; break; case 'd': case 'i': spec->flags |= SIGN; break; case 'u': break; case 'n': /* * Since %n poses a greater security risk than * utility, treat it as any other invalid or * unsupported format specifier. */ fallthrough; default: WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt); spec->type = FORMAT_TYPE_INVALID; return fmt - start; } if (qualifier == 'L') spec->type = FORMAT_TYPE_LONG_LONG; else if (qualifier == 'l') { BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG); spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN); } else if (qualifier == 'z') { spec->type = FORMAT_TYPE_SIZE_T; } else if (qualifier == 't') { spec->type = FORMAT_TYPE_PTRDIFF; } else if (qualifier == 'H') { BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE); spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN); } else if (qualifier == 'h') { BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT); spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN); } else { BUILD_BUG_ON(FORMAT_TYPE_UINT + SIGN != FORMAT_TYPE_INT); spec->type = FORMAT_TYPE_UINT + (spec->flags & SIGN); } return ++fmt - start; } static void set_field_width(struct printf_spec *spec, int width) { spec->field_width = width; if (WARN_ONCE(spec->field_width != width, "field width %d too large", width)) { spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX); } } static void set_precision(struct printf_spec *spec, int prec) { spec->precision = prec; if (WARN_ONCE(spec->precision != prec, "precision %d too large", prec)) { spec->precision = clamp(prec, 0, PRECISION_MAX); } } /** * vsnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @args: Arguments for the format string * * This function generally follows C99 vsnprintf, but has some * extensions and a few limitations: * * - ``%n`` is unsupported * - ``%p*`` is handled by pointer() * * See pointer() or Documentation/core-api/printk-formats.rst for more * extensive description. * * **Please update the documentation in both places when making changes** * * The return value is the number of characters which would * be generated for the given input, excluding the trailing * '\0', as per ISO C99. If you want to have the exact * number of characters written into @buf as return value * (not including the trailing '\0'), use vscnprintf(). If the * return is greater than or equal to @size, the resulting * string is truncated. * * If you're not already dealing with a va_list consider using snprintf(). */ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) { unsigned long long num; char *str, *end; struct printf_spec spec = {0}; /* Reject out-of-range values early. Large positive sizes are used for unknown buffer sizes. */ if (WARN_ON_ONCE(size > INT_MAX)) return 0; str = buf; end = buf + size; /* Make sure end is always >= buf */ if (end < buf) { end = ((void *)-1); size = end - buf; } while (*fmt) { const char *old_fmt = fmt; int read = format_decode(fmt, &spec); fmt += read; switch (spec.type) { case FORMAT_TYPE_NONE: { int copy = read; if (str < end) { if (copy > end - str) copy = end - str; memcpy(str, old_fmt, copy); } str += read; break; } case FORMAT_TYPE_WIDTH: set_field_width(&spec, va_arg(args, int)); break; case FORMAT_TYPE_PRECISION: set_precision(&spec, va_arg(args, int)); break; case FORMAT_TYPE_CHAR: { char c; if (!(spec.flags & LEFT)) { while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } } c = (unsigned char) va_arg(args, int); if (str < end) *str = c; ++str; while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } break; } case FORMAT_TYPE_STR: str = string(str, end, va_arg(args, char *), spec); break; case FORMAT_TYPE_PTR: str = pointer(fmt, str, end, va_arg(args, void *), spec); while (isalnum(*fmt)) fmt++; break; case FORMAT_TYPE_PERCENT_CHAR: if (str < end) *str = '%'; ++str; break; case FORMAT_TYPE_INVALID: /* * Presumably the arguments passed gcc's type * checking, but there is no safe or sane way * for us to continue parsing the format and * fetching from the va_list; the remaining * specifiers and arguments would be out of * sync. */ goto out; default: switch (spec.type) { case FORMAT_TYPE_LONG_LONG: num = va_arg(args, long long); break; case FORMAT_TYPE_ULONG: num = va_arg(args, unsigned long); break; case FORMAT_TYPE_LONG: num = va_arg(args, long); break; case FORMAT_TYPE_SIZE_T: if (spec.flags & SIGN) num = va_arg(args, ssize_t); else num = va_arg(args, size_t); break; case FORMAT_TYPE_PTRDIFF: num = va_arg(args, ptrdiff_t); break; case FORMAT_TYPE_UBYTE: num = (unsigned char) va_arg(args, int); break; case FORMAT_TYPE_BYTE: num = (signed char) va_arg(args, int); break; case FORMAT_TYPE_USHORT: num = (unsigned short) va_arg(args, int); break; case FORMAT_TYPE_SHORT: num = (short) va_arg(args, int); break; case FORMAT_TYPE_INT: num = (int) va_arg(args, int); break; default: num = va_arg(args, unsigned int); } str = number(str, end, num, spec); } } out: if (size > 0) { if (str < end) *str = '\0'; else end[-1] = '\0'; } /* the trailing null byte doesn't count towards the total */ return str-buf; } EXPORT_SYMBOL(vsnprintf); /** * vscnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @args: Arguments for the format string * * The return value is the number of characters which have been written into * the @buf not including the trailing '\0'. If @size is == 0 the function * returns 0. * * If you're not already dealing with a va_list consider using scnprintf(). * * See the vsnprintf() documentation for format string extensions over C99. */ int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) { int i; if (unlikely(!size)) return 0; i = vsnprintf(buf, size, fmt, args); if (likely(i < size)) return i; return size - 1; } EXPORT_SYMBOL(vscnprintf); /** * snprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @...: Arguments for the format string * * The return value is the number of characters which would be * generated for the given input, excluding the trailing null, * as per ISO C99. If the return is greater than or equal to * @size, the resulting string is truncated. * * See the vsnprintf() documentation for format string extensions over C99. */ int snprintf(char *buf, size_t size, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsnprintf(buf, size, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(snprintf); /** * scnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @...: Arguments for the format string * * The return value is the number of characters written into @buf not including * the trailing '\0'. If @size is == 0 the function returns 0. */ int scnprintf(char *buf, size_t size, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vscnprintf(buf, size, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(scnprintf); /** * vsprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @fmt: The format string to use * @args: Arguments for the format string * * The function returns the number of characters written * into @buf. Use vsnprintf() or vscnprintf() in order to avoid * buffer overflows. * * If you're not already dealing with a va_list consider using sprintf(). * * See the vsnprintf() documentation for format string extensions over C99. */ int vsprintf(char *buf, const char *fmt, va_list args) { return vsnprintf(buf, INT_MAX, fmt, args); } EXPORT_SYMBOL(vsprintf); /** * sprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @fmt: The format string to use * @...: Arguments for the format string * * The function returns the number of characters written * into @buf. Use snprintf() or scnprintf() in order to avoid * buffer overflows. * * See the vsnprintf() documentation for format string extensions over C99. */ int sprintf(char *buf, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsnprintf(buf, INT_MAX, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(sprintf); #ifdef CONFIG_BINARY_PRINTF /* * bprintf service: * vbin_printf() - VA arguments to binary data * bstr_printf() - Binary data to text string */ /** * vbin_printf - Parse a format string and place args' binary value in a buffer * @bin_buf: The buffer to place args' binary value * @size: The size of the buffer(by words(32bits), not characters) * @fmt: The format string to use * @args: Arguments for the format string * * The format follows C99 vsnprintf, except %n is ignored, and its argument * is skipped. * * The return value is the number of words(32bits) which would be generated for * the given input. * * NOTE: * If the return value is greater than @size, the resulting bin_buf is NOT * valid for bstr_printf(). */ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) { struct printf_spec spec = {0}; char *str, *end; int width; str = (char *)bin_buf; end = (char *)(bin_buf + size); #define save_arg(type) \ ({ \ unsigned long long value; \ if (sizeof(type) == 8) { \ unsigned long long val8; \ str = PTR_ALIGN(str, sizeof(u32)); \ val8 = va_arg(args, unsigned long long); \ if (str + sizeof(type) <= end) { \ *(u32 *)str = *(u32 *)&val8; \ *(u32 *)(str + 4) = *((u32 *)&val8 + 1); \ } \ value = val8; \ } else { \ unsigned int val4; \ str = PTR_ALIGN(str, sizeof(type)); \ val4 = va_arg(args, int); \ if (str + sizeof(type) <= end) \ *(typeof(type) *)str = (type)(long)val4; \ value = (unsigned long long)val4; \ } \ str += sizeof(type); \ value; \ }) while (*fmt) { int read = format_decode(fmt, &spec); fmt += read; switch (spec.type) { case FORMAT_TYPE_NONE: case FORMAT_TYPE_PERCENT_CHAR: break; case FORMAT_TYPE_INVALID: goto out; case FORMAT_TYPE_WIDTH: case FORMAT_TYPE_PRECISION: width = (int)save_arg(int); /* Pointers may require the width */ if (*fmt == 'p') set_field_width(&spec, width); break; case FORMAT_TYPE_CHAR: save_arg(char); break; case FORMAT_TYPE_STR: { const char *save_str = va_arg(args, char *); const char *err_msg; size_t len; err_msg = check_pointer_msg(save_str); if (err_msg) save_str = err_msg; len = strlen(save_str) + 1; if (str + len < end) memcpy(str, save_str, len); str += len; break; } case FORMAT_TYPE_PTR: /* Dereferenced pointers must be done now */ switch (*fmt) { /* Dereference of functions is still OK */ case 'S': case 's': case 'x': case 'K': case 'e': save_arg(void *); break; default: if (!isalnum(*fmt)) { save_arg(void *); break; } str = pointer(fmt, str, end, va_arg(args, void *), spec); if (str + 1 < end) *str++ = '\0'; else end[-1] = '\0'; /* Must be nul terminated */ } /* skip all alphanumeric pointer suffixes */ while (isalnum(*fmt)) fmt++; break; default: switch (spec.type) { case FORMAT_TYPE_LONG_LONG: save_arg(long long); break; case FORMAT_TYPE_ULONG: case FORMAT_TYPE_LONG: save_arg(unsigned long); break; case FORMAT_TYPE_SIZE_T: save_arg(size_t); break; case FORMAT_TYPE_PTRDIFF: save_arg(ptrdiff_t); break; case FORMAT_TYPE_UBYTE: case FORMAT_TYPE_BYTE: save_arg(char); break; case FORMAT_TYPE_USHORT: case FORMAT_TYPE_SHORT: save_arg(short); break; default: save_arg(int); } } } out: return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; #undef save_arg } EXPORT_SYMBOL_GPL(vbin_printf); /** * bstr_printf - Format a string from binary arguments and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @bin_buf: Binary arguments for the format string * * This function like C99 vsnprintf, but the difference is that vsnprintf gets * arguments from stack, and bstr_printf gets arguments from @bin_buf which is * a binary buffer that generated by vbin_printf. * * The format follows C99 vsnprintf, but has some extensions: * see vsnprintf comment for details. * * The return value is the number of characters which would * be generated for the given input, excluding the trailing * '\0', as per ISO C99. If you want to have the exact * number of characters written into @buf as return value * (not including the trailing '\0'), use vscnprintf(). If the * return is greater than or equal to @size, the resulting * string is truncated. */ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) { struct printf_spec spec = {0}; char *str, *end; const char *args = (const char *)bin_buf; if (WARN_ON_ONCE(size > INT_MAX)) return 0; str = buf; end = buf + size; #define get_arg(type) \ ({ \ typeof(type) value; \ if (sizeof(type) == 8) { \ args = PTR_ALIGN(args, sizeof(u32)); \ *(u32 *)&value = *(u32 *)args; \ *((u32 *)&value + 1) = *(u32 *)(args + 4); \ } else { \ args = PTR_ALIGN(args, sizeof(type)); \ value = *(typeof(type) *)args; \ } \ args += sizeof(type); \ value; \ }) /* Make sure end is always >= buf */ if (end < buf) { end = ((void *)-1); size = end - buf; } while (*fmt) { const char *old_fmt = fmt; int read = format_decode(fmt, &spec); fmt += read; switch (spec.type) { case FORMAT_TYPE_NONE: { int copy = read; if (str < end) { if (copy > end - str) copy = end - str; memcpy(str, old_fmt, copy); } str += read; break; } case FORMAT_TYPE_WIDTH: set_field_width(&spec, get_arg(int)); break; case FORMAT_TYPE_PRECISION: set_precision(&spec, get_arg(int)); break; case FORMAT_TYPE_CHAR: { char c; if (!(spec.flags & LEFT)) { while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } } c = (unsigned char) get_arg(char); if (str < end) *str = c; ++str; while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } break; } case FORMAT_TYPE_STR: { const char *str_arg = args; args += strlen(str_arg) + 1; str = string(str, end, (char *)str_arg, spec); break; } case FORMAT_TYPE_PTR: { bool process = false; int copy, len; /* Non function dereferences were already done */ switch (*fmt) { case 'S': case 's': case 'x': case 'K': case 'e': process = true; break; default: if (!isalnum(*fmt)) { process = true; break; } /* Pointer dereference was already processed */ if (str < end) { len = copy = strlen(args); if (copy > end - str) copy = end - str; memcpy(str, args, copy); str += len; args += len + 1; } } if (process) str = pointer(fmt, str, end, get_arg(void *), spec); while (isalnum(*fmt)) fmt++; break; } case FORMAT_TYPE_PERCENT_CHAR: if (str < end) *str = '%'; ++str; break; case FORMAT_TYPE_INVALID: goto out; default: { unsigned long long num; switch (spec.type) { case FORMAT_TYPE_LONG_LONG: num = get_arg(long long); break; case FORMAT_TYPE_ULONG: case FORMAT_TYPE_LONG: num = get_arg(unsigned long); break; case FORMAT_TYPE_SIZE_T: num = get_arg(size_t); break; case FORMAT_TYPE_PTRDIFF: num = get_arg(ptrdiff_t); break; case FORMAT_TYPE_UBYTE: num = get_arg(unsigned char); break; case FORMAT_TYPE_BYTE: num = get_arg(signed char); break; case FORMAT_TYPE_USHORT: num = get_arg(unsigned short); break; case FORMAT_TYPE_SHORT: num = get_arg(short); break; case FORMAT_TYPE_UINT: num = get_arg(unsigned int); break; default: num = get_arg(int); } str = number(str, end, num, spec); } /* default: */ } /* switch(spec.type) */ } /* while(*fmt) */ out: if (size > 0) { if (str < end) *str = '\0'; else end[-1] = '\0'; } #undef get_arg /* the trailing null byte doesn't count towards the total */ return str - buf; } EXPORT_SYMBOL_GPL(bstr_printf); #endif /* CONFIG_BINARY_PRINTF */ /** * vsscanf - Unformat a buffer into a list of arguments * @buf: input buffer * @fmt: format of buffer * @args: arguments */ int vsscanf(const char *buf, const char *fmt, va_list args) { const char *str = buf; char *next; char digit; int num = 0; u8 qualifier; unsigned int base; union { long long s; unsigned long long u; } val; s16 field_width; bool is_sign; while (*fmt) { /* skip any white space in format */ /* white space in format matches any amount of * white space, including none, in the input. */ if (isspace(*fmt)) { fmt = skip_spaces(++fmt); str = skip_spaces(str); } /* anything that is not a conversion must match exactly */ if (*fmt != '%' && *fmt) { if (*fmt++ != *str++) break; continue; } if (!*fmt) break; ++fmt; /* skip this conversion. * advance both strings to next white space */ if (*fmt == '*') { if (!*str) break; while (!isspace(*fmt) && *fmt != '%' && *fmt) { /* '%*[' not yet supported, invalid format */ if (*fmt == '[') return num; fmt++; } while (!isspace(*str) && *str) str++; continue; } /* get field width */ field_width = -1; if (isdigit(*fmt)) { field_width = skip_atoi(&fmt); if (field_width <= 0) break; } /* get conversion qualifier */ qualifier = -1; if (*fmt == 'h' || _tolower(*fmt) == 'l' || *fmt == 'z') { qualifier = *fmt++; if (unlikely(qualifier == *fmt)) { if (qualifier == 'h') { qualifier = 'H'; fmt++; } else if (qualifier == 'l') { qualifier = 'L'; fmt++; } } } if (!*fmt) break; if (*fmt == 'n') { /* return number of characters read so far */ *va_arg(args, int *) = str - buf; ++fmt; continue; } if (!*str) break; base = 10; is_sign = false; switch (*fmt++) { case 'c': { char *s = (char *)va_arg(args, char*); if (field_width == -1) field_width = 1; do { *s++ = *str++; } while (--field_width > 0 && *str); num++; } continue; case 's': { char *s = (char *)va_arg(args, char *); if (field_width == -1) field_width = SHRT_MAX; /* first, skip leading white space in buffer */ str = skip_spaces(str); /* now copy until next white space */ while (*str && !isspace(*str) && field_width--) *s++ = *str++; *s = '\0'; num++; } continue; /* * Warning: This implementation of the '[' conversion specifier * deviates from its glibc counterpart in the following ways: * (1) It does NOT support ranges i.e. '-' is NOT a special * character * (2) It cannot match the closing bracket ']' itself * (3) A field width is required * (4) '%*[' (discard matching input) is currently not supported * * Example usage: * ret = sscanf("00:0a:95","%2[^:]:%2[^:]:%2[^:]", * buf1, buf2, buf3); * if (ret < 3) * // etc.. */ case '[': { char *s = (char *)va_arg(args, char *); DECLARE_BITMAP(set, 256) = {0}; unsigned int len = 0; bool negate = (*fmt == '^'); /* field width is required */ if (field_width == -1) return num; if (negate) ++fmt; for ( ; *fmt && *fmt != ']'; ++fmt, ++len) __set_bit((u8)*fmt, set); /* no ']' or no character set found */ if (!*fmt || !len) return num; ++fmt; if (negate) { bitmap_complement(set, set, 256); /* exclude null '\0' byte */ __clear_bit(0, set); } /* match must be non-empty */ if (!test_bit((u8)*str, set)) return num; while (test_bit((u8)*str, set) && field_width--) *s++ = *str++; *s = '\0'; ++num; } continue; case 'o': base = 8; break; case 'x': case 'X': base = 16; break; case 'i': base = 0; fallthrough; case 'd': is_sign = true; fallthrough; case 'u': break; case '%': /* looking for '%' in str */ if (*str++ != '%') return num; continue; default: /* invalid format; stop here */ return num; } /* have some sort of integer conversion. * first, skip white space in buffer. */ str = skip_spaces(str); digit = *str; if (is_sign && digit == '-') { if (field_width == 1) break; digit = *(str + 1); } if (!digit || (base == 16 && !isxdigit(digit)) || (base == 10 && !isdigit(digit)) || (base == 8 && !isodigit(digit)) || (base == 0 && !isdigit(digit))) break; if (is_sign) val.s = simple_strntoll(str, &next, base, field_width >= 0 ? field_width : INT_MAX); else val.u = simple_strntoull(str, &next, base, field_width >= 0 ? field_width : INT_MAX); switch (qualifier) { case 'H': /* that's 'hh' in format */ if (is_sign) *va_arg(args, signed char *) = val.s; else *va_arg(args, unsigned char *) = val.u; break; case 'h': if (is_sign) *va_arg(args, short *) = val.s; else *va_arg(args, unsigned short *) = val.u; break; case 'l': if (is_sign) *va_arg(args, long *) = val.s; else *va_arg(args, unsigned long *) = val.u; break; case 'L': if (is_sign) *va_arg(args, long long *) = val.s; else *va_arg(args, unsigned long long *) = val.u; break; case 'z': *va_arg(args, size_t *) = val.u; break; default: if (is_sign) *va_arg(args, int *) = val.s; else *va_arg(args, unsigned int *) = val.u; break; } num++; if (!next) break; str = next; } return num; } EXPORT_SYMBOL(vsscanf); /** * sscanf - Unformat a buffer into a list of arguments * @buf: input buffer * @fmt: formatting of buffer * @...: resulting arguments */ int sscanf(const char *buf, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsscanf(buf, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(sscanf);
313 8 7 7 1 7 4 6 6 6 7 21 21 327 32 327 327 32 48 43 71 71 32 32 17 32 32 32 32 71 71 71 7 66 45 27 45 45 45 45 71 71 66 71 39 27 71 27 27 27 71 71 71 66 40 28 28 28 28 26 244 278 278 278 17 17 17 13 16 2 8 8 8 8 2 6 17 17 32 32 32 32 32 32 258 258 234 256 258 23 234 24 24 24 233 257 257 206 207 207 207 114 114 126 125 114 28 28 28 28 28 28 257 314 3 3 3 3 197 32 2 228 32 197 19 19 16 7 7 19 7 6 4 4 8 8 7 313 313 313 313 313 257 108 313 18 91 91 257 310 313 2 4324 4334 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swapfile.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie */ #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/shmem_fs.h> #include <linux/blk-cgroup.h> #include <linux/random.h> #include <linux/writeback.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/backing-dev.h> #include <linux/mutex.h> #include <linux/capability.h> #include <linux/syscalls.h> #include <linux/memcontrol.h> #include <linux/poll.h> #include <linux/oom.h> #include <linux/swapfile.h> #include <linux/export.h> #include <linux/swap_slots.h> #include <linux/sort.h> #include <linux/completion.h> #include <linux/suspend.h> #include <linux/zswap.h> #include <linux/plist.h> #include <asm/tlbflush.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> #include "internal.h" #include "swap.h" static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); static struct swap_cluster_info *lock_cluster_or_swap_info( struct swap_info_struct *si, unsigned long offset); static void unlock_cluster_or_swap_info(struct swap_info_struct *si, struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; /* * Some modules use swappable objects and may try to swap them out under * memory pressure (via the shrinker). Before doing so, they may wish to * check to see if any swap space is available. */ EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority = -1; unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; #endif /* CONFIG_MIGRATION */ static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; /* * all active swap_info_structs * protected with swap_lock, and ordered by priority. */ static PLIST_HEAD(swap_active_head); /* * all available (active, not full) swap_info_structs * protected with swap_avail_lock, ordered by priority. * This is used by folio_alloc_swap() instead of swap_active_head * because swap_active_head includes all swap_info_structs, * but folio_alloc_swap() doesn't need to look at full ones. * This uses its own lock instead of swap_lock because when a * swap_info_struct changes between not-full/full, it needs to * add/remove itself to/from this list, but the swap_info_struct->lock * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); static struct swap_info_struct *swap_type_to_swap_info(int type) { if (type >= MAX_SWAPFILES) return NULL; return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } /* Reclaim the swap entry anyway if possible */ #define TTRS_ANYWAY 0x1 /* * Reclaim the swap entry if there are no more mappings of the * corresponding page */ #define TTRS_UNMAPPED 0x2 /* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 /* Reclaim directly, bypass the slot cache and don't touch device lock */ #define TTRS_DIRECT 0x8 static bool swap_is_has_cache(struct swap_info_struct *si, unsigned long offset, int nr_pages) { unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; do { VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); if (*map != SWAP_HAS_CACHE) return false; } while (++map < map_end); return true; } static bool swap_is_last_map(struct swap_info_struct *si, unsigned long offset, int nr_pages, bool *has_cache) { unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; unsigned char count = *map; if (swap_count(count) != 1) return false; while (++map < map_end) { if (*map != count) return false; } *has_cache = !!(count & SWAP_HAS_CACHE); return true; } /* * returns number of pages in the folio that backs the swap entry. If positive, * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no * folio was associated with the swap entry. */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); struct address_space *address_space = swap_address_space(entry); struct swap_cluster_info *ci; struct folio *folio; int ret, nr_pages; bool need_reclaim; folio = filemap_get_folio(address_space, swap_cache_index(entry)); if (IS_ERR(folio)) return 0; nr_pages = folio_nr_pages(folio); ret = -nr_pages; /* * When this function is called from scan_swap_map_slots() and it's * called by vmscan.c at reclaiming folios. So we hold a folio lock * here. We have to use trylock for avoiding deadlock. This is a special * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ if (!folio_trylock(folio)) goto out; /* offset could point to the middle of a large folio */ entry = folio->swap; offset = swp_offset(entry); need_reclaim = ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); if (!need_reclaim || !folio_swapcache_freeable(folio)) goto out_unlock; /* * It's safe to delete the folio from swap cache only if the folio's * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. */ ci = lock_cluster_or_swap_info(si, offset); need_reclaim = swap_is_has_cache(si, offset, nr_pages); unlock_cluster_or_swap_info(si, ci); if (!need_reclaim) goto out_unlock; if (!(flags & TTRS_DIRECT)) { /* Free through slot cache */ delete_from_swap_cache(folio); folio_set_dirty(folio); ret = nr_pages; goto out_unlock; } xa_lock_irq(&address_space->i_pages); __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); folio_ref_sub(folio, nr_pages); folio_set_dirty(folio); spin_lock(&si->lock); /* Only sinple page folio can be backed by zswap */ if (nr_pages == 1) zswap_invalidate(entry); swap_entry_range_free(si, entry, nr_pages); spin_unlock(&si->lock); ret = nr_pages; out_unlock: folio_unlock(folio); out: folio_put(folio); return ret; } static inline struct swap_extent *first_se(struct swap_info_struct *sis) { struct rb_node *rb = rb_first(&sis->swap_extent_root); return rb_entry(rb, struct swap_extent, rb_node); } static inline struct swap_extent *next_se(struct swap_extent *se) { struct rb_node *rb = rb_next(&se->rb_node); return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; } /* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. */ static int discard_swap(struct swap_info_struct *si) { struct swap_extent *se; sector_t start_block; sector_t nr_blocks; int err = 0; /* Do not discard the swap header page! */ se = first_se(si); start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL); if (err) return err; cond_resched(); } for (se = next_se(se); se; se = next_se(se)) { start_block = se->start_block << (PAGE_SHIFT - 9); nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL); if (err) break; cond_resched(); } return err; /* That will often be -EOPNOTSUPP */ } static struct swap_extent * offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) { struct swap_extent *se; struct rb_node *rb; rb = sis->swap_extent_root.rb_node; while (rb) { se = rb_entry(rb, struct swap_extent, rb_node); if (offset < se->start_page) rb = rb->rb_left; else if (offset >= se->start_page + se->nr_pages) rb = rb->rb_right; else return se; } /* It *must* be present */ BUG(); } sector_t swap_folio_sector(struct folio *folio) { struct swap_info_struct *sis = swp_swap_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; offset = swp_offset(folio->swap); se = offset_to_swap_extent(sis, offset); sector = se->start_block + (offset - se->start_page); return sector << (PAGE_SHIFT - 9); } /* * swap allocation tell device that a cluster of swap can now be discarded, * to allow the swap device to optimize its wear-levelling. */ static void discard_swap_cluster(struct swap_info_struct *si, pgoff_t start_page, pgoff_t nr_pages) { struct swap_extent *se = offset_to_swap_extent(si, start_page); while (nr_pages) { pgoff_t offset = start_page - se->start_page; sector_t start_block = se->start_block + offset; sector_t nr_blocks = se->nr_pages - offset; if (nr_blocks > nr_pages) nr_blocks = nr_pages; start_page += nr_blocks; nr_pages -= nr_blocks; start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_NOIO)) break; se = next_se(se); } } #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR #define swap_entry_order(order) (order) #else #define SWAPFILE_CLUSTER 256 /* * Define swap_entry_order() as constant to let compiler to optimize * out some code if !CONFIG_THP_SWAP */ #define swap_entry_order(order) 0 #endif #define LATENCY_LIMIT 256 static inline bool cluster_is_free(struct swap_cluster_info *info) { return info->flags & CLUSTER_FLAG_FREE; } static inline unsigned int cluster_index(struct swap_info_struct *si, struct swap_cluster_info *ci) { return ci - si->cluster_info; } static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { return cluster_index(si, ci) * SWAPFILE_CLUSTER; } static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { struct swap_cluster_info *ci; ci = si->cluster_info; if (ci) { ci += offset / SWAPFILE_CLUSTER; spin_lock(&ci->lock); } return ci; } static inline void unlock_cluster(struct swap_cluster_info *ci) { if (ci) spin_unlock(&ci->lock); } /* * Determine the locking method in use for this device. Return * swap_cluster_info if SSD-style cluster-based locking is in place. */ static inline struct swap_cluster_info *lock_cluster_or_swap_info( struct swap_info_struct *si, unsigned long offset) { struct swap_cluster_info *ci; /* Try to use fine-grained SSD-style locking if available: */ ci = lock_cluster(si, offset); /* Otherwise, fall back to traditional, coarse locking: */ if (!ci) spin_lock(&si->lock); return ci; } static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, struct swap_cluster_info *ci) { if (ci) unlock_cluster(ci); else spin_unlock(&si->lock); } /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, struct swap_cluster_info *ci) { unsigned int idx = cluster_index(si, ci); /* * If scan_swap_map_slots() can't find a free cluster, it will check * si->swap_map directly. To make sure the discarding cluster isn't * taken by scan_swap_map_slots(), mark the swap entries bad (occupied). * It will be cleared after discard */ memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); list_move_tail(&ci->list, &si->discard_clusters); ci->flags = 0; schedule_work(&si->discard_work); } static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { lockdep_assert_held(&si->lock); lockdep_assert_held(&ci->lock); if (ci->flags) list_move_tail(&ci->list, &si->free_clusters); else list_add_tail(&ci->list, &si->free_clusters); ci->flags = CLUSTER_FLAG_FREE; ci->order = 0; } /* * Doing discard actually. After a cluster discard is finished, the cluster * will be added to free cluster list. caller should hold si->lock. */ static void swap_do_scheduled_discard(struct swap_info_struct *si) { struct swap_cluster_info *ci; unsigned int idx; while (!list_empty(&si->discard_clusters)) { ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); list_del(&ci->list); idx = cluster_index(si, ci); spin_unlock(&si->lock); discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER); spin_lock(&si->lock); spin_lock(&ci->lock); __free_cluster(si, ci); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); spin_unlock(&ci->lock); } } static void swap_discard_work(struct work_struct *work) { struct swap_info_struct *si; si = container_of(work, struct swap_info_struct, discard_work); spin_lock(&si->lock); swap_do_scheduled_discard(si); spin_unlock(&si->lock); } static void swap_users_ref_free(struct percpu_ref *ref) { struct swap_info_struct *si; si = container_of(ref, struct swap_info_struct, users); complete(&si->comp); } static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { VM_BUG_ON(ci->count != 0); lockdep_assert_held(&si->lock); lockdep_assert_held(&ci->lock); if (ci->flags & CLUSTER_FLAG_FRAG) si->frag_cluster_nr[ci->order]--; /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed * after discard. */ if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == (SWP_WRITEOK | SWP_PAGE_DISCARD)) { swap_cluster_schedule_discard(si, ci); return; } __free_cluster(si, ci); } /* * The cluster corresponding to page_nr will be used. The cluster will not be * added to free cluster list and its usage counter will be increased by 1. * Only used for initialization. */ static void inc_cluster_info_page(struct swap_info_struct *si, struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; if (!cluster_info) return; ci = cluster_info + idx; ci->count++; VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); VM_BUG_ON(ci->flags); } /* * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, * which means no page in the cluster is in use, we can optionally discard * the cluster and add it to free cluster list. */ static void dec_cluster_info_page(struct swap_info_struct *si, struct swap_cluster_info *ci, int nr_pages) { if (!si->cluster_info) return; VM_BUG_ON(ci->count < nr_pages); VM_BUG_ON(cluster_is_free(ci)); lockdep_assert_held(&si->lock); lockdep_assert_held(&ci->lock); ci->count -= nr_pages; if (!ci->count) { free_cluster(si, ci); return; } if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); if (ci->flags & CLUSTER_FLAG_FRAG) si->frag_cluster_nr[ci->order]--; list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]); ci->flags = CLUSTER_FLAG_NONFULL; } } static bool cluster_reclaim_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long start, unsigned long end) { unsigned char *map = si->swap_map; unsigned long offset; spin_unlock(&ci->lock); spin_unlock(&si->lock); for (offset = start; offset < end; offset++) { switch (READ_ONCE(map[offset])) { case 0: continue; case SWAP_HAS_CACHE: if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) continue; goto out; default: goto out; } } out: spin_lock(&si->lock); spin_lock(&ci->lock); /* * Recheck the range no matter reclaim succeeded or not, the slot * could have been be freed while we are not holding the lock. */ for (offset = start; offset < end; offset++) if (READ_ONCE(map[offset])) return false; return true; } static bool cluster_scan_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long start, unsigned int nr_pages) { unsigned long offset, end = start + nr_pages; unsigned char *map = si->swap_map; bool need_reclaim = false; for (offset = start; offset < end; offset++) { switch (READ_ONCE(map[offset])) { case 0: continue; case SWAP_HAS_CACHE: if (!vm_swap_full()) return false; need_reclaim = true; continue; default: return false; } } if (need_reclaim) return cluster_reclaim_range(si, ci, start, end); return true; } static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int start, unsigned char usage, unsigned int order) { unsigned int nr_pages = 1 << order; if (!(si->flags & SWP_WRITEOK)) return false; if (cluster_is_free(ci)) { if (nr_pages < SWAPFILE_CLUSTER) { list_move_tail(&ci->list, &si->nonfull_clusters[order]); ci->flags = CLUSTER_FLAG_NONFULL; } ci->order = order; } memset(si->swap_map + start, usage, nr_pages); swap_range_alloc(si, start, nr_pages); ci->count += nr_pages; if (ci->count == SWAPFILE_CLUSTER) { VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); if (ci->flags & CLUSTER_FLAG_FRAG) si->frag_cluster_nr[ci->order]--; list_move_tail(&ci->list, &si->full_clusters); ci->flags = CLUSTER_FLAG_FULL; } return true; } static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, unsigned int *foundp, unsigned int order, unsigned char usage) { unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); unsigned int nr_pages = 1 << order; struct swap_cluster_info *ci; if (end < nr_pages) return SWAP_NEXT_INVALID; end -= nr_pages; ci = lock_cluster(si, offset); if (ci->count + nr_pages > SWAPFILE_CLUSTER) { offset = SWAP_NEXT_INVALID; goto done; } while (offset <= end) { if (cluster_scan_range(si, ci, offset, nr_pages)) { if (!cluster_alloc_range(si, ci, offset, usage, order)) { offset = SWAP_NEXT_INVALID; goto done; } *foundp = offset; if (ci->count == SWAPFILE_CLUSTER) { offset = SWAP_NEXT_INVALID; goto done; } offset += nr_pages; break; } offset += nr_pages; } if (offset > end) offset = SWAP_NEXT_INVALID; done: unlock_cluster(ci); return offset; } /* Return true if reclaimed a whole cluster */ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; unsigned long offset, end; struct swap_cluster_info *ci; unsigned char *map = si->swap_map; int nr_reclaim; if (force) to_scan = si->inuse_pages / SWAPFILE_CLUSTER; while (!list_empty(&si->full_clusters)) { ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); list_move_tail(&ci->list, &si->full_clusters); offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; spin_unlock(&si->lock); while (offset < end) { if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); if (nr_reclaim) { offset += abs(nr_reclaim); continue; } } offset++; } spin_lock(&si->lock); if (to_scan <= 0) break; } } static void swap_reclaim_work(struct work_struct *work) { struct swap_info_struct *si; si = container_of(work, struct swap_info_struct, reclaim_work); spin_lock(&si->lock); swap_reclaim_full_clusters(si, true); spin_unlock(&si->lock); } /* * Try to get swap entries with specified order from current cpu's swap entry * pool (a cluster). This might involve allocating a new cluster for current CPU * too. */ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, unsigned char usage) { struct percpu_cluster *cluster; struct swap_cluster_info *ci; unsigned int offset, found = 0; new_cluster: lockdep_assert_held(&si->lock); cluster = this_cpu_ptr(si->percpu_cluster); offset = cluster->next[order]; if (offset) { offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); if (found) goto done; } if (!list_empty(&si->free_clusters)) { ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); /* * Either we didn't touch the cluster due to swapoff, * or the allocation must success. */ VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); goto done; } /* Try reclaim from full clusters if free clusters list is drained */ if (vm_swap_full()) swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { unsigned int frags = 0; while (!list_empty(&si->nonfull_clusters[order])) { ci = list_first_entry(&si->nonfull_clusters[order], struct swap_cluster_info, list); list_move_tail(&ci->list, &si->frag_clusters[order]); ci->flags = CLUSTER_FLAG_FRAG; si->frag_cluster_nr[order]++; offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); frags++; if (found) break; } if (!found) { /* * Nonfull clusters are moved to frag tail if we reached * here, count them too, don't over scan the frag list. */ while (frags < si->frag_cluster_nr[order]) { ci = list_first_entry(&si->frag_clusters[order], struct swap_cluster_info, list); /* * Rotate the frag list to iterate, they were all failing * high order allocation or moved here due to per-CPU usage, * this help keeping usable cluster ahead. */ list_move_tail(&ci->list, &si->frag_clusters[order]); offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); frags++; if (found) break; } } } if (found) goto done; if (!list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in * discarding, do discard now and reclaim them, then * reread cluster_next_cpu since we dropped si->lock */ swap_do_scheduled_discard(si); goto new_cluster; } if (order) goto done; /* Order 0 stealing from higher order */ for (int o = 1; o < SWAP_NR_ORDERS; o++) { /* * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. */ while (!list_empty(&si->frag_clusters[o])) { ci = list_first_entry(&si->frag_clusters[o], struct swap_cluster_info, list); offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, 0, usage); if (found) goto done; } while (!list_empty(&si->nonfull_clusters[o])) { ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, list); offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, 0, usage); if (found) goto done; } } done: cluster->next[order] = offset; return found; } static void __del_from_avail_list(struct swap_info_struct *si) { int nid; assert_spin_locked(&si->lock); for_each_node(nid) plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); } static void del_from_avail_list(struct swap_info_struct *si) { spin_lock(&swap_avail_lock); __del_from_avail_list(si); spin_unlock(&swap_avail_lock); } static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { unsigned int end = offset + nr_entries - 1; if (offset == si->lowest_bit) si->lowest_bit += nr_entries; if (end == si->highest_bit) WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; del_from_avail_list(si); if (si->cluster_info && vm_swap_full()) schedule_work(&si->reclaim_work); } } static void add_to_avail_list(struct swap_info_struct *si) { int nid; spin_lock(&swap_avail_lock); for_each_node(nid) plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); spin_unlock(&swap_avail_lock); } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { unsigned long begin = offset; unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); unsigned int i; /* * Use atomic clear_bit operations only on zeromap instead of non-atomic * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. */ for (i = 0; i < nr_entries; i++) clear_bit(offset + i, si->zeromap); if (offset < si->lowest_bit) si->lowest_bit = offset; if (end > si->highest_bit) { bool was_full = !si->highest_bit; WRITE_ONCE(si->highest_bit, end); if (was_full && (si->flags & SWP_WRITEOK)) add_to_avail_list(si); } if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; else swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; } clear_shadow_from_swap_cache(si->type, begin, end); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 * only after the above cleanups are done. */ smp_wmb(); atomic_long_add(nr_entries, &nr_swap_pages); WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); } static void set_cluster_next(struct swap_info_struct *si, unsigned long next) { unsigned long prev; if (!(si->flags & SWP_SOLIDSTATE)) { si->cluster_next = next; return; } prev = this_cpu_read(*si->cluster_next_cpu); /* * Cross the swap address space size aligned trunk, choose * another trunk randomly to avoid lock contention on swap * address space if possible. */ if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != (next >> SWAP_ADDRESS_SPACE_SHIFT)) { /* No free swap slots available */ if (si->highest_bit <= si->lowest_bit) return; next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); next = max_t(unsigned int, next, si->lowest_bit); } this_cpu_write(*si->cluster_next_cpu, next); } static bool swap_offset_available_and_locked(struct swap_info_struct *si, unsigned long offset) { if (data_race(!si->swap_map[offset])) { spin_lock(&si->lock); return true; } if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { spin_lock(&si->lock); return true; } return false; } static int cluster_alloc_swap(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { int n_ret = 0; VM_BUG_ON(!si->cluster_info); si->flags += SWP_SCANNING; while (n_ret < nr) { unsigned long offset = cluster_alloc_swap_entry(si, order, usage); if (!offset) break; slots[n_ret++] = swp_entry(si->type, offset); } si->flags -= SWP_SCANNING; return n_ret; } static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; unsigned int nr_pages = 1 << order; int n_ret = 0; bool scanned_many = false; /* * We try to cluster swap pages by allocating them sequentially * in swap. Once we've allocated SWAPFILE_CLUSTER pages this * way, however, we resort to first-free allocation, starting * a new cluster. This prevents us from scattering swap pages * all over the entire swap partition, so that we reduce * overall disk seek times between swap pages. -- sct * But we do now try to find an empty cluster. -Andrea * And we let swap pages go all over an SSD partition. Hugh */ if (order > 0) { /* * Should not even be attempting large allocations when huge * page swap is disabled. Warn and fail the allocation. */ if (!IS_ENABLED(CONFIG_THP_SWAP) || nr_pages > SWAPFILE_CLUSTER) { VM_WARN_ON_ONCE(1); return 0; } /* * Swapfile is not block device or not using clusters so unable * to allocate large entries. */ if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) return 0; } if (si->cluster_info) return cluster_alloc_swap(si, usage, nr, slots, order); si->flags += SWP_SCANNING; /* For HDD, sequential access is more important. */ scan_base = si->cluster_next; offset = scan_base; if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } spin_unlock(&si->lock); /* * If seek is expensive, start searching for new cluster from * start of partition, to minimize the span of allocated swap. */ scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) { if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&si->lock); offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; } } offset = scan_base; spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; } checks: if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) goto no_page; if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed > 0) goto checks; goto scan; /* check next one */ } if (si->swap_map[offset]) { if (!n_ret) goto scan; else goto done; } memset(si->swap_map + offset, usage, nr_pages); swap_range_alloc(si, offset, nr_pages); slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? */ if ((n_ret == nr) || (offset >= si->highest_bit)) goto done; /* search for next available slot */ /* time to take a break? */ if (unlikely(--latency_ration < 0)) { if (n_ret) goto done; spin_unlock(&si->lock); cond_resched(); spin_lock(&si->lock); latency_ration = LATENCY_LIMIT; } if (si->cluster_nr && !si->swap_map[++offset]) { /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; goto checks; } /* * Even if there's no free clusters available (fragmented), * try to scan a little more quickly with lock held unless we * have scanned too many slots already. */ if (!scanned_many) { unsigned long scan_limit; if (offset < scan_base) scan_limit = scan_base; else scan_limit = si->highest_bit; for (; offset <= scan_limit && --latency_ration > 0; offset++) { if (!si->swap_map[offset]) goto checks; } } done: if (order == 0) set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; scan: VM_WARN_ON(order > 0); spin_unlock(&si->lock); while (++offset <= READ_ONCE(si->highest_bit)) { if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } if (swap_offset_available_and_locked(si, offset)) goto checks; } offset = si->lowest_bit; while (offset < scan_base) { if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } if (swap_offset_available_and_locked(si, offset)) goto checks; offset++; } spin_lock(&si->lock); no_page: si->flags -= SWP_SCANNING; return n_ret; } int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) { int order = swap_entry_order(entry_order); unsigned long size = 1 << order; struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; spin_lock(&swap_avail_lock); avail_pgs = atomic_long_read(&nr_swap_pages) / size; if (avail_pgs <= 0) { spin_unlock(&swap_avail_lock); goto noswap; } n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); atomic_long_sub(n_goal * size, &nr_swap_pages); start_over: node = numa_node_id(); plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* requeue si to after same-priority siblings */ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); if (plist_node_empty(&si->avail_lists[node])) { spin_unlock(&si->lock); goto nextsi; } WARN(!si->highest_bit, "swap_info %d in list but !highest_bit\n", si->type); WARN(!(si->flags & SWP_WRITEOK), "swap_info %d in list but !SWP_WRITEOK\n", si->type); __del_from_avail_list(si); spin_unlock(&si->lock); goto nextsi; } n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries, order); spin_unlock(&si->lock); if (n_ret || size > 1) goto check_out; cond_resched(); spin_lock(&swap_avail_lock); nextsi: /* * if we got here, it's likely that si was almost full before, * and since scan_swap_map_slots() can drop the si->lock, * multiple callers probably all tried to get a page from the * same si and it filled up before we could get one; or, the si * filled up between us dropping swap_avail_lock and taking * si->lock. Since we dropped the swap_avail_lock, the * swap_avail_head list may have been modified; so if next is * still in the swap_avail_head list then try it, otherwise * start over if we have not gotten any slots. */ if (plist_node_empty(&next->avail_lists[node])) goto start_over; } spin_unlock(&swap_avail_lock); check_out: if (n_ret < n_goal) atomic_long_add((long)(n_goal - n_ret) * size, &nr_swap_pages); noswap: return n_ret; } static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { struct swap_info_struct *si; unsigned long offset; if (!entry.val) goto out; si = swp_swap_info(entry); if (!si) goto bad_nofile; if (data_race(!(si->flags & SWP_USED))) goto bad_device; offset = swp_offset(entry); if (offset >= si->max) goto bad_offset; if (data_race(!si->swap_map[swp_offset(entry)])) goto bad_free; return si; bad_free: pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); goto out; bad_offset: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); goto out; bad_device: pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val); goto out; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; } static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, struct swap_info_struct *q) { struct swap_info_struct *p; p = _swap_info_get(entry); if (p != q) { if (q != NULL) spin_unlock(&q->lock); if (p != NULL) spin_lock(&p->lock); } return p; } static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, unsigned long offset, unsigned char usage) { unsigned char count; unsigned char has_cache; count = si->swap_map[offset]; has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; if (usage == SWAP_HAS_CACHE) { VM_BUG_ON(!has_cache); has_cache = 0; } else if (count == SWAP_MAP_SHMEM) { /* * Or we could insist on shmem.c using a special * swap_shmem_free() and free_shmem_swap_and_cache()... */ count = 0; } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { if (count == COUNT_CONTINUED) { if (swap_count_continued(si, offset, count)) count = SWAP_MAP_MAX | COUNT_CONTINUED; else count = SWAP_MAP_MAX; } else count--; } usage = count | has_cache; if (usage) WRITE_ONCE(si->swap_map[offset], usage); else WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); return usage; } /* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in swap cache is locked, RCU * reader side is locked, etc., the swap entry may become invalid * because of swapoff. Then, we need to enclose all swap related * functions with get_swap_device() and put_swap_device(), unless the * swap functions call get/put_swap_device() by themselves. * * RCU reader side lock (including any spinlock) is sufficient to * prevent swapoff, because synchronize_rcu() is called in swapoff() * before freeing data structures. * * Check whether swap entry is valid in the swap device. If so, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until * put_swap_device() is called. Otherwise return NULL. * * Notice that swapoff or swapoff+swapon can still happen before the * percpu_ref_tryget_live() in get_swap_device() or after the * percpu_ref_put() in put_swap_device() if there isn't any other way * to prevent swapoff. The caller must be prepared for that. For * example, the following situation is possible. * * CPU1 CPU2 * do_swap_page() * ... swapoff+swapon * __read_swap_cache_async() * swapcache_prepare() * __swap_duplicate() * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before * changing partly because the specified swap entry may be for another * swap device which has been swapoff. And in do_swap_page(), after * the page is read from the swap device, the PTE is verified not * changed with the page table locked to check whether the swap device * has been swapoff or swapoff+swapon. */ struct swap_info_struct *get_swap_device(swp_entry_t entry) { struct swap_info_struct *si; unsigned long offset; if (!entry.val) goto out; si = swp_swap_info(entry); if (!si) goto bad_nofile; if (!percpu_ref_tryget_live(&si->users)) goto out; /* * Guarantee the si->users are checked before accessing other * fields of swap_info_struct. * * Paired with the spin_unlock() after setup_swap_info() in * enable_swap_info(). */ smp_rmb(); offset = swp_offset(entry); if (offset >= si->max) goto put_out; return si; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; put_out: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); percpu_ref_put(&si->users); return NULL; } static unsigned char __swap_entry_free(struct swap_info_struct *si, swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char usage; ci = lock_cluster_or_swap_info(si, offset); usage = __swap_entry_free_locked(si, offset, 1); unlock_cluster_or_swap_info(si, ci); if (!usage) free_swap_slot(entry); return usage; } static bool __swap_entries_free(struct swap_info_struct *si, swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); unsigned int type = swp_type(entry); struct swap_cluster_info *ci; bool has_cache = false; unsigned char count; int i; if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) goto fallback; /* cross into another cluster */ if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) goto fallback; ci = lock_cluster_or_swap_info(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { unlock_cluster_or_swap_info(si, ci); goto fallback; } for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); unlock_cluster_or_swap_info(si, ci); if (!has_cache) { for (i = 0; i < nr; i++) zswap_invalidate(swp_entry(si->type, offset + i)); spin_lock(&si->lock); swap_entry_range_free(si, entry, nr); spin_unlock(&si->lock); } return has_cache; fallback: for (i = 0; i < nr; i++) { if (data_race(si->swap_map[offset + i])) { count = __swap_entry_free(si, swp_entry(type, offset + i)); if (count == SWAP_HAS_CACHE) has_cache = true; } else { WARN_ON_ONCE(1); } } return has_cache; } /* * Drop the last HAS_CACHE flag of swap entries, caller have to * ensure all entries belong to the same cgroup. */ static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; struct swap_cluster_info *ci; ci = lock_cluster(si, offset); do { VM_BUG_ON(*map != SWAP_HAS_CACHE); *map = 0; } while (++map < map_end); dec_cluster_info_page(si, ci, nr_pages); unlock_cluster(ci); mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); } static void cluster_swap_free_nr(struct swap_info_struct *si, unsigned long offset, int nr_pages, unsigned char usage) { struct swap_cluster_info *ci; DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; int i, nr; ci = lock_cluster_or_swap_info(si, offset); while (nr_pages) { nr = min(BITS_PER_LONG, nr_pages); for (i = 0; i < nr; i++) { if (!__swap_entry_free_locked(si, offset + i, usage)) bitmap_set(to_free, i, 1); } if (!bitmap_empty(to_free, BITS_PER_LONG)) { unlock_cluster_or_swap_info(si, ci); for_each_set_bit(i, to_free, BITS_PER_LONG) free_swap_slot(swp_entry(si->type, offset + i)); if (nr == nr_pages) return; bitmap_clear(to_free, 0, BITS_PER_LONG); ci = lock_cluster_or_swap_info(si, offset); } offset += nr; nr_pages -= nr; } unlock_cluster_or_swap_info(si, ci); } /* * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. */ void swap_free_nr(swp_entry_t entry, int nr_pages) { int nr; struct swap_info_struct *sis; unsigned long offset = swp_offset(entry); sis = _swap_info_get(entry); if (!sis) return; while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); cluster_swap_free_nr(sis, offset, nr, 1); offset += nr; nr_pages -= nr; } } /* * Called after dropping swapcache to decrease refcnt to swap entries. */ void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); struct swap_cluster_info *ci; struct swap_info_struct *si; int size = 1 << swap_entry_order(folio_order(folio)); si = _swap_info_get(entry); if (!si) return; ci = lock_cluster_or_swap_info(si, offset); if (size > 1 && swap_is_has_cache(si, offset, size)) { unlock_cluster_or_swap_info(si, ci); spin_lock(&si->lock); swap_entry_range_free(si, entry, size); spin_unlock(&si->lock); return; } for (int i = 0; i < size; i++, entry.val++) { if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { unlock_cluster_or_swap_info(si, ci); free_swap_slot(entry); if (i == size - 1) return; lock_cluster_or_swap_info(si, offset); } } unlock_cluster_or_swap_info(si, ci); } static int swp_entry_cmp(const void *ent1, const void *ent2) { const swp_entry_t *e1 = ent1, *e2 = ent2; return (int)swp_type(*e1) - (int)swp_type(*e2); } void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; int i; if (n <= 0) return; prev = NULL; p = NULL; /* * Sort swap entries by swap device, so each lock is only taken once. * nr_swapfiles isn't absolutely correct, but the overhead of sort() is * so low that it isn't necessary to optimize further. */ if (nr_swapfiles > 1) sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { p = swap_info_get_cont(entries[i], prev); if (p) swap_entry_range_free(p, entries[i], 1); prev = p; } if (p) spin_unlock(&p->lock); } int __swap_count(swp_entry_t entry) { struct swap_info_struct *si = swp_swap_info(entry); pgoff_t offset = swp_offset(entry); return swap_count(si->swap_map[offset]); } /* * How many references to @entry are currently swapped out? * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; int count; ci = lock_cluster_or_swap_info(si, offset); count = swap_count(si->swap_map[offset]); unlock_cluster_or_swap_info(si, ci); return count; } /* * How many references to @entry are currently swapped out? * This considers COUNT_CONTINUED so it returns exact answer. */ int swp_swapcount(swp_entry_t entry) { int count, tmp_count, n; struct swap_info_struct *si; struct swap_cluster_info *ci; struct page *page; pgoff_t offset; unsigned char *map; si = _swap_info_get(entry); if (!si) return 0; offset = swp_offset(entry); ci = lock_cluster_or_swap_info(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) goto out; count &= ~COUNT_CONTINUED; n = SWAP_MAP_MAX + 1; page = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; VM_BUG_ON(page_private(page) != SWP_CONTINUED); do { page = list_next_entry(page, lru); map = kmap_local_page(page); tmp_count = map[offset]; kunmap_local(map); count += (tmp_count & ~COUNT_CONTINUED) * n; n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: unlock_cluster_or_swap_info(si, ci); return count; } static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry, int order) { struct swap_cluster_info *ci; unsigned char *map = si->swap_map; unsigned int nr_pages = 1 << order; unsigned long roffset = swp_offset(entry); unsigned long offset = round_down(roffset, nr_pages); int i; bool ret = false; ci = lock_cluster_or_swap_info(si, offset); if (!ci || nr_pages == 1) { if (swap_count(map[roffset])) ret = true; goto unlock_out; } for (i = 0; i < nr_pages; i++) { if (swap_count(map[offset + i])) { ret = true; break; } } unlock_out: unlock_cluster_or_swap_info(si, ci); return ret; } static bool folio_swapped(struct folio *folio) { swp_entry_t entry = folio->swap; struct swap_info_struct *si = _swap_info_get(entry); if (!si) return false; if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) return swap_swapcount(si, entry) != 0; return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } static bool folio_swapcache_freeable(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) return false; if (folio_test_writeback(folio)) return false; /* * Once hibernation has begun to create its image of memory, * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free * the swap from a folio which has already been recorded in the * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) return false; return true; } /** * folio_free_swap() - Free the swap space used for this folio. * @folio: The folio to remove. * * If swap is getting full, or if there are no more mappings of this folio, * then call folio_free_swap to free its swap space. * * Return: true if we were able to release the swap space. */ bool folio_free_swap(struct folio *folio) { if (!folio_swapcache_freeable(folio)) return false; if (folio_swapped(folio)) return false; delete_from_swap_cache(folio); folio_set_dirty(folio); return true; } /** * free_swap_and_cache_nr() - Release reference on range of swap entries and * reclaim their cache if no more references remain. * @entry: First entry of range. * @nr: Number of entries in range. * * For each swap entry in the contiguous range, release a reference. If any swap * entries become free, try to reclaim their underlying folios, if present. The * offset range is defined by [entry.offset, entry.offset + nr). */ void free_swap_and_cache_nr(swp_entry_t entry, int nr) { const unsigned long start_offset = swp_offset(entry); const unsigned long end_offset = start_offset + nr; struct swap_info_struct *si; bool any_only_cache = false; unsigned long offset; if (non_swap_entry(entry)) return; si = get_swap_device(entry); if (!si) return; if (WARN_ON(end_offset > si->max)) goto out; /* * First free all entries in the range. */ any_only_cache = __swap_entries_free(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their * reference drop to zero. */ if (!any_only_cache) goto out; /* * Now go back over the range trying to reclaim the swap cache. This is * more efficient for large folios because we will only try to reclaim * the swap once per folio in the common case. If we do * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the * latter will get a reference and lock the folio for every individual * page but will only succeed once the swap slot for every subpage is * zero. */ for (offset = start_offset; offset < end_offset; offset += nr) { nr = 1; if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { /* * Folios are always naturally aligned in swap so * advance forward to the next boundary. Zero means no * folio was found for the swap entry, so advance by 1 * in this case. Negative value means folio was found * but could not be reclaimed. Here we can still advance * to the next boundary. */ nr = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); if (nr == 0) nr = 1; else if (nr < 0) nr = -nr; nr = ALIGN(offset + 1, nr) - offset; } } out: put_swap_device(si); } #ifdef CONFIG_HIBERNATION swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si = swap_type_to_swap_info(type); swp_entry_t entry = {0}; if (!si) goto fail; /* This is called for allocating swap entry, not cache */ spin_lock(&si->lock); if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) atomic_long_dec(&nr_swap_pages); spin_unlock(&si->lock); fail: return entry; } /* * Find the swap type that corresponds to given device (if any). * * @offset - number of the PAGE_SIZE-sized block of the device, starting * from 0, in which the swap header is expected to be located. * * This is needed for the suspend to disk (aka swsusp). */ int swap_type_of(dev_t device, sector_t offset) { int type; if (!device) return -1; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; if (device == sis->bdev->bd_dev) { struct swap_extent *se = first_se(sis); if (se->start_block == offset) { spin_unlock(&swap_lock); return type; } } } spin_unlock(&swap_lock); return -ENODEV; } int find_first_swap(dev_t *device) { int type; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; *device = sis->bdev->bd_dev; spin_unlock(&swap_lock); return type; } spin_unlock(&swap_lock); return -ENODEV; } /* * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev * corresponding to given index in swap_info (swap type). */ sector_t swapdev_block(int type, pgoff_t offset) { struct swap_info_struct *si = swap_type_to_swap_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) return 0; se = offset_to_swap_extent(si, offset); return se->start_block + (offset - se->start_page); } /* * Return either the total number of swap pages of given type, or the number * of free pages of that type (depending on @free) * * This is needed for software suspend */ unsigned int count_swap_pages(int type, int free) { unsigned int n = 0; spin_lock(&swap_lock); if ((unsigned int)type < nr_swapfiles) { struct swap_info_struct *sis = swap_info[type]; spin_lock(&sis->lock); if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) n -= sis->inuse_pages; } spin_unlock(&sis->lock); } spin_unlock(&swap_lock); return n; } #endif /* CONFIG_HIBERNATION */ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) { return pte_same(pte_swp_clear_flags(pte), swp_pte); } /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to * force COW, vm_page_prot omits write permission from any private vma. */ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct folio *folio) { struct page *page; struct folio *swapcache; spinlock_t *ptl; pte_t *pte, new_pte, old_pte; bool hwpoisoned = false; int ret = 1; swapcache = folio; folio = ksm_might_need_to_copy(folio, vma, addr); if (unlikely(!folio)) return -ENOMEM; else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { hwpoisoned = true; folio = swapcache; } page = folio_file_page(folio, swp_offset(entry)); if (PageHWPoison(page)) hwpoisoned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), swp_entry_to_pte(entry)))) { ret = 0; goto out; } old_pte = ptep_get(pte); if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) { swp_entry_t swp_entry; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); if (hwpoisoned) { swp_entry = make_hwpoison_entry(page); } else { swp_entry = make_poisoned_swp_entry(); } new_pte = swp_entry_to_pte(swp_entry); ret = 0; goto setpte; } /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ arch_swap_restore(folio_swap(entry, folio), folio); dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); folio_get(folio); if (folio == swapcache) { rmap_t rmap_flags = RMAP_NONE; /* * See do_swap_page(): writeback would be problematic. * However, we do a folio_wait_writeback() just before this * call and have the folio locked. */ VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; /* * We currently only expect small !anon folios, which are either * fully exclusive or fully shared. If we ever get large folios * here, we have to be careful. */ if (!folio_test_anon(folio)) { VM_WARN_ON_ONCE(folio_test_large(folio)); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); folio_add_new_anon_rmap(folio, vma, addr, rmap_flags); } else { folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); } } else { /* ksm created a completely new copy */ folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); if (pte_swp_soft_dirty(old_pte)) new_pte = pte_mksoft_dirty(new_pte); if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: if (pte) pte_unmap_unlock(pte, ptl); if (folio != swapcache) { folio_unlock(folio); folio_put(folio); } return ret; } static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { pte_t *pte = NULL; struct swap_info_struct *si; si = swap_info[type]; do { struct folio *folio; unsigned long offset; unsigned char swp_count; swp_entry_t entry; int ret; pte_t ptent; if (!pte++) { pte = pte_offset_map(pmd, addr); if (!pte) break; } ptent = ptep_get_lockless(pte); if (!is_swap_pte(ptent)) continue; entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; offset = swp_offset(entry); pte_unmap(pte); pte = NULL; folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { struct vm_fault vmf = { .vma = vma, .address = addr, .real_address = addr, .pmd = pmd, }; folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); } if (!folio) { swp_count = READ_ONCE(si->swap_map[offset]); if (swp_count == 0 || swp_count == SWAP_MAP_BAD) continue; return -ENOMEM; } folio_lock(folio); folio_wait_writeback(folio); ret = unuse_pte(vma, pmd, addr, entry, folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); return ret; } folio_free_swap(folio); folio_unlock(folio); folio_put(folio); } while (addr += PAGE_SIZE, addr != end); if (pte) pte_unmap(pte); return 0; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, unsigned int type) { pmd_t *pmd; unsigned long next; int ret; pmd = pmd_offset(pud, addr); do { cond_resched(); next = pmd_addr_end(addr, end); ret = unuse_pte_range(vma, pmd, addr, next, type); if (ret) return ret; } while (pmd++, addr = next, addr != end); return 0; } static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned int type) { pud_t *pud; unsigned long next; int ret; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; ret = unuse_pmd_range(vma, pud, addr, next, type); if (ret) return ret; } while (pud++, addr = next, addr != end); return 0; } static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned int type) { p4d_t *p4d; unsigned long next; int ret; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; ret = unuse_pud_range(vma, p4d, addr, next, type); if (ret) return ret; } while (p4d++, addr = next, addr != end); return 0; } static int unuse_vma(struct vm_area_struct *vma, unsigned int type) { pgd_t *pgd; unsigned long addr, end, next; int ret; addr = vma->vm_start; end = vma->vm_end; pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; ret = unuse_p4d_range(vma, pgd, addr, next, type); if (ret) return ret; } while (pgd++, addr = next, addr != end); return 0; } static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); for_each_vma(vmi, vma) { if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { ret = unuse_vma(vma, type); if (ret) break; } cond_resched(); } mmap_read_unlock(mm); return ret; } /* * Scan swap_map from current position to next entry still in use. * Return 0 if there are no inuse entries after prev till end of * the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev) { unsigned int i; unsigned char count; /* * No need for swap_lock here: we're just looking * for whether an entry is in use, not modifying it; false * hits are okay, and sys_swapoff() has already prevented new * allocations from this area (while holding swap_lock). */ for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); } if (i == si->max) i = 0; return i; } static int try_to_unuse(unsigned int type) { struct mm_struct *prev_mm; struct mm_struct *mm; struct list_head *p; int retval = 0; struct swap_info_struct *si = swap_info[type]; struct folio *folio; swp_entry_t entry; unsigned int i; if (!READ_ONCE(si->inuse_pages)) goto success; retry: retval = shmem_unuse(type); if (retval) return retval; prev_mm = &init_mm; mmget(prev_mm); spin_lock(&mmlist_lock); p = &init_mm.mmlist; while (READ_ONCE(si->inuse_pages) && !signal_pending(current) && (p = p->next) != &init_mm.mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!mmget_not_zero(mm)) continue; spin_unlock(&mmlist_lock); mmput(prev_mm); prev_mm = mm; retval = unuse_mm(mm, type); if (retval) { mmput(prev_mm); return retval; } /* * Make sure that we aren't completely killing * interactive performance. */ cond_resched(); spin_lock(&mmlist_lock); } spin_unlock(&mmlist_lock); mmput(prev_mm); i = 0; while (READ_ONCE(si->inuse_pages) && !signal_pending(current) && (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (IS_ERR(folio)) continue; /* * It is conceivable that a racing task removed this folio from * swap cache just before we acquired the page lock. The folio * might even be back in swap cache on another swap area. But * that is okay, folio_free_swap() only removes stale folios. */ folio_lock(folio); folio_wait_writeback(folio); folio_free_swap(folio); folio_unlock(folio); folio_put(folio); } /* * Lets check again to see if there are still swap entries in the map. * If yes, we would need to do retry the unuse logic again. * Under global memory pressure, swap entries can be reinserted back * into process space after the mmlist loop above passes over them. * * Limit the number of retries? No: when mmget_not_zero() * above fails, that mm is likely to be freeing swap from * exit_mmap(), which proceeds at its own independent pace; * and even shmem_writepage() could have been preempted after * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying. */ if (READ_ONCE(si->inuse_pages)) { if (!signal_pending(current)) goto retry; return -EINTR; } success: /* * Make sure that further cleanups after try_to_unuse() returns happen * after swap_range_free() reduces si->inuse_pages to 0. */ smp_mb(); return 0; } /* * After a successful try_to_unuse, if no swap is now in use, we know * we can empty the mmlist. swap_lock must be held on entry and exit. * Note that mmlist_lock nests inside swap_lock, and an mm must be * added to the mmlist just after page_duplicate - before would be racy. */ static void drain_mmlist(void) { struct list_head *p, *next; unsigned int type; for (type = 0; type < nr_swapfiles; type++) if (swap_info[type]->inuse_pages) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) list_del_init(p); spin_unlock(&mmlist_lock); } /* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis) { while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { struct rb_node *rb = sis->swap_extent_root.rb_node; struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); rb_erase(rb, &sis->swap_extent_root); kfree(se); } if (sis->flags & SWP_ACTIVATED) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; sis->flags &= ~SWP_ACTIVATED; if (mapping->a_ops->swap_deactivate) mapping->a_ops->swap_deactivate(swap_file); } } /* * Add a block range (and the corresponding page range) into this swapdev's * extent tree. * * This function rather assumes that it is called in ascending page order. */ int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; struct swap_extent *se; struct swap_extent *new_se; /* * place the new node at the right most since the * function is called in ascending page order. */ while (*link) { parent = *link; link = &parent->rb_right; } if (parent) { se = rb_entry(parent, struct swap_extent, rb_node); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { /* Merge it */ se->nr_pages += nr_pages; return 0; } } /* No merge, insert a new extent. */ new_se = kmalloc(sizeof(*se), GFP_KERNEL); if (new_se == NULL) return -ENOMEM; new_se->start_page = start_page; new_se->nr_pages = nr_pages; new_se->start_block = start_block; rb_link_node(&new_se->rb_node, parent, link); rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); return 1; } EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages * onto a contiguous range of disk blocks. A rbtree of swap extents is * built at swapon time and is then used at swap_writepage/swap_read_folio * time for locating where on disk a page belongs. * * If the swapfile is an S_ISBLK block device, a single extent is installed. * This is done so that the main operating code can treat S_ISBLK and S_ISREG * swap files identically. * * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK * swapfiles are handled *identically* after swapon time. * * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray * blocks are found which do not fall within the PAGE_SIZE alignment * requirements, they are simply tossed out - we will never use those blocks * for swapping. * * For all swap devices we set S_SWAPFILE across the life of the swapon. This * prevents users from writing to the swap device, which will corrupt memory. * * The amount of disk space which a single swap extent represents varies. * Typically it is in the 1-4 megabyte range. So we can have hundreds of * extents in the rbtree. - akpm. */ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; int ret; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; return ret; } if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); if (ret < 0) return ret; sis->flags |= SWP_ACTIVATED; if ((sis->flags & SWP_FS_OPS) && sio_pool_init() != 0) { destroy_swap_extents(sis); return -ENOMEM; } return ret; } return generic_swapfile_activate(sis, swap_file, span); } static int swap_node(struct swap_info_struct *si) { struct block_device *bdev; if (si->bdev) bdev = si->bdev; else bdev = si->swap_file->f_inode->i_sb->s_bdev; return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } static void setup_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { int i; if (prio >= 0) si->prio = prio; else si->prio = --least_priority; /* * the plist prio is negated because plist ordering is * low-to-high, while swap ordering is high-to-low */ si->list.prio = -si->prio; for_each_node(i) { if (si->prio >= 0) si->avail_lists[i].prio = -si->prio; else { if (swap_node(si) == i) si->avail_lists[i].prio = 1; else si->avail_lists[i].prio = -si->prio; } } si->swap_map = swap_map; si->cluster_info = cluster_info; si->zeromap = zeromap; } static void _enable_swap_info(struct swap_info_struct *si) { si->flags |= SWP_WRITEOK; atomic_long_add(si->pages, &nr_swap_pages); total_swap_pages += si->pages; assert_spin_locked(&swap_lock); /* * both lists are plists, and thus priority ordered. * swap_active_head needs to be priority ordered for swapoff(), * which on removal of any swap_info_struct with an auto-assigned * (i.e. negative) priority increments the auto-assigned priority * of any lower-priority swap_info_structs. * swap_avail_head needs to be priority ordered for folio_alloc_swap(), * which allocates swap pages from the highest available priority * swap_info_struct. */ plist_add(&si->list, &swap_active_head); /* add to available list iff swap device is not full */ if (si->highest_bit) add_to_avail_list(si); } static void enable_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { spin_lock(&swap_lock); spin_lock(&si->lock); setup_swap_info(si, prio, swap_map, cluster_info, zeromap); spin_unlock(&si->lock); spin_unlock(&swap_lock); /* * Finished initializing swap device, now it's safe to reference it. */ percpu_ref_resurrect(&si->users); spin_lock(&swap_lock); spin_lock(&si->lock); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); } static void reinsert_swap_info(struct swap_info_struct *si) { spin_lock(&swap_lock); spin_lock(&si->lock); setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); } static bool __has_usable_swap(void) { return !plist_head_empty(&swap_active_head); } bool has_usable_swap(void) { bool ret; spin_lock(&swap_lock); ret = __has_usable_swap(); spin_unlock(&swap_lock); return ret; } SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; struct filename *pathname; int err, found = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; BUG_ON(!current->mm); pathname = getname(specialfile); if (IS_ERR(pathname)) return PTR_ERR(pathname); victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); err = PTR_ERR(victim); if (IS_ERR(victim)) goto out; mapping = victim->f_mapping; spin_lock(&swap_lock); plist_for_each_entry(p, &swap_active_head, list) { if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) { found = 1; break; } } } if (!found) { err = -EINVAL; spin_unlock(&swap_lock); goto out_dput; } if (!security_vm_enough_memory_mm(current->mm, p->pages)) vm_unacct_memory(p->pages); else { err = -ENOMEM; spin_unlock(&swap_lock); goto out_dput; } spin_lock(&p->lock); del_from_avail_list(p); if (p->prio < 0) { struct swap_info_struct *si = p; int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; for_each_node(nid) { if (si->avail_lists[nid].prio != 1) si->avail_lists[nid].prio--; } } least_priority++; } plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; spin_unlock(&p->lock); spin_unlock(&swap_lock); disable_swap_slots_cache_lock(); set_current_oom_origin(); err = try_to_unuse(p->type); clear_current_oom_origin(); if (err) { /* re-insert swap space back into swap_list */ reinsert_swap_info(p); reenable_swap_slots_cache_unlock(); goto out_dput; } reenable_swap_slots_cache_unlock(); /* * Wait for swap operations protected by get/put_swap_device() * to complete. Because of synchronize_rcu() here, all swap * operations protected by RCU reader side lock (including any * spinlock) will be waited too. This makes it easy to * prevent folio_test_swapcache() and the following swap cache * operations from racing with swapoff. */ percpu_ref_kill(&p->users); synchronize_rcu(); wait_for_completion(&p->comp); flush_work(&p->discard_work); flush_work(&p->reclaim_work); destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); if (!p->bdev || !bdev_nonrot(p->bdev)) atomic_dec(&nr_rotate_swap); mutex_lock(&swapon_mutex); spin_lock(&swap_lock); spin_lock(&p->lock); drain_mmlist(); /* wait for anyone still in scan_swap_map_slots */ p->highest_bit = 0; /* cuts scans short */ while (p->flags >= SWP_SCANNING) { spin_unlock(&p->lock); spin_unlock(&swap_lock); schedule_timeout_uninterruptible(1); spin_lock(&swap_lock); spin_lock(&p->lock); } swap_file = p->swap_file; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type); zswap_swapoff(p->type); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; free_percpu(p->cluster_next_cpu); p->cluster_next_cpu = NULL; vfree(swap_map); kvfree(zeromap); kvfree(cluster_info); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); exit_swap_address_space(p->type); inode = mapping->host; inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; inode_unlock(inode); filp_close(swap_file, NULL); /* * Clear the SWP_USED flag after all resources are freed so that swapon * can reuse this swap_info in alloc_swap_info() safely. It is ok to * not hold p->lock after we cleared its SWP_WRITEOK. */ spin_lock(&swap_lock); p->flags = 0; spin_unlock(&swap_lock); err = 0; atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); out_dput: filp_close(victim, NULL); out: putname(pathname); return err; } #ifdef CONFIG_PROC_FS static __poll_t swaps_poll(struct file *file, poll_table *wait) { struct seq_file *seq = file->private_data; poll_wait(file, &proc_poll_wait, wait); if (seq->poll_event != atomic_read(&proc_poll_event)) { seq->poll_event = atomic_read(&proc_poll_event); return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; } return EPOLLIN | EPOLLRDNORM; } /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { struct swap_info_struct *si; int type; loff_t l = *pos; mutex_lock(&swapon_mutex); if (!l) return SEQ_START_TOKEN; for (type = 0; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) return si; } return NULL; } static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) { struct swap_info_struct *si = v; int type; if (v == SEQ_START_TOKEN) type = 0; else type = si->type + 1; ++(*pos); for (; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; return si; } return NULL; } static void swap_stop(struct seq_file *swap, void *v) { mutex_unlock(&swapon_mutex); } static int swap_show(struct seq_file *swap, void *v) { struct swap_info_struct *si = v; struct file *file; int len; unsigned long bytes, inuse; if (si == SEQ_START_TOKEN) { seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } bytes = K(si->pages); inuse = K(READ_ONCE(si->inuse_pages)); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", bytes, bytes < 10000000 ? "\t" : "", inuse, inuse < 10000000 ? "\t" : "", si->prio); return 0; } static const struct seq_operations swaps_op = { .start = swap_start, .next = swap_next, .stop = swap_stop, .show = swap_show }; static int swaps_open(struct inode *inode, struct file *file) { struct seq_file *seq; int ret; ret = seq_open(file, &swaps_op); if (ret) return ret; seq = file->private_data; seq->poll_event = atomic_read(&proc_poll_event); return 0; } static const struct proc_ops swaps_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = swaps_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, .proc_poll = swaps_poll, }; static int __init procswaps_init(void) { proc_create("swaps", 0, NULL, &swaps_proc_ops); return 0; } __initcall(procswaps_init); #endif /* CONFIG_PROC_FS */ #ifdef MAX_SWAPFILES_CHECK static int __init max_swapfiles_check(void) { MAX_SWAPFILES_CHECK(); return 0; } late_initcall(max_swapfiles_check); #endif static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; struct swap_info_struct *defer = NULL; unsigned int type; int i; p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (percpu_ref_init(&p->users, swap_users_ref_free, PERCPU_REF_INIT_DEAD, GFP_KERNEL)) { kvfree(p); return ERR_PTR(-ENOMEM); } spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { if (!(swap_info[type]->flags & SWP_USED)) break; } if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); percpu_ref_exit(&p->users); kvfree(p); return ERR_PTR(-EPERM); } if (type >= nr_swapfiles) { p->type = type; /* * Publish the swap_info_struct after initializing it. * Note that kvzalloc() above zeroes all its fields. */ smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */ nr_swapfiles++; } else { defer = p; p = swap_info[type]; /* * Do not memset this entry: a racing procfs swap_next() * would be relying on p->type to remain valid. */ } p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); for_each_node(i) plist_node_init(&p->avail_lists[i], 0); p->flags = SWP_USED; spin_unlock(&swap_lock); if (defer) { percpu_ref_exit(&defer->users); kvfree(defer); } spin_lock_init(&p->lock); spin_lock_init(&p->cont_lock); init_completion(&p->comp); return p; } static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) { if (S_ISBLK(inode->i_mode)) { si->bdev = I_BDEV(inode); /* * Zoned block devices contain zones that have a sequential * write only restriction. Hence zoned block devices are not * suitable for swapping. Disallow them here. */ if (bdev_is_zoned(si->bdev)) return -EINVAL; si->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { si->bdev = inode->i_sb->s_bdev; } return 0; } /* * Find out how many pages are allowed for a single swap device. There * are two limiting factors: * 1) the number of bits for the swap offset in the swp_entry_t type, and * 2) the number of bits in the swap pte, as defined by the different * architectures. * * In order to find the largest possible bit mask, a swap entry with * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, * decoded to a swp_entry_t again, and finally the swap offset is * extracted. * * This will mask all the bits from the initial ~0UL mask that can't * be encoded in either the swp_entry_t or the architecture definition * of a swap pte. */ unsigned long generic_max_swapfile_size(void) { return swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; } /* Can be overridden by an architecture for additional checks. */ __weak unsigned long arch_max_swapfile_size(void) { return generic_max_swapfile_size(); } static unsigned long read_swap_header(struct swap_info_struct *si, union swap_header *swap_header, struct inode *inode) { int i; unsigned long maxpages; unsigned long swapfilepages; unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { pr_err("Unable to find swap-space signature\n"); return 0; } /* swap partition endianness hack... */ if (swab32(swap_header->info.version) == 1) { swab32s(&swap_header->info.version); swab32s(&swap_header->info.last_page); swab32s(&swap_header->info.nr_badpages); if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; for (i = 0; i < swap_header->info.nr_badpages; i++) swab32s(&swap_header->info.badpages[i]); } /* Check the swap header's sub-version */ if (swap_header->info.version != 1) { pr_warn("Unable to handle swap header version %d\n", swap_header->info.version); return 0; } si->lowest_bit = 1; si->cluster_next = 1; si->cluster_nr = 0; maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); return 0; } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", K(maxpages), K(last_page)); } if (maxpages > last_page) { maxpages = last_page + 1; /* p->max is an unsigned int: don't overflow it */ if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } si->highest_bit = maxpages - 1; if (!maxpages) return 0; swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { pr_warn("Swap area shorter than signature indicates\n"); return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) return 0; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; return maxpages; } #define SWAP_CLUSTER_INFO_COLS \ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) #define SWAP_CLUSTER_SPACE_COLS \ DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) #define SWAP_CLUSTER_COLS \ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) static int setup_swap_map_and_extents(struct swap_info_struct *si, union swap_header *swap_header, unsigned char *swap_map, unsigned long maxpages, sector_t *span) { unsigned int nr_good_pages; unsigned long i; int nr_extents; nr_good_pages = maxpages - 1; /* omit header page */ for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) return -EINVAL; if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; } } if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; si->max = maxpages; si->pages = nr_good_pages; nr_extents = setup_swap_extents(si, span); if (nr_extents < 0) return nr_extents; nr_good_pages = si->pages; } if (!nr_good_pages) { pr_warn("Empty swap-file\n"); return -EINVAL; } return nr_extents; } static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; struct swap_cluster_info *cluster_info; unsigned long i, j, k, idx; int cpu, err = -ENOMEM; cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) goto err; for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); si->cluster_next_cpu = alloc_percpu(unsigned int); if (!si->cluster_next_cpu) goto err_free; /* Random start position to help with wear leveling */ for_each_possible_cpu(cpu) per_cpu(*si->cluster_next_cpu, cpu) = get_random_u32_inclusive(1, si->highest_bit); si->percpu_cluster = alloc_percpu(struct percpu_cluster); if (!si->percpu_cluster) goto err_free; for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; cluster = per_cpu_ptr(si->percpu_cluster, cpu); for (i = 0; i < SWAP_NR_ORDERS; i++) cluster->next[i] = SWAP_NEXT_INVALID; } /* * Mark unusable pages as unavailable. The clusters aren't * marked free yet, so no list operations are involved yet. * * See setup_swap_map_and_extents(): header page, bad pages, * and the EOF part of the last cluster. */ inc_cluster_info_page(si, cluster_info, 0); for (i = 0; i < swap_header->info.nr_badpages; i++) inc_cluster_info_page(si, cluster_info, swap_header->info.badpages[i]); for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) inc_cluster_info_page(si, cluster_info, i); INIT_LIST_HEAD(&si->free_clusters); INIT_LIST_HEAD(&si->full_clusters); INIT_LIST_HEAD(&si->discard_clusters); for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); si->frag_cluster_nr[i] = 0; } /* * Reduce false cache line sharing between cluster_info and * sharing same address space. */ for (k = 0; k < SWAP_CLUSTER_COLS; k++) { j = (k + col) % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { struct swap_cluster_info *ci; idx = i * SWAP_CLUSTER_COLS + j; ci = cluster_info + idx; if (idx >= nr_clusters) continue; if (ci->count) { ci->flags = CLUSTER_FLAG_NONFULL; list_add_tail(&ci->list, &si->nonfull_clusters[0]); continue; } ci->flags = CLUSTER_FLAG_FREE; list_add_tail(&ci->list, &si->free_clusters); } } return cluster_info; err_free: kvfree(cluster_info); err: return ERR_PTR(err); } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *si; struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; struct dentry *dentry; int prio; int error; union swap_header *swap_header; int nr_extents; sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; unsigned long *zeromap = NULL; struct swap_cluster_info *cluster_info = NULL; struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; if (swap_flags & ~SWAP_FLAGS_VALID) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!swap_avail_heads) return -ENOMEM; si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); INIT_WORK(&si->discard_work, swap_discard_work); INIT_WORK(&si->reclaim_work, swap_reclaim_work); name = getname(specialfile); if (IS_ERR(name)) { error = PTR_ERR(name); name = NULL; goto bad_swap; } swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; goto bad_swap; } si->swap_file = swap_file; mapping = swap_file->f_mapping; dentry = swap_file->f_path.dentry; inode = mapping->host; error = claim_swapfile(si, inode); if (unlikely(error)) goto bad_swap; inode_lock(inode); if (d_unlinked(dentry) || cant_mount(dentry)) { error = -ENOENT; goto bad_swap_unlock_inode; } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; } /* * Read the swap header. */ if (!mapping->a_ops->read_folio) { error = -EINVAL; goto bad_swap_unlock_inode; } folio = read_mapping_folio(mapping, 0, swap_file); if (IS_ERR(folio)) { error = PTR_ERR(folio); goto bad_swap_unlock_inode; } swap_header = kmap_local_folio(folio, 0); maxpages = read_swap_header(si, swap_header, inode); if (unlikely(!maxpages)) { error = -EINVAL; goto bad_swap_unlock_inode; } /* OK, set up the swap map and apply the bad block list */ swap_map = vzalloc(maxpages); if (!swap_map) { error = -ENOMEM; goto bad_swap_unlock_inode; } error = swap_cgroup_swapon(si->type, maxpages); if (error) goto bad_swap_unlock_inode; nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map, maxpages, &span); if (unlikely(nr_extents < 0)) { error = nr_extents; goto bad_swap_unlock_inode; } /* * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might * be above MAX_PAGE_ORDER incase of a large swap file. */ zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), GFP_KERNEL | __GFP_ZERO); if (!zeromap) { error = -ENOMEM; goto bad_swap_unlock_inode; } if (si->bdev && bdev_stable_writes(si->bdev)) si->flags |= SWP_STABLE_WRITES; if (si->bdev && bdev_synchronous(si->bdev)) si->flags |= SWP_SYNCHRONOUS_IO; if (si->bdev && bdev_nonrot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; cluster_info = setup_clusters(si, swap_header, maxpages); if (IS_ERR(cluster_info)) { error = PTR_ERR(cluster_info); cluster_info = NULL; goto bad_swap_unlock_inode; } } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } if ((swap_flags & SWAP_FLAG_DISCARD) && si->bdev && bdev_max_discard_sectors(si->bdev)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in * order to sustain backward compatibility with older * swapon(8) releases. */ si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | SWP_PAGE_DISCARD); /* * By flagging sys_swapon, a sysadmin can tell us to * either do single-time area discards only, or to just * perform discards for released swap page-clusters. * Now it's time to adjust the p->flags accordingly. */ if (swap_flags & SWAP_FLAG_DISCARD_ONCE) si->flags &= ~SWP_PAGE_DISCARD; else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) si->flags &= ~SWP_AREA_DISCARD; /* issue a swapon-time discard if it's still required */ if (si->flags & SWP_AREA_DISCARD) { int err = discard_swap(si); if (unlikely(err)) pr_err("swapon: discard_swap(%p): %d\n", si, err); } } error = init_swap_address_space(si->type, maxpages); if (error) goto bad_swap_unlock_inode; error = zswap_swapon(si->type, maxpages); if (error) goto free_swap_address_space; /* * Flush any pending IO and dirty mappings before we start using this * swap device. */ inode->i_flags |= S_SWAPFILE; error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; goto free_swap_zswap; } mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; enable_swap_info(si, prio, swap_map, cluster_info, zeromap); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", K(si->pages), name->name, si->prio, nr_extents, K((unsigned long long)span), (si->flags & SWP_SOLIDSTATE) ? "SS" : "", (si->flags & SWP_DISCARDABLE) ? "D" : "", (si->flags & SWP_AREA_DISCARD) ? "s" : "", (si->flags & SWP_PAGE_DISCARD) ? "c" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); error = 0; goto out; free_swap_zswap: zswap_swapoff(si->type); free_swap_address_space: exit_swap_address_space(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: free_percpu(si->percpu_cluster); si->percpu_cluster = NULL; free_percpu(si->cluster_next_cpu); si->cluster_next_cpu = NULL; inode = NULL; destroy_swap_extents(si); swap_cgroup_swapoff(si->type); spin_lock(&swap_lock); si->swap_file = NULL; si->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); kvfree(zeromap); kvfree(cluster_info); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) filp_close(swap_file, NULL); out: if (!IS_ERR_OR_NULL(folio)) folio_release_kmap(folio, swap_header); if (name) putname(name); if (inode) inode_unlock(inode); if (!error) enable_swap_slots_cache(); return error; } void si_swapinfo(struct sysinfo *val) { unsigned int type; unsigned long nr_to_be_unused = 0; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) nr_to_be_unused += READ_ONCE(si->inuse_pages); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); } /* * Verify that nr swap entries are valid and increment their swap map counts. * * Returns error code in following case. * - success -> 0 * - swp_entry is invalid -> EINVAL * - swp_entry is migration entry -> EINVAL * - swap-cache reference is requested but there is already one. -> EEXIST * - swap-cache reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) { struct swap_info_struct *si; struct swap_cluster_info *ci; unsigned long offset; unsigned char count; unsigned char has_cache; int err, i; si = swp_swap_info(entry); offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); ci = lock_cluster_or_swap_info(si, offset); err = 0; for (i = 0; i < nr; i++) { count = si->swap_map[offset + i]; /* * swapin_readahead() doesn't check if a swap entry is valid, so the * swap entry could be SWAP_MAP_BAD. Check here with lock held. */ if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { err = -ENOENT; goto unlock_out; } has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; if (!count && !has_cache) { err = -ENOENT; } else if (usage == SWAP_HAS_CACHE) { if (has_cache) err = -EEXIST; } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) { err = -EINVAL; } if (err) goto unlock_out; } for (i = 0; i < nr; i++) { count = si->swap_map[offset + i]; has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; if (usage == SWAP_HAS_CACHE) has_cache = SWAP_HAS_CACHE; else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) count += usage; else if (swap_count_continued(si, offset + i, count)) count = COUNT_CONTINUED; else { /* * Don't need to rollback changes, because if * usage == 1, there must be nr == 1. */ err = -ENOMEM; goto unlock_out; } WRITE_ONCE(si->swap_map[offset + i], count | has_cache); } unlock_out: unlock_cluster_or_swap_info(si, ci); return err; } /* * Help swapoff by noting that swap entry belongs to shmem/tmpfs * (in which case its reference count is never incremented). */ void swap_shmem_alloc(swp_entry_t entry, int nr) { __swap_duplicate(entry, SWAP_MAP_SHMEM, nr); } /* * Increase reference count of swap entry by 1. * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required * but could not be atomically allocated. Returns 0, just as if it succeeded, * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which * might occur if a page table entry has got corrupted. */ int swap_duplicate(swp_entry_t entry) { int err = 0; while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } /* * @entry: first swap entry from which we allocate nr swap cache. * * Called when allocating swap cache for existing swap entries, * This can return error codes. Returns 0 at success. * -EEXIST means there is a swap cache. * Note: return code is different from swap_duplicate(). */ int swapcache_prepare(swp_entry_t entry, int nr) { return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return swap_type_to_swap_info(swp_type(entry)); } /* * out-of-line methods to avoid include hell. */ struct address_space *swapcache_mapping(struct folio *folio) { return swp_swap_info(folio->swap)->swap_file->f_mapping; } EXPORT_SYMBOL_GPL(swapcache_mapping); pgoff_t __folio_swap_cache_index(struct folio *folio) { return swap_cache_index(folio->swap); } EXPORT_SYMBOL_GPL(__folio_swap_cache_index); /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's * page of the original vmalloc'ed swap_map, to hold the continuation count * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. * * These continuation pages are seldom referenced: the common paths all work * on the original swap_map, only referring to a continuation page when the * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. * * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) * can be called after dropping locks. */ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) { struct swap_info_struct *si; struct swap_cluster_info *ci; struct page *head; struct page *page; struct page *list_page; pgoff_t offset; unsigned char count; int ret = 0; /* * When debugging, it's easier to use __GFP_ZERO here; but it's better * for latency not to zero a page while GFP_ATOMIC and holding locks. */ page = alloc_page(gfp_mask | __GFP_HIGHMEM); si = get_swap_device(entry); if (!si) { /* * An acceptable race has occurred since the failing * __swap_duplicate(): the swap device may be swapoff */ goto outer; } spin_lock(&si->lock); offset = swp_offset(entry); ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { /* * The higher the swap count, the more likely it is that tasks * will race to add swap count continuation: we need to avoid * over-provisioning. */ goto out; } if (!page) { ret = -ENOMEM; goto out; } head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; spin_lock(&si->cont_lock); /* * Page allocation does not initialize the page's lru field, * but it does always reset its private field. */ if (!page_private(head)) { BUG_ON(count & COUNT_CONTINUED); INIT_LIST_HEAD(&head->lru); set_page_private(head, SWP_CONTINUED); si->flags |= SWP_CONTINUED; } list_for_each_entry(list_page, &head->lru, lru) { unsigned char *map; /* * If the previous map said no continuation, but we've found * a continuation page, free our allocation and use this one. */ if (!(count & COUNT_CONTINUED)) goto out_unlock_cont; map = kmap_local_page(list_page) + offset; count = *map; kunmap_local(map); /* * If this continuation count now has some space in it, * free our allocation and use this one. */ if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) goto out_unlock_cont; } list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ out_unlock_cont: spin_unlock(&si->cont_lock); out: unlock_cluster(ci); spin_unlock(&si->lock); put_swap_device(si); outer: if (page) __free_page(page); return ret; } /* * swap_count_continued - when the original swap_map count is incremented * from SWAP_MAP_MAX, check if there is already a continuation page to carry * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster * lock. */ static bool swap_count_continued(struct swap_info_struct *si, pgoff_t offset, unsigned char count) { struct page *head; struct page *page; unsigned char *map; bool ret; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head) != SWP_CONTINUED) { BUG_ON(count & COUNT_CONTINUED); return false; /* need to add count continuation */ } spin_lock(&si->cont_lock); offset &= ~PAGE_MASK; page = list_next_entry(head, lru); map = kmap_local_page(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ goto init_map; /* jump over SWAP_CONT_MAX checks */ if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ /* * Think of how you add 1 to 999 */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { kunmap_local(map); page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_local_page(page) + offset; } if (*map == SWAP_CONT_MAX) { kunmap_local(map); page = list_next_entry(page, lru); if (page == head) { ret = false; /* add count continuation */ goto out; } map = kmap_local_page(page) + offset; init_map: *map = 0; /* we didn't zero the page */ } *map += 1; kunmap_local(map); while ((page = list_prev_entry(page, lru)) != head) { map = kmap_local_page(page) + offset; *map = COUNT_CONTINUED; kunmap_local(map); } ret = true; /* incremented */ } else { /* decrementing */ /* * Think of how you subtract 1 from 1000 */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { kunmap_local(map); page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_local_page(page) + offset; } BUG_ON(*map == 0); *map -= 1; if (*map == 0) count = 0; kunmap_local(map); while ((page = list_prev_entry(page, lru)) != head) { map = kmap_local_page(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; kunmap_local(map); } ret = count == COUNT_CONTINUED; } out: spin_unlock(&si->cont_lock); return ret; } /* * free_swap_count_continuations - swapoff free all the continuation pages * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. */ static void free_swap_count_continuations(struct swap_info_struct *si) { pgoff_t offset; for (offset = 0; offset < si->max; offset += PAGE_SIZE) { struct page *head; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head)) { struct page *page, *next; list_for_each_entry_safe(page, next, &head->lru, lru) { list_del(&page->lru); __free_page(page); } } } } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si, *next; int nid = folio_nid(folio); if (!(gfp & __GFP_IO)) return; if (!__has_usable_swap()) return; if (!blk_cgroup_congested()) return; /* * We've already scheduled a throttle, avoid taking the global swap * lock. */ if (current->throttle_disk) return; spin_lock(&swap_avail_lock); plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; } } spin_unlock(&swap_avail_lock); } #endif static int __init swapfile_init(void) { int nid; swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), GFP_KERNEL); if (!swap_avail_heads) { pr_emerg("Not enough memory for swap heads, swap is disabled\n"); return -ENOMEM; } for_each_node(nid) plist_head_init(&swap_avail_heads[nid]); swapfile_maximum_size = arch_max_swapfile_size(); #ifdef CONFIG_MIGRATION if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) swap_migration_ad_supported = true; #endif /* CONFIG_MIGRATION */ return 0; } subsys_initcall(swapfile_init);
5 3 3 1 1 3 3 3 3 3 8 8 5 3 3 5 5 5 1 1 1 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_attr_sf.h" #include "xfs_attr.h" #include "xfs_attr_remote.h" #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_dir2.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_errortag.h" #include "xfs_health.h" /* * xfs_attr_leaf.c * * Routines to implement leaf blocks of attributes as Btrees of hashed names. */ /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ /* * Routines used for growing the Btree. */ STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block, struct xfs_buf **bpp); STATIC void xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, struct xfs_attr3_icleaf_hdr *ichdr, struct xfs_da_args *args, int freemap_index); STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, struct xfs_attr3_icleaf_hdr *ichdr, struct xfs_buf *leaf_buffer); STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state, xfs_da_state_blk_t *leaf_blk_1, struct xfs_attr3_icleaf_hdr *ichdr1, xfs_da_state_blk_t *leaf_blk_2, struct xfs_attr3_icleaf_hdr *ichdr2, int *number_entries_in_blk1, int *number_usedbytes_in_blk1); /* * Utility routines. */ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args, struct xfs_attr_leafblock *src_leaf, struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start, struct xfs_attr_leafblock *dst_leaf, struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start, int move_count); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); /* * attr3 block 'firstused' conversion helpers. * * firstused refers to the offset of the first used byte of the nameval region * of an attr leaf block. The region starts at the tail of the block and expands * backwards towards the middle. As such, firstused is initialized to the block * size for an empty leaf block and is reduced from there. * * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k. * The in-core firstused field is 32-bit and thus supports the maximum fsb size. * The on-disk field is only 16-bit, however, and overflows at 64k. Since this * only occurs at exactly 64k, we use zero as a magic on-disk value to represent * the attr block size. The following helpers manage the conversion between the * in-core and on-disk formats. */ static void xfs_attr3_leaf_firstused_from_disk( struct xfs_da_geometry *geo, struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from) { struct xfs_attr3_leaf_hdr *hdr3; if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { hdr3 = (struct xfs_attr3_leaf_hdr *) from; to->firstused = be16_to_cpu(hdr3->firstused); } else { to->firstused = be16_to_cpu(from->hdr.firstused); } /* * Convert from the magic fsb size value to actual blocksize. This * should only occur for empty blocks when the block size overflows * 16-bits. */ if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) { ASSERT(!to->count && !to->usedbytes); ASSERT(geo->blksize > USHRT_MAX); to->firstused = geo->blksize; } } static void xfs_attr3_leaf_firstused_to_disk( struct xfs_da_geometry *geo, struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from) { struct xfs_attr3_leaf_hdr *hdr3; uint32_t firstused; /* magic value should only be seen on disk */ ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF); /* * Scale down the 32-bit in-core firstused value to the 16-bit on-disk * value. This only overflows at the max supported value of 64k. Use the * magic on-disk value to represent block size in this case. */ firstused = from->firstused; if (firstused > USHRT_MAX) { ASSERT(from->firstused == geo->blksize); firstused = XFS_ATTR3_LEAF_NULLOFF; } if (from->magic == XFS_ATTR3_LEAF_MAGIC) { hdr3 = (struct xfs_attr3_leaf_hdr *) to; hdr3->firstused = cpu_to_be16(firstused); } else { to->hdr.firstused = cpu_to_be16(firstused); } } void xfs_attr3_leaf_hdr_from_disk( struct xfs_da_geometry *geo, struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from) { int i; ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from; to->forw = be32_to_cpu(hdr3->info.hdr.forw); to->back = be32_to_cpu(hdr3->info.hdr.back); to->magic = be16_to_cpu(hdr3->info.hdr.magic); to->count = be16_to_cpu(hdr3->count); to->usedbytes = be16_to_cpu(hdr3->usedbytes); xfs_attr3_leaf_firstused_from_disk(geo, to, from); to->holes = hdr3->holes; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base); to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size); } return; } to->forw = be32_to_cpu(from->hdr.info.forw); to->back = be32_to_cpu(from->hdr.info.back); to->magic = be16_to_cpu(from->hdr.info.magic); to->count = be16_to_cpu(from->hdr.count); to->usedbytes = be16_to_cpu(from->hdr.usedbytes); xfs_attr3_leaf_firstused_from_disk(geo, to, from); to->holes = from->hdr.holes; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base); to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size); } } void xfs_attr3_leaf_hdr_to_disk( struct xfs_da_geometry *geo, struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from) { int i; ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || from->magic == XFS_ATTR3_LEAF_MAGIC); if (from->magic == XFS_ATTR3_LEAF_MAGIC) { struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to; hdr3->info.hdr.forw = cpu_to_be32(from->forw); hdr3->info.hdr.back = cpu_to_be32(from->back); hdr3->info.hdr.magic = cpu_to_be16(from->magic); hdr3->count = cpu_to_be16(from->count); hdr3->usedbytes = cpu_to_be16(from->usedbytes); xfs_attr3_leaf_firstused_to_disk(geo, to, from); hdr3->holes = from->holes; hdr3->pad1 = 0; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base); hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size); } return; } to->hdr.info.forw = cpu_to_be32(from->forw); to->hdr.info.back = cpu_to_be32(from->back); to->hdr.info.magic = cpu_to_be16(from->magic); to->hdr.count = cpu_to_be16(from->count); to->hdr.usedbytes = cpu_to_be16(from->usedbytes); xfs_attr3_leaf_firstused_to_disk(geo, to, from); to->hdr.holes = from->holes; to->hdr.pad1 = 0; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base); to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size); } } static xfs_failaddr_t xfs_attr3_leaf_verify_entry( struct xfs_mount *mp, char *buf_end, struct xfs_attr_leafblock *leaf, struct xfs_attr3_icleaf_hdr *leafhdr, struct xfs_attr_leaf_entry *ent, int idx, __u32 *last_hashval) { struct xfs_attr_leaf_name_local *lentry; struct xfs_attr_leaf_name_remote *rentry; char *name_end; unsigned int nameidx; unsigned int namesize; __u32 hashval; /* hash order check */ hashval = be32_to_cpu(ent->hashval); if (hashval < *last_hashval) return __this_address; *last_hashval = hashval; nameidx = be16_to_cpu(ent->nameidx); if (nameidx < leafhdr->firstused || nameidx >= mp->m_attr_geo->blksize) return __this_address; /* * Check the name information. The namelen fields are u8 so we can't * possibly exceed the maximum name length of 255 bytes. */ if (ent->flags & XFS_ATTR_LOCAL) { lentry = xfs_attr3_leaf_name_local(leaf, idx); namesize = xfs_attr_leaf_entsize_local(lentry->namelen, be16_to_cpu(lentry->valuelen)); name_end = (char *)lentry + namesize; if (lentry->namelen == 0) return __this_address; } else { rentry = xfs_attr3_leaf_name_remote(leaf, idx); namesize = xfs_attr_leaf_entsize_remote(rentry->namelen); name_end = (char *)rentry + namesize; if (rentry->namelen == 0) return __this_address; if (!(ent->flags & XFS_ATTR_INCOMPLETE) && rentry->valueblk == 0) return __this_address; } if (name_end > buf_end) return __this_address; return NULL; } /* * Validate an attribute leaf block. * * Empty leaf blocks can occur under the following circumstances: * * 1. setxattr adds a new extended attribute to a file; * 2. The file has zero existing attributes; * 3. The attribute is too large to fit in the attribute fork; * 4. The attribute is small enough to fit in a leaf block; * 5. A log flush occurs after committing the transaction that creates * the (empty) leaf block; and * 6. The filesystem goes down after the log flush but before the new * attribute can be committed to the leaf block. * * Hence we need to ensure that we don't fail the validation purely * because the leaf is empty. */ static xfs_failaddr_t xfs_attr3_leaf_verify( struct xfs_buf *bp) { struct xfs_attr3_icleaf_hdr ichdr; struct xfs_mount *mp = bp->b_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr_leaf_entry *entries; struct xfs_attr_leaf_entry *ent; char *buf_end; uint32_t end; /* must be 32bit - see below */ __u32 last_hashval = 0; int i; xfs_failaddr_t fa; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); if (fa) return fa; /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. */ if (ichdr.firstused > mp->m_attr_geo->blksize) return __this_address; if (ichdr.firstused < xfs_attr3_leaf_hdr_size(leaf)) return __this_address; /* Make sure the entries array doesn't crash into the name info. */ entries = xfs_attr3_leaf_entryp(bp->b_addr); if ((char *)&entries[ichdr.count] > (char *)bp->b_addr + ichdr.firstused) return __this_address; /* * NOTE: This verifier historically failed empty leaf buffers because * we expect the fork to be in another format. Empty attr fork format * conversions are possible during xattr set, however, and format * conversion is not atomic with the xattr set that triggers it. We * cannot assume leaf blocks are non-empty until that is addressed. */ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; for (i = 0, ent = entries; i < ichdr.count; ent++, i++) { fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr, ent, i, &last_hashval); if (fa) return fa; } /* * Quickly check the freemap information. Attribute data has to be * aligned to 4-byte boundaries, and likewise for the free space. * * Note that for 64k block size filesystems, the freemap entries cannot * overflow as they are only be16 fields. However, when checking end * pointer of the freemap, we have to be careful to detect overflows and * so use uint32_t for those checks. */ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { if (ichdr.freemap[i].base > mp->m_attr_geo->blksize) return __this_address; if (ichdr.freemap[i].base & 0x3) return __this_address; if (ichdr.freemap[i].size > mp->m_attr_geo->blksize) return __this_address; if (ichdr.freemap[i].size & 0x3) return __this_address; /* be care of 16 bit overflows here */ end = (uint32_t)ichdr.freemap[i].base + ichdr.freemap[i].size; if (end < ichdr.freemap[i].base) return __this_address; if (end > mp->m_attr_geo->blksize) return __this_address; } return NULL; } xfs_failaddr_t xfs_attr3_leaf_header_check( struct xfs_buf *bp, xfs_ino_t owner) { struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { struct xfs_attr3_leafblock *hdr3 = bp->b_addr; if (hdr3->hdr.info.hdr.magic != cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) return __this_address; if (be64_to_cpu(hdr3->hdr.info.owner) != owner) return __this_address; } return NULL; } static void xfs_attr3_leaf_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; fa = xfs_attr3_leaf_verify(bp); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } if (!xfs_has_crc(mp)) return; if (bip) hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF); } /* * leaf/node format detection on trees is sketchy, so a node read can be done on * leaf level blocks when detection identifies the tree as a node format tree * incorrectly. In this case, we need to swap the verifier to match the correct * format of the block being read. */ static void xfs_attr3_leaf_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_attr3_leaf_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { .name = "xfs_attr3_leaf", .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC), cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) }, .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, .verify_struct = xfs_attr3_leaf_verify, }; int xfs_attr3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_ino_t owner, xfs_dablk_t bno, struct xfs_buf **bpp) { xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); if (err || !(*bpp)) return err; fa = xfs_attr3_leaf_header_check(*bpp, owner); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); return 0; } /*======================================================================== * Namespace helper routines *========================================================================*/ /* * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE * flag on disk - if there's an incomplete attr then recovery needs to tear it * down. If there's no incomplete attr, then recovery needs to tear that attr * down to replace it with the attr that has been logged. In this case, the * INCOMPLETE flag will not be set in attr->attr_filter, but rather * XFS_DA_OP_RECOVERY will be set in args->op_flags. */ static inline unsigned int xfs_attr_match_mask(const struct xfs_da_args *args) { if (args->op_flags & XFS_DA_OP_RECOVERY) return XFS_ATTR_NSP_ONDISK_MASK; return XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE; } static inline bool xfs_attr_parent_match( const struct xfs_da_args *args, const void *value, unsigned int valuelen) { ASSERT(args->value != NULL); /* Parent pointers do not use remote values */ if (!value) return false; /* * The only value we support is a parent rec. However, we'll accept * any valuelen so that offline repair can delete ATTR_PARENT values * that are not parent pointers. */ if (valuelen != args->valuelen) return false; return memcmp(args->value, value, valuelen) == 0; } static bool xfs_attr_match( struct xfs_da_args *args, unsigned int attr_flags, const unsigned char *name, unsigned int namelen, const void *value, unsigned int valuelen) { unsigned int mask = xfs_attr_match_mask(args); if (args->namelen != namelen) return false; if ((args->attr_filter & mask) != (attr_flags & mask)) return false; if (memcmp(args->name, name, namelen) != 0) return false; if (attr_flags & XFS_ATTR_PARENT) return xfs_attr_parent_match(args, value, valuelen); return true; } static int xfs_attr_copy_value( struct xfs_da_args *args, unsigned char *value, int valuelen) { /* * Parent pointer lookups require the caller to specify the name and * value, so don't copy anything. */ if (args->attr_filter & XFS_ATTR_PARENT) return 0; /* * No copy if all we have to do is get the length */ if (!args->valuelen) { args->valuelen = valuelen; return 0; } /* * No copy if the length of the existing buffer is too small */ if (args->valuelen < valuelen) { args->valuelen = valuelen; return -ERANGE; } if (!args->value) { args->value = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP); if (!args->value) return -ENOMEM; } args->valuelen = valuelen; /* remote block xattr requires IO for copy-in */ if (args->rmtblkno) return xfs_attr_rmtval_get(args); /* * This is to prevent a GCC warning because the remote xattr case * doesn't have a value to pass in. In that case, we never reach here, * but GCC can't work that out and so throws a "passing NULL to * memcpy" warning. */ if (!value) return -EINVAL; memcpy(args->value, value, valuelen); return 0; } /*======================================================================== * External routines when attribute fork size < XFS_LITINO(mp). *========================================================================*/ /* * Query whether the total requested number of attr fork bytes of extended * attribute space will be able to fit inline. * * Returns zero if not, else the i_forkoff fork offset to be used in the * literal area for attribute data once the new bytes have been added. * * i_forkoff must be 8 byte aligned, hence is stored as a >>3 value; * special case for dev/uuid inodes, they have fixed size data forks. */ int xfs_attr_shortform_bytesfit( struct xfs_inode *dp, int bytes) { struct xfs_mount *mp = dp->i_mount; int64_t dsize; int minforkoff; int maxforkoff; int offset; /* * Check if the new size could fit at all first: */ if (bytes > XFS_LITINO(mp)) return 0; /* rounded down */ offset = (XFS_LITINO(mp) - bytes) >> 3; if (dp->i_df.if_format == XFS_DINODE_FMT_DEV) { minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; return (offset >= minforkoff) ? minforkoff : 0; } /* * If the requested numbers of bytes is smaller or equal to the * current attribute fork size we can always proceed. * * Note that if_bytes in the data fork might actually be larger than * the current data fork size is due to delalloc extents. In that * case either the extent count will go down when they are converted * to real extents, or the delalloc conversion will take care of the * literal area rebalancing. */ if (bytes <= xfs_inode_attr_fork_size(dp)) return dp->i_forkoff; /* * For attr2 we can try to move the forkoff if there is space in the * literal area, but for the old format we are done if there is no * space in the fixed attribute fork. */ if (!xfs_has_attr2(mp)) return 0; dsize = dp->i_df.if_bytes; switch (dp->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: /* * If there is no attr fork and the data fork is extents, * determine if creating the default attr fork will result * in the extents form migrating to btree. If so, the * minimum offset only needs to be the space required for * the btree root. */ if (!dp->i_forkoff && dp->i_df.if_bytes > xfs_default_attroffset(dp)) dsize = xfs_bmdr_space_calc(MINDBTPTRS); break; case XFS_DINODE_FMT_BTREE: /* * If we have a data btree then keep forkoff if we have one, * otherwise we are adding a new attr, so then we set * minforkoff to where the btree root can finish so we have * plenty of room for attrs */ if (dp->i_forkoff) { if (offset < dp->i_forkoff) return 0; return dp->i_forkoff; } dsize = xfs_bmap_bmdr_space(dp->i_df.if_broot); break; } /* * A data fork btree root must have space for at least * MINDBTPTRS key/ptr pairs if the data fork is small or empty. */ minforkoff = max_t(int64_t, dsize, xfs_bmdr_space_calc(MINDBTPTRS)); minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ maxforkoff = XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) return maxforkoff; if (offset >= minforkoff) return offset; return 0; } /* * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless: * - noattr2 mount option is set, * - on-disk version bit says it is already set, or * - the attr2 mount option is not set to enable automatic upgrade from attr1. */ STATIC void xfs_sbversion_add_attr2( struct xfs_mount *mp, struct xfs_trans *tp) { if (xfs_has_noattr2(mp)) return; if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT) return; if (!xfs_has_attr2(mp)) return; spin_lock(&mp->m_sb_lock); xfs_add_attr2(mp); spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); } /* * Create the initial contents of a shortform attribute list. */ void xfs_attr_shortform_create( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; struct xfs_ifork *ifp = &dp->i_af; struct xfs_attr_sf_hdr *hdr; trace_xfs_attr_sf_create(args); ASSERT(ifp->if_bytes == 0); if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) ifp->if_format = XFS_DINODE_FMT_LOCAL; hdr = xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); memset(hdr, 0, sizeof(*hdr)); hdr->totsize = cpu_to_be16(sizeof(*hdr)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } /* * Return the entry if the attr in args is found, or NULL if not. */ struct xfs_attr_sf_entry * xfs_attr_sf_findname( struct xfs_da_args *args) { struct xfs_attr_sf_hdr *sf = args->dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; for (sfe = xfs_attr_sf_firstentry(sf); sfe < xfs_attr_sf_endptr(sf); sfe = xfs_attr_sf_nextentry(sfe)) { if (xfs_attr_match(args, sfe->flags, sfe->nameval, sfe->namelen, &sfe->nameval[sfe->namelen], sfe->valuelen)) return sfe; } return NULL; } /* * Add a name/value pair to the shortform attribute list. * Overflow from the inode has already been checked for. */ void xfs_attr_shortform_add( struct xfs_da_args *args, int forkoff) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_ifork *ifp = &dp->i_af; struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; int size; trace_xfs_attr_sf_add(args); dp->i_forkoff = forkoff; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(!xfs_attr_sf_findname(args)); size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); sf = xfs_idata_realloc(dp, size, XFS_ATTR_FORK); sfe = xfs_attr_sf_endptr(sf); sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; sfe->flags = args->attr_filter; memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); sf->count++; be16_add_cpu(&sf->totsize, size); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); xfs_sbversion_add_attr2(mp, args->trans); } /* * After the last attribute is removed revert to original inode format, * making all literal area available to the data fork once more. */ void xfs_attr_fork_remove( struct xfs_inode *ip, struct xfs_trans *tp) { ASSERT(ip->i_af.if_nextents == 0); xfs_ifork_zap_attr(ip); ip->i_forkoff = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } /* * Remove an attribute from the shortform attribute list structure. */ int xfs_attr_sf_removename( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; uint16_t totsize = be16_to_cpu(sf->totsize); void *next, *end; int size = 0; trace_xfs_attr_sf_remove(args); sfe = xfs_attr_sf_findname(args); if (!sfe) { /* * If we are recovering an operation, finding nothing to remove * is not an error, it just means there was nothing to clean up. */ if (args->op_flags & XFS_DA_OP_RECOVERY) return 0; return -ENOATTR; } /* * Fix up the attribute fork data, covering the hole */ size = xfs_attr_sf_entsize(sfe); next = xfs_attr_sf_nextentry(sfe); end = xfs_attr_sf_endptr(sf); if (next < end) memmove(sfe, next, end - next); sf->count--; totsize -= size; sf->totsize = cpu_to_be16(totsize); /* * Fix up the start offset of the attribute fork */ if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) && !xfs_has_parent(mp)) { xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_forkoff); ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE || xfs_has_parent(mp)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } xfs_sbversion_add_attr2(mp, args->trans); return 0; } /* * Retrieve the attribute value and length. * * If args->valuelen is zero, only the length needs to be returned. Unlike a * lookup, we only return an error if the attribute does not exist or we can't * retrieve the value. */ int xfs_attr_shortform_getvalue( struct xfs_da_args *args) { struct xfs_attr_sf_entry *sfe; ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); trace_xfs_attr_sf_lookup(args); sfe = xfs_attr_sf_findname(args); if (!sfe) return -ENOATTR; return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], sfe->valuelen); } /* Convert from using the shortform to the leaf format. */ int xfs_attr_shortform_to_leaf( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; struct xfs_ifork *ifp = &dp->i_af; struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; int size = be16_to_cpu(sf->totsize); struct xfs_da_args nargs; char *tmpbuffer; int error, i; xfs_dablk_t blkno; struct xfs_buf *bp; trace_xfs_attr_sf_to_leaf(args); tmpbuffer = kmalloc(size, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, ifp->if_data, size); sf = (struct xfs_attr_sf_hdr *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); bp = NULL; error = xfs_da_grow_inode(args, &blkno); if (error) goto out; ASSERT(blkno == 0); error = xfs_attr3_leaf_create(args, blkno, &bp); if (error) goto out; memset((char *)&nargs, 0, sizeof(nargs)); nargs.dp = dp; nargs.geo = args->geo; nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; nargs.owner = args->owner; sfe = xfs_attr_sf_firstentry(sf); for (i = 0; i < sf->count; i++) { nargs.name = sfe->nameval; nargs.namelen = sfe->namelen; nargs.value = &sfe->nameval[nargs.namelen]; nargs.valuelen = sfe->valuelen; nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK; if (!xfs_attr_check_namespace(sfe->flags)) { xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto out; } xfs_attr_sethash(&nargs); error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == -ENOATTR); if (!xfs_attr3_leaf_add(bp, &nargs)) ASSERT(0); sfe = xfs_attr_sf_nextentry(sfe); } error = 0; out: kfree(tmpbuffer); return error; } /* * Check a leaf attribute block to see if all the entries would fit into * a shortform attribute list. */ int xfs_attr_shortform_allfit( struct xfs_buf *bp, struct xfs_inode *dp) { struct xfs_attr_leafblock *leaf; struct xfs_attr_leaf_entry *entry; xfs_attr_leaf_name_local_t *name_loc; struct xfs_attr3_icleaf_hdr leafhdr; int bytes; int i; struct xfs_mount *mp = bp->b_mount; leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); bytes = sizeof(struct xfs_attr_sf_hdr); for (i = 0; i < leafhdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!(entry->flags & XFS_ATTR_LOCAL)) return 0; name_loc = xfs_attr3_leaf_name_local(leaf, i); if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) return 0; if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return 0; bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } if (xfs_has_attr2(dp->i_mount) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; return xfs_attr_shortform_bytesfit(dp, bytes); } /* Verify the consistency of a raw inline attribute fork. */ xfs_failaddr_t xfs_attr_shortform_verify( struct xfs_attr_sf_hdr *sfp, size_t size) { struct xfs_attr_sf_entry *sfep = xfs_attr_sf_firstentry(sfp); struct xfs_attr_sf_entry *next_sfep; char *endp; int i; /* * Give up if the attribute is way too short. */ if (size < sizeof(struct xfs_attr_sf_hdr)) return __this_address; endp = (char *)sfp + size; /* Check all reported entries */ for (i = 0; i < sfp->count; i++) { /* * struct xfs_attr_sf_entry has a variable length. * Check the fixed-offset parts of the structure are * within the data buffer. * xfs_attr_sf_entry is defined with a 1-byte variable * array at the end, so we must subtract that off. */ if (((char *)sfep + sizeof(*sfep)) >= endp) return __this_address; /* Don't allow names with known bad length. */ if (sfep->namelen == 0) return __this_address; /* * Check that the variable-length part of the structure is * within the data buffer. The next entry starts after the * name component, so nextentry is an acceptable test. */ next_sfep = xfs_attr_sf_nextentry(sfep); if ((char *)next_sfep > endp) return __this_address; /* * Check for unknown flags. Short form doesn't support * the incomplete or local bits, so we can use the namespace * mask here. */ if (sfep->flags & ~XFS_ATTR_NSP_ONDISK_MASK) return __this_address; /* * Check for invalid namespace combinations. We only allow * one namespace flag per xattr, so we can just count the * bits (i.e. hweight) here. */ if (!xfs_attr_check_namespace(sfep->flags)) return __this_address; sfep = next_sfep; } if ((void *)sfep != (void *)endp) return __this_address; return NULL; } /* * Convert a leaf attribute list to shortform attribute list */ int xfs_attr3_leaf_to_shortform( struct xfs_buf *bp, struct xfs_da_args *args, int forkoff) { struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_local *name_loc; struct xfs_da_args nargs; struct xfs_inode *dp = args->dp; char *tmpbuffer; int error; int i; trace_xfs_attr_leaf_to_sf(args); tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); leaf = (xfs_attr_leafblock_t *)tmpbuffer; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); /* XXX (dgc): buffer is about to be marked stale - why zero it? */ memset(bp->b_addr, 0, args->geo->blksize); /* * Clean out the prior contents of the attribute list. */ error = xfs_da_shrink_inode(args, 0, bp); if (error) goto out; if (forkoff == -1) { /* * Don't remove the attr fork if this operation is the first * part of a attr replace operations. We're going to add a new * attr immediately, so we need to keep the attr fork around in * this case. */ if (!(args->op_flags & XFS_DA_OP_REPLACE)) { ASSERT(xfs_has_attr2(dp->i_mount)); ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); } goto out; } xfs_attr_shortform_create(args); /* * Copy the attributes */ memset((char *)&nargs, 0, sizeof(nargs)); nargs.geo = args->geo; nargs.dp = dp; nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; nargs.owner = args->owner; for (i = 0; i < ichdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!entry->nameidx) continue; ASSERT(entry->flags & XFS_ATTR_LOCAL); name_loc = xfs_attr3_leaf_name_local(leaf, i); nargs.name = name_loc->nameval; nargs.namelen = name_loc->namelen; nargs.value = &name_loc->nameval[nargs.namelen]; nargs.valuelen = be16_to_cpu(name_loc->valuelen); nargs.hashval = be32_to_cpu(entry->hashval); nargs.attr_filter = entry->flags & XFS_ATTR_NSP_ONDISK_MASK; xfs_attr_shortform_add(&nargs, forkoff); } error = 0; out: kvfree(tmpbuffer); return error; } /* * Convert from using a single leaf to a root node and a leaf. */ int xfs_attr3_leaf_to_node( struct xfs_da_args *args) { struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr icleafhdr; struct xfs_attr_leaf_entry *entries; struct xfs_da3_icnode_hdr icnodehdr; struct xfs_da_intnode *node; struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp1 = NULL; struct xfs_buf *bp2 = NULL; xfs_dablk_t blkno; int error; trace_xfs_attr_leaf_to_node(args); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { error = -EIO; goto out; } error = xfs_da_grow_inode(args, &blkno); if (error) goto out; error = xfs_attr3_leaf_read(args->trans, dp, args->owner, 0, &bp1); if (error) goto out; error = xfs_da_get_buf(args->trans, dp, blkno, &bp2, XFS_ATTR_FORK); if (error) goto out; /* * Copy leaf to new buffer and log it. */ xfs_da_buf_copy(bp2, bp1, args->geo->blksize); xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); /* * Set up the new root node. */ error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); if (error) goto out; node = bp1->b_addr; xfs_da3_node_hdr_from_disk(mp, &icnodehdr, node); leaf = bp2->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); /* both on-disk, don't endian-flip twice */ icnodehdr.btree[0].hashval = entries[icleafhdr.count - 1].hashval; icnodehdr.btree[0].before = cpu_to_be32(blkno); icnodehdr.count = 1; xfs_da3_node_hdr_to_disk(dp->i_mount, node, &icnodehdr); xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1); error = 0; out: return error; } /*======================================================================== * Routines used for growing the Btree. *========================================================================*/ /* * Create the initial contents of a leaf attribute list * or a leaf in a node attribute list. */ STATIC int xfs_attr3_leaf_create( struct xfs_da_args *args, xfs_dablk_t blkno, struct xfs_buf **bpp) { struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp; int error; trace_xfs_attr_leaf_create(args); error = xfs_da_get_buf(args->trans, args->dp, blkno, &bp, XFS_ATTR_FORK); if (error) return error; bp->b_ops = &xfs_attr3_leaf_buf_ops; xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF); leaf = bp->b_addr; memset(leaf, 0, args->geo->blksize); memset(&ichdr, 0, sizeof(ichdr)); ichdr.firstused = args->geo->blksize; if (xfs_has_crc(mp)) { struct xfs_da3_blkinfo *hdr3 = bp->b_addr; ichdr.magic = XFS_ATTR3_LEAF_MAGIC; hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); } else { ichdr.magic = XFS_ATTR_LEAF_MAGIC; ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr); } ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); *bpp = bp; return 0; } /* * Split the leaf node, rebalance, then add the new entry. * * Returns 0 if the entry was added, 1 if a further split is needed or a * negative error number otherwise. */ int xfs_attr3_leaf_split( struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk) { bool added; xfs_dablk_t blkno; int error; trace_xfs_attr_leaf_split(state->args); /* * Allocate space for a new leaf node. */ ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC); error = xfs_da_grow_inode(state->args, &blkno); if (error) return error; error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp); if (error) return error; newblk->blkno = blkno; newblk->magic = XFS_ATTR_LEAF_MAGIC; /* * Rebalance the entries across the two leaves. * NOTE: rebalance() currently depends on the 2nd block being empty. */ xfs_attr3_leaf_rebalance(state, oldblk, newblk); error = xfs_da3_blk_link(state, oldblk, newblk); if (error) return error; /* * Save info on "old" attribute for "atomic rename" ops, leaf_add() * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the * "new" attrs info. Will need the "old" info to remove it later. * * Insert the "new" entry in the correct block. */ if (state->inleaf) { trace_xfs_attr_leaf_add_old(state->args); added = xfs_attr3_leaf_add(oldblk->bp, state->args); } else { trace_xfs_attr_leaf_add_new(state->args); added = xfs_attr3_leaf_add(newblk->bp, state->args); } /* * Update last hashval in each block since we added the name. */ oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); if (!added) return 1; return 0; } /* * Add a name to the leaf attribute list structure. */ bool xfs_attr3_leaf_add( struct xfs_buf *bp, struct xfs_da_args *args) { struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; int tablesize; int entsize; bool added = true; int sum; int tmp; int i; trace_xfs_attr_leaf_add(args); leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index >= 0 && args->index <= ichdr.count); entsize = xfs_attr_leaf_newentsize(args, NULL); /* * Search through freemap for first-fit on new name length. * (may need to figure in size of entry struct too) */ tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t) + xfs_attr3_leaf_hdr_size(leaf); for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) { if (tablesize > ichdr.firstused) { sum += ichdr.freemap[i].size; continue; } if (!ichdr.freemap[i].size) continue; /* no space in this map */ tmp = entsize; if (ichdr.freemap[i].base < ichdr.firstused) tmp += sizeof(xfs_attr_leaf_entry_t); if (ichdr.freemap[i].size >= tmp) { xfs_attr3_leaf_add_work(bp, &ichdr, args, i); goto out_log_hdr; } sum += ichdr.freemap[i].size; } /* * If there are no holes in the address space of the block, * and we don't have enough freespace, then compaction will do us * no good and we should just give up. */ if (!ichdr.holes && sum < entsize) return false; /* * Compact the entries to coalesce free space. * This may change the hdr->count via dropping INCOMPLETE entries. */ xfs_attr3_leaf_compact(args, &ichdr, bp); /* * After compaction, the block is guaranteed to have only one * free region, in freemap[0]. If it is not big enough, give up. */ if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { added = false; goto out_log_hdr; } xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); out_log_hdr: xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); return added; } /* * Add a name to a leaf attribute list structure. */ STATIC void xfs_attr3_leaf_add_work( struct xfs_buf *bp, struct xfs_attr3_icleaf_hdr *ichdr, struct xfs_da_args *args, int mapindex) { struct xfs_attr_leafblock *leaf; struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_local *name_loc; struct xfs_attr_leaf_name_remote *name_rmt; struct xfs_mount *mp; int tmp; int i; trace_xfs_attr_leaf_add_work(args); leaf = bp->b_addr; ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE); ASSERT(args->index >= 0 && args->index <= ichdr->count); /* * Force open some space in the entry array and fill it in. */ entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; if (args->index < ichdr->count) { tmp = ichdr->count - args->index; tmp *= sizeof(xfs_attr_leaf_entry_t); memmove(entry + 1, entry, tmp); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); } ichdr->count++; /* * Allocate space for the new string (at the end of the run). */ mp = args->trans->t_mountp; ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize); ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0); ASSERT(ichdr->freemap[mapindex].size >= xfs_attr_leaf_newentsize(args, NULL)); ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize); ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0); ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp); entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + ichdr->freemap[mapindex].size); entry->hashval = cpu_to_be32(args->hashval); entry->flags = args->attr_filter; if (tmp) entry->flags |= XFS_ATTR_LOCAL; if (args->op_flags & XFS_DA_OP_REPLACE) { if (!(args->op_flags & XFS_DA_OP_LOGGED)) entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && (args->index2 <= args->index)) { args->index2++; } } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); ASSERT((args->index == 0) || (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); ASSERT((args->index == ichdr->count - 1) || (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); /* * For "remote" attribute values, simply note that we need to * allocate space for the "remote" value. We can't actually * allocate the extents in this transaction, and we can't decide * which blocks they should be as we might allocate more blocks * as part of this transaction (a split operation for example). */ if (entry->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, args->index); name_loc->namelen = args->namelen; name_loc->valuelen = cpu_to_be16(args->valuelen); memcpy((char *)name_loc->nameval, args->name, args->namelen); memcpy((char *)&name_loc->nameval[args->namelen], args->value, be16_to_cpu(name_loc->valuelen)); } else { name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->namelen = args->namelen; memcpy((char *)name_rmt->name, args->name, args->namelen); entry->flags |= XFS_ATTR_INCOMPLETE; /* just in case */ name_rmt->valuelen = 0; name_rmt->valueblk = 0; args->rmtblkno = 1; args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); args->rmtvaluelen = args->valuelen; } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), xfs_attr_leaf_entsize(leaf, args->index))); /* * Update the control info for this leaf node */ if (be16_to_cpu(entry->nameidx) < ichdr->firstused) ichdr->firstused = be16_to_cpu(entry->nameidx); ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t) + xfs_attr3_leaf_hdr_size(leaf)); tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t) + xfs_attr3_leaf_hdr_size(leaf); for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { if (ichdr->freemap[i].base == tmp) { ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t); ichdr->freemap[i].size -= min_t(uint16_t, ichdr->freemap[i].size, sizeof(xfs_attr_leaf_entry_t)); } } ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); } /* * Garbage collect a leaf attribute list block by copying it to a new buffer. */ STATIC void xfs_attr3_leaf_compact( struct xfs_da_args *args, struct xfs_attr3_icleaf_hdr *ichdr_dst, struct xfs_buf *bp) { struct xfs_attr_leafblock *leaf_src; struct xfs_attr_leafblock *leaf_dst; struct xfs_attr3_icleaf_hdr ichdr_src; struct xfs_trans *trans = args->trans; char *tmpbuffer; trace_xfs_attr_leaf_compact(args); tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); memset(bp->b_addr, 0, args->geo->blksize); leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; leaf_dst = bp->b_addr; /* * Copy the on-disk header back into the destination buffer to ensure * all the information in the header that is not part of the incore * header structure is preserved. */ memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src)); /* Initialise the incore headers */ ichdr_src = *ichdr_dst; /* struct copy */ ichdr_dst->firstused = args->geo->blksize; ichdr_dst->usedbytes = 0; ichdr_dst->count = 0; ichdr_dst->holes = 0; ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src); ichdr_dst->freemap[0].size = ichdr_dst->firstused - ichdr_dst->freemap[0].base; /* write the header back to initialise the underlying buffer */ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. */ xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0, ichdr_src.count); /* * this logs the entire buffer, but the caller must write the header * back to the buffer when it is finished modifying it. */ xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); kvfree(tmpbuffer); } /* * Compare two leaf blocks "order". * Return 0 unless leaf2 should go before leaf1. */ static int xfs_attr3_leaf_order( struct xfs_buf *leaf1_bp, struct xfs_attr3_icleaf_hdr *leaf1hdr, struct xfs_buf *leaf2_bp, struct xfs_attr3_icleaf_hdr *leaf2hdr) { struct xfs_attr_leaf_entry *entries1; struct xfs_attr_leaf_entry *entries2; entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr); entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr); if (leaf1hdr->count > 0 && leaf2hdr->count > 0 && ((be32_to_cpu(entries2[0].hashval) < be32_to_cpu(entries1[0].hashval)) || (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) < be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) { return 1; } return 0; } int xfs_attr_leaf_order( struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp) { struct xfs_attr3_icleaf_hdr ichdr1; struct xfs_attr3_icleaf_hdr ichdr2; struct xfs_mount *mp = leaf1_bp->b_mount; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr); xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr); return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); } /* * Redistribute the attribute list entries between two leaf nodes, * taking into account the size of the new entry. * * NOTE: if new block is empty, then it will get the upper half of the * old block. At present, all (one) callers pass in an empty second block. * * This code adjusts the args->index/blkno and args->index2/blkno2 fields * to match what it is doing in splitting the attribute leaf block. Those * values are used in "atomic rename" operations on attributes. Note that * the "new" and "old" values can end up in different blocks. */ STATIC void xfs_attr3_leaf_rebalance( struct xfs_da_state *state, struct xfs_da_state_blk *blk1, struct xfs_da_state_blk *blk2) { struct xfs_da_args *args; struct xfs_attr_leafblock *leaf1; struct xfs_attr_leafblock *leaf2; struct xfs_attr3_icleaf_hdr ichdr1; struct xfs_attr3_icleaf_hdr ichdr2; struct xfs_attr_leaf_entry *entries1; struct xfs_attr_leaf_entry *entries2; int count; int totallen; int max; int space; int swap; /* * Set up environment. */ ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC); ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1); xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2); ASSERT(ichdr2.count == 0); args = state->args; trace_xfs_attr_leaf_rebalance(args); /* * Check ordering of blocks, reverse if it makes things simpler. * * NOTE: Given that all (current) callers pass in an empty * second block, this code should never set "swap". */ swap = 0; if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) { swap(blk1, blk2); /* swap structures rather than reconverting them */ swap(ichdr1, ichdr2); leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; swap = 1; } /* * Examine entries until we reduce the absolute difference in * byte usage between the two blocks to a minimum. Then get * the direction to copy and the number of elements to move. * * "inleaf" is true if the new entry should be inserted into blk1. * If "swap" is also true, then reverse the sense of "inleaf". */ state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1, blk2, &ichdr2, &count, &totallen); if (swap) state->inleaf = !state->inleaf; /* * Move any entries required from leaf to leaf: */ if (count < ichdr1.count) { /* * Figure the total bytes to be added to the destination leaf. */ /* number entries being moved */ count = ichdr1.count - count; space = ichdr1.usedbytes - totallen; space += count * sizeof(xfs_attr_leaf_entry_t); /* * leaf2 is the destination, compact it if it looks tight. */ max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1); max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t); if (space > max) xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp); /* * Move high entries from leaf1 to low end of leaf2. */ xfs_attr3_leaf_moveents(args, leaf1, &ichdr1, ichdr1.count - count, leaf2, &ichdr2, 0, count); } else if (count > ichdr1.count) { /* * I assert that since all callers pass in an empty * second buffer, this code should never execute. */ ASSERT(0); /* * Figure the total bytes to be added to the destination leaf. */ /* number entries being moved */ count -= ichdr1.count; space = totallen - ichdr1.usedbytes; space += count * sizeof(xfs_attr_leaf_entry_t); /* * leaf1 is the destination, compact it if it looks tight. */ max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1); max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t); if (space > max) xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp); /* * Move low entries from leaf2 to high end of leaf1. */ xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1, ichdr1.count, count); } xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1); xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2); xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); /* * Copy out last hashval in each block for B-tree code. */ entries1 = xfs_attr3_leaf_entryp(leaf1); entries2 = xfs_attr3_leaf_entryp(leaf2); blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval); blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval); /* * Adjust the expected index for insertion. * NOTE: this code depends on the (current) situation that the * second block was originally empty. * * If the insertion point moved to the 2nd block, we must adjust * the index. We must also track the entry just following the * new entry for use in an "atomic rename" operation, that entry * is always the "old" entry and the "new" entry is what we are * inserting. The index/blkno fields refer to the "old" entry, * while the index2/blkno2 fields refer to the "new" entry. */ if (blk1->index > ichdr1.count) { ASSERT(state->inleaf == 0); blk2->index = blk1->index - ichdr1.count; args->index = args->index2 = blk2->index; args->blkno = args->blkno2 = blk2->blkno; } else if (blk1->index == ichdr1.count) { if (state->inleaf) { args->index = blk1->index; args->blkno = blk1->blkno; args->index2 = 0; args->blkno2 = blk2->blkno; } else { /* * On a double leaf split, the original attr location * is already stored in blkno2/index2, so don't * overwrite it overwise we corrupt the tree. */ blk2->index = blk1->index - ichdr1.count; args->index = blk2->index; args->blkno = blk2->blkno; if (!state->extravalid) { /* * set the new attr location to match the old * one and let the higher level split code * decide where in the leaf to place it. */ args->index2 = blk2->index; args->blkno2 = blk2->blkno; } } } else { ASSERT(state->inleaf == 1); args->index = args->index2 = blk1->index; args->blkno = args->blkno2 = blk1->blkno; } } /* * Examine entries until we reduce the absolute difference in * byte usage between the two blocks to a minimum. * GROT: Is this really necessary? With other than a 512 byte blocksize, * GROT: there will always be enough room in either block for a new entry. * GROT: Do a double-split for this case? */ STATIC int xfs_attr3_leaf_figure_balance( struct xfs_da_state *state, struct xfs_da_state_blk *blk1, struct xfs_attr3_icleaf_hdr *ichdr1, struct xfs_da_state_blk *blk2, struct xfs_attr3_icleaf_hdr *ichdr2, int *countarg, int *usedbytesarg) { struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr; struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr; struct xfs_attr_leaf_entry *entry; int count; int max; int index; int totallen = 0; int half; int lastdelta; int foundit = 0; int tmp; /* * Examine entries until we reduce the absolute difference in * byte usage between the two blocks to a minimum. */ max = ichdr1->count + ichdr2->count; half = (max + 1) * sizeof(*entry); half += ichdr1->usedbytes + ichdr2->usedbytes + xfs_attr_leaf_newentsize(state->args, NULL); half /= 2; lastdelta = state->args->geo->blksize; entry = xfs_attr3_leaf_entryp(leaf1); for (count = index = 0; count < max; entry++, index++, count++) { #define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A)) /* * The new entry is in the first block, account for it. */ if (count == blk1->index) { tmp = totallen + sizeof(*entry) + xfs_attr_leaf_newentsize(state->args, NULL); if (XFS_ATTR_ABS(half - tmp) > lastdelta) break; lastdelta = XFS_ATTR_ABS(half - tmp); totallen = tmp; foundit = 1; } /* * Wrap around into the second block if necessary. */ if (count == ichdr1->count) { leaf1 = leaf2; entry = xfs_attr3_leaf_entryp(leaf1); index = 0; } /* * Figure out if next leaf entry would be too much. */ tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1, index); if (XFS_ATTR_ABS(half - tmp) > lastdelta) break; lastdelta = XFS_ATTR_ABS(half - tmp); totallen = tmp; #undef XFS_ATTR_ABS } /* * Calculate the number of usedbytes that will end up in lower block. * If new entry not in lower block, fix up the count. */ totallen -= count * sizeof(*entry); if (foundit) { totallen -= sizeof(*entry) + xfs_attr_leaf_newentsize(state->args, NULL); } *countarg = count; *usedbytesarg = totallen; return foundit; } /*======================================================================== * Routines used for shrinking the Btree. *========================================================================*/ /* * Check a leaf block and its neighbors to see if the block should be * collapsed into one or the other neighbor. Always keep the block * with the smaller block number. * If the current block is over 50% full, don't try to join it, return 0. * If the block is empty, fill in the state structure and return 2. * If it can be collapsed, fill in the state structure and return 1. * If nothing can be done, return 0. * * GROT: allow for INCOMPLETE entries in calculation. */ int xfs_attr3_leaf_toosmall( struct xfs_da_state *state, int *action) { struct xfs_attr_leafblock *leaf; struct xfs_da_state_blk *blk; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_buf *bp; xfs_dablk_t blkno; int bytes; int forward; int error; int retval; int i; trace_xfs_attr_leaf_toosmall(state->args); /* * Check for the degenerate case of the block being over 50% full. * If so, it's not worth even looking to see if we might be able * to coalesce with a sibling. */ blk = &state->path.blk[ state->path.active-1 ]; leaf = blk->bp->b_addr; xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf); bytes = xfs_attr3_leaf_hdr_size(leaf) + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + ichdr.usedbytes; if (bytes > (state->args->geo->blksize >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return 0; } /* * Check for the degenerate case of the block being empty. * If the block is empty, we'll simply delete it, no need to * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ if (ichdr.count == 0) { /* * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ forward = (ichdr.forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); if (error) return error; if (retval) { *action = 0; } else { *action = 2; } return 0; } /* * Examine each sibling block to see if we can coalesce with * at least 25% free space to spare. We need to figure out * whether to merge with the forward or the backward block. * We prefer coalescing with the lower numbered sibling so as * to shrink an attribute list over time. */ /* start with smaller blk num */ forward = ichdr.forw < ichdr.back; for (i = 0; i < 2; forward = !forward, i++) { struct xfs_attr3_icleaf_hdr ichdr2; if (forward) blkno = ichdr.forw; else blkno = ichdr.back; if (blkno == 0) continue; error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, state->args->owner, blkno, &bp); if (error) return error; xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr); bytes = state->args->geo->blksize - (state->args->geo->blksize >> 2) - ichdr.usedbytes - ichdr2.usedbytes - ((ichdr.count + ichdr2.count) * sizeof(xfs_attr_leaf_entry_t)) - xfs_attr3_leaf_hdr_size(leaf); xfs_trans_brelse(state->args->trans, bp); if (bytes >= 0) break; /* fits with at least 25% to spare */ } if (i >= 2) { *action = 0; return 0; } /* * Make altpath point to the block we want to keep (the lower * numbered block) and path point to the block we want to drop. */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) { error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); } else { error = xfs_da3_path_shift(state, &state->path, forward, 0, &retval); } if (error) return error; if (retval) { *action = 0; } else { *action = 1; } return 0; } /* * Remove a name from the leaf attribute list structure. * * Return 1 if leaf is less than 37% full, 0 if >= 37% full. * If two leaves are 37% full, when combined they will leave 25% free. */ int xfs_attr3_leaf_remove( struct xfs_buf *bp, struct xfs_da_args *args) { struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entry; int before; int after; int smallest; int entsize; int tablesize; int tmp; int i; trace_xfs_attr_leaf_remove(args); leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8); ASSERT(args->index >= 0 && args->index < ichdr.count); ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) + xfs_attr3_leaf_hdr_size(leaf)); entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize); /* * Scan through free region table: * check for adjacency of free'd entry with an existing one, * find smallest free region in case we need to replace it, * adjust any map that borders the entry table, */ tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t) + xfs_attr3_leaf_hdr_size(leaf); tmp = ichdr.freemap[0].size; before = after = -1; smallest = XFS_ATTR_LEAF_MAPSIZE - 1; entsize = xfs_attr_leaf_entsize(leaf, args->index); for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { ASSERT(ichdr.freemap[i].base < args->geo->blksize); ASSERT(ichdr.freemap[i].size < args->geo->blksize); if (ichdr.freemap[i].base == tablesize) { ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t); ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t); } if (ichdr.freemap[i].base + ichdr.freemap[i].size == be16_to_cpu(entry->nameidx)) { before = i; } else if (ichdr.freemap[i].base == (be16_to_cpu(entry->nameidx) + entsize)) { after = i; } else if (ichdr.freemap[i].size < tmp) { tmp = ichdr.freemap[i].size; smallest = i; } } /* * Coalesce adjacent freemap regions, * or replace the smallest region. */ if ((before >= 0) || (after >= 0)) { if ((before >= 0) && (after >= 0)) { ichdr.freemap[before].size += entsize; ichdr.freemap[before].size += ichdr.freemap[after].size; ichdr.freemap[after].base = 0; ichdr.freemap[after].size = 0; } else if (before >= 0) { ichdr.freemap[before].size += entsize; } else { ichdr.freemap[after].base = be16_to_cpu(entry->nameidx); ichdr.freemap[after].size += entsize; } } else { /* * Replace smallest region (if it is smaller than free'd entry) */ if (ichdr.freemap[smallest].size < entsize) { ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx); ichdr.freemap[smallest].size = entsize; } } /* * Did we remove the first entry? */ if (be16_to_cpu(entry->nameidx) == ichdr.firstused) smallest = 1; else smallest = 0; /* * Compress the remaining entries and zero out the removed stuff. */ memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize); ichdr.usedbytes -= entsize; xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), entsize)); tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t); memmove(entry, entry + 1, tmp); ichdr.count--; xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t))); entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count]; memset(entry, 0, sizeof(xfs_attr_leaf_entry_t)); /* * If we removed the first entry, re-find the first used byte * in the name area. Note that if the entry was the "firstused", * then we don't have a "hole" in our block resulting from * removing the name. */ if (smallest) { tmp = args->geo->blksize; entry = xfs_attr3_leaf_entryp(leaf); for (i = ichdr.count - 1; i >= 0; entry++, i--) { ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize); if (be16_to_cpu(entry->nameidx) < tmp) tmp = be16_to_cpu(entry->nameidx); } ichdr.firstused = tmp; ASSERT(ichdr.firstused != 0); } else { ichdr.holes = 1; /* mark as needing compaction */ } xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); /* * Check if leaf is less than 50% full, caller may want to * "join" the leaf with a sibling if so. */ tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) + ichdr.count * sizeof(xfs_attr_leaf_entry_t); return tmp < args->geo->magicpct; /* leaf is < 37% full */ } /* * Move all the attribute list entries from drop_leaf into save_leaf. */ void xfs_attr3_leaf_unbalance( struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk) { struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr; struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr; struct xfs_attr3_icleaf_hdr drophdr; struct xfs_attr3_icleaf_hdr savehdr; struct xfs_attr_leaf_entry *entry; trace_xfs_attr_leaf_unbalance(state->args); xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf); xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf); entry = xfs_attr3_leaf_entryp(drop_leaf); /* * Save last hashval from dying block for later Btree fixup. */ drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval); /* * Check if we need a temp buffer, or can we do it in place. * Note that we don't check "leaf" for holes because we will * always be dropping it, toosmall() decided that for us already. */ if (savehdr.holes == 0) { /* * dest leaf has no holes, so we add there. May need * to make some room in the entry array. */ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { xfs_attr3_leaf_moveents(state->args, drop_leaf, &drophdr, 0, save_leaf, &savehdr, 0, drophdr.count); } else { xfs_attr3_leaf_moveents(state->args, drop_leaf, &drophdr, 0, save_leaf, &savehdr, savehdr.count, drophdr.count); } } else { /* * Destination has holes, so we make a temporary copy * of the leaf and add them both to that. */ struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; tmp_leaf = kvzalloc(state->args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); /* * Copy the header into the temp leaf so that all the stuff * not in the incore header is present and gets copied back in * once we've moved all the entries. */ memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf)); memset(&tmphdr, 0, sizeof(tmphdr)); tmphdr.magic = savehdr.magic; tmphdr.forw = savehdr.forw; tmphdr.back = savehdr.back; tmphdr.firstused = state->args->geo->blksize; /* write the header to the temp buffer to initialise it */ xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr); if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { xfs_attr3_leaf_moveents(state->args, drop_leaf, &drophdr, 0, tmp_leaf, &tmphdr, 0, drophdr.count); xfs_attr3_leaf_moveents(state->args, save_leaf, &savehdr, 0, tmp_leaf, &tmphdr, tmphdr.count, savehdr.count); } else { xfs_attr3_leaf_moveents(state->args, save_leaf, &savehdr, 0, tmp_leaf, &tmphdr, 0, savehdr.count); xfs_attr3_leaf_moveents(state->args, drop_leaf, &drophdr, 0, tmp_leaf, &tmphdr, tmphdr.count, drophdr.count); } memcpy(save_leaf, tmp_leaf, state->args->geo->blksize); savehdr = tmphdr; /* struct copy */ kvfree(tmp_leaf); } xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr); xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, state->args->geo->blksize - 1); /* * Copy out last hashval in each block for B-tree code. */ entry = xfs_attr3_leaf_entryp(save_leaf); save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval); } /*======================================================================== * Routines used for finding things in the Btree. *========================================================================*/ /* * Look up a name in a leaf attribute list structure. * This is the internal routine, it uses the caller's buffer. * * Note that duplicate keys are allowed, but only check within the * current leaf node. The Btree code must check in adjacent leaf nodes. * * Return in args->index the index into the entry[] array of either * the found entry, or where the entry should have been (insert before * that entry). * * Don't change the args->value unless we find the attribute. */ int xfs_attr3_leaf_lookup_int( struct xfs_buf *bp, struct xfs_da_args *args) { struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_entry *entries; struct xfs_attr_leaf_name_local *name_loc; struct xfs_attr_leaf_name_remote *name_rmt; xfs_dahash_t hashval; int probe; int span; trace_xfs_attr_leaf_lookup(args); leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); if (ichdr.count >= args->geo->blksize / 8) { xfs_buf_mark_corrupt(bp); xfs_da_mark_sick(args); return -EFSCORRUPTED; } /* * Binary search. (note: small blocks will skip this loop) */ hashval = args->hashval; probe = span = ichdr.count / 2; for (entry = &entries[probe]; span > 4; entry = &entries[probe]) { span /= 2; if (be32_to_cpu(entry->hashval) < hashval) probe += span; else if (be32_to_cpu(entry->hashval) > hashval) probe -= span; else break; } if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) { xfs_buf_mark_corrupt(bp); xfs_da_mark_sick(args); return -EFSCORRUPTED; } if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) { xfs_buf_mark_corrupt(bp); xfs_da_mark_sick(args); return -EFSCORRUPTED; } /* * Since we may have duplicate hashval's, find the first matching * hashval in the leaf. */ while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) { entry--; probe--; } while (probe < ichdr.count && be32_to_cpu(entry->hashval) < hashval) { entry++; probe++; } if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) { args->index = probe; return -ENOATTR; } /* * Duplicate keys may be present, so search all of them for a match. */ for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval); entry++, probe++) { /* * GROT: Add code to remove incomplete entries. */ if (entry->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, probe); if (!xfs_attr_match(args, entry->flags, name_loc->nameval, name_loc->namelen, &name_loc->nameval[name_loc->namelen], be16_to_cpu(name_loc->valuelen))) continue; args->index = probe; return -EEXIST; } else { unsigned int valuelen; name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); valuelen = be32_to_cpu(name_rmt->valuelen); if (!xfs_attr_match(args, entry->flags, name_rmt->name, name_rmt->namelen, NULL, valuelen)) continue; args->index = probe; args->rmtvaluelen = valuelen; args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks( args->dp->i_mount, args->rmtvaluelen); return -EEXIST; } } args->index = probe; return -ENOATTR; } /* * Get the value associated with an attribute name from a leaf attribute * list structure. * * If args->valuelen is zero, only the length needs to be returned. Unlike a * lookup, we only return an error if the attribute does not exist or we can't * retrieve the value. */ int xfs_attr3_leaf_getvalue( struct xfs_buf *bp, struct xfs_da_args *args) { struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_local *name_loc; struct xfs_attr_leaf_name_remote *name_rmt; leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(ichdr.count < args->geo->blksize / 8); ASSERT(args->index < ichdr.count); entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; if (entry->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, args->index); ASSERT(name_loc->namelen == args->namelen); ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); return xfs_attr_copy_value(args, &name_loc->nameval[args->namelen], be16_to_cpu(name_loc->valuelen)); } name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, args->rmtvaluelen); return xfs_attr_copy_value(args, NULL, args->rmtvaluelen); } /*======================================================================== * Utility routines. *========================================================================*/ /* * Move the indicated entries from one leaf to another. * NOTE: this routine modifies both source and destination leaves. */ /*ARGSUSED*/ STATIC void xfs_attr3_leaf_moveents( struct xfs_da_args *args, struct xfs_attr_leafblock *leaf_s, struct xfs_attr3_icleaf_hdr *ichdr_s, int start_s, struct xfs_attr_leafblock *leaf_d, struct xfs_attr3_icleaf_hdr *ichdr_d, int start_d, int count) { struct xfs_attr_leaf_entry *entry_s; struct xfs_attr_leaf_entry *entry_d; int desti; int tmp; int i; /* * Check for nothing to do. */ if (count == 0) return; /* * Set up environment. */ ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC || ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC); ASSERT(ichdr_s->magic == ichdr_d->magic); ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8); ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s)) + xfs_attr3_leaf_hdr_size(leaf_s)); ASSERT(ichdr_d->count < args->geo->blksize / 8); ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d)) + xfs_attr3_leaf_hdr_size(leaf_d)); ASSERT(start_s < ichdr_s->count); ASSERT(start_d <= ichdr_d->count); ASSERT(count <= ichdr_s->count); /* * Move the entries in the destination leaf up to make a hole? */ if (start_d < ichdr_d->count) { tmp = ichdr_d->count - start_d; tmp *= sizeof(xfs_attr_leaf_entry_t); entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count]; memmove(entry_d, entry_s, tmp); } /* * Copy all entry's in the same (sorted) order, * but allocate attribute info packed and in sequence. */ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; desti = start_d; for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) { ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused); tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i); #ifdef GROT /* * Code to drop INCOMPLETE entries. Difficult to use as we * may also need to change the insertion index. Code turned * off for 6.2, should be revisited later. */ if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */ memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); ichdr_s->usedbytes -= tmp; ichdr_s->count -= 1; entry_d--; /* to compensate for ++ in loop hdr */ desti--; if ((start_s + i) < offset) result++; /* insertion index adjustment */ } else { #endif /* GROT */ ichdr_d->firstused -= tmp; /* both on-disk, don't endian flip twice */ entry_d->hashval = entry_s->hashval; entry_d->nameidx = cpu_to_be16(ichdr_d->firstused); entry_d->flags = entry_s->flags; ASSERT(be16_to_cpu(entry_d->nameidx) + tmp <= args->geo->blksize); memmove(xfs_attr3_leaf_name(leaf_d, desti), xfs_attr3_leaf_name(leaf_s, start_s + i), tmp); ASSERT(be16_to_cpu(entry_s->nameidx) + tmp <= args->geo->blksize); memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); ichdr_s->usedbytes -= tmp; ichdr_d->usedbytes += tmp; ichdr_s->count -= 1; ichdr_d->count += 1; tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t) + xfs_attr3_leaf_hdr_size(leaf_d); ASSERT(ichdr_d->firstused >= tmp); #ifdef GROT } #endif /* GROT */ } /* * Zero out the entries we just copied. */ if (start_s == ichdr_s->count) { tmp = count * sizeof(xfs_attr_leaf_entry_t); entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; ASSERT(((char *)entry_s + tmp) <= ((char *)leaf_s + args->geo->blksize)); memset(entry_s, 0, tmp); } else { /* * Move the remaining entries down to fill the hole, * then zero the entries at the top. */ tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t); entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count]; entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; memmove(entry_d, entry_s, tmp); tmp = count * sizeof(xfs_attr_leaf_entry_t); entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count]; ASSERT(((char *)entry_s + tmp) <= ((char *)leaf_s + args->geo->blksize)); memset(entry_s, 0, tmp); } /* * Fill in the freemap information */ ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d); ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t); ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; ichdr_d->freemap[1].base = 0; ichdr_d->freemap[2].base = 0; ichdr_d->freemap[1].size = 0; ichdr_d->freemap[2].size = 0; ichdr_s->holes = 1; /* leaf may not be compact */ } /* * Pick up the last hashvalue from a leaf block. */ xfs_dahash_t xfs_attr_leaf_lasthash( struct xfs_buf *bp, int *count) { struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; struct xfs_mount *mp = bp->b_mount; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr); entries = xfs_attr3_leaf_entryp(bp->b_addr); if (count) *count = ichdr.count; if (!ichdr.count) return 0; return be32_to_cpu(entries[ichdr.count - 1].hashval); } /* * Calculate the number of bytes used to store the indicated attribute * (whether local or remote only calculate bytes in this block). */ STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) { struct xfs_attr_leaf_entry *entries; xfs_attr_leaf_name_local_t *name_loc; xfs_attr_leaf_name_remote_t *name_rmt; int size; entries = xfs_attr3_leaf_entryp(leaf); if (entries[index].flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, index); size = xfs_attr_leaf_entsize_local(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } else { name_rmt = xfs_attr3_leaf_name_remote(leaf, index); size = xfs_attr_leaf_entsize_remote(name_rmt->namelen); } return size; } /* * Calculate the number of bytes that would be required to store the new * attribute (whether local or remote only calculate bytes in this block). * This routine decides as a side effect whether the attribute will be * a "local" or a "remote" attribute. */ int xfs_attr_leaf_newentsize( struct xfs_da_args *args, int *local) { int size; size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen); if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) { if (local) *local = 1; return size; } if (local) *local = 0; return xfs_attr_leaf_entsize_remote(args->namelen); } /*======================================================================== * Manage the INCOMPLETE flag in a leaf entry *========================================================================*/ /* * Clear the INCOMPLETE flag on an entry in a leaf block. */ int xfs_attr3_leaf_clearflag( struct xfs_da_args *args) { struct xfs_attr_leafblock *leaf; struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_remote *name_rmt; struct xfs_buf *bp; int error; #ifdef DEBUG struct xfs_attr3_icleaf_hdr ichdr; xfs_attr_leaf_name_local_t *name_loc; int namelen; char *name; #endif /* DEBUG */ trace_xfs_attr_leaf_clearflag(args); /* * Set up the operation. */ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, args->blkno, &bp); if (error) return error; leaf = bp->b_addr; entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); #ifdef DEBUG xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); if (entry->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, args->index); namelen = name_loc->namelen; name = (char *)name_loc->nameval; } else { name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); namelen = name_rmt->namelen; name = (char *)name_rmt->name; } ASSERT(be32_to_cpu(entry->hashval) == args->hashval); ASSERT(namelen == args->namelen); ASSERT(memcmp(name, args->name, namelen) == 0); #endif /* DEBUG */ entry->flags &= ~XFS_ATTR_INCOMPLETE; xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if (args->rmtblkno) { ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } return 0; } /* * Set the INCOMPLETE flag on an entry in a leaf block. */ int xfs_attr3_leaf_setflag( struct xfs_da_args *args) { struct xfs_attr_leafblock *leaf; struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_remote *name_rmt; struct xfs_buf *bp; int error; #ifdef DEBUG struct xfs_attr3_icleaf_hdr ichdr; #endif trace_xfs_attr_leaf_setflag(args); /* * Set up the operation. */ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, args->blkno, &bp); if (error) return error; leaf = bp->b_addr; #ifdef DEBUG xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); #endif entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); entry->flags |= XFS_ATTR_INCOMPLETE; xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if ((entry->flags & XFS_ATTR_LOCAL) == 0) { name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = 0; name_rmt->valuelen = 0; xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } return 0; } /* * In a single transaction, clear the INCOMPLETE flag on the leaf entry * given by args->blkno/index and set the INCOMPLETE flag on the leaf * entry given by args->blkno2/index2. * * Note that they could be in different blocks, or in the same block. */ int xfs_attr3_leaf_flipflags( struct xfs_da_args *args) { struct xfs_attr_leafblock *leaf1; struct xfs_attr_leafblock *leaf2; struct xfs_attr_leaf_entry *entry1; struct xfs_attr_leaf_entry *entry2; struct xfs_attr_leaf_name_remote *name_rmt; struct xfs_buf *bp1; struct xfs_buf *bp2; int error; #ifdef DEBUG struct xfs_attr3_icleaf_hdr ichdr1; struct xfs_attr3_icleaf_hdr ichdr2; xfs_attr_leaf_name_local_t *name_loc; int namelen1, namelen2; char *name1, *name2; #endif /* DEBUG */ trace_xfs_attr_leaf_flipflags(args); /* * Read the block containing the "old" attr */ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, args->blkno, &bp1); if (error) return error; /* * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, args->blkno2, &bp2); if (error) return error; } else { bp2 = bp1; } leaf1 = bp1->b_addr; entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index]; leaf2 = bp2->b_addr; entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; #ifdef DEBUG xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1); ASSERT(args->index < ichdr1.count); ASSERT(args->index >= 0); xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2); ASSERT(args->index2 < ichdr2.count); ASSERT(args->index2 >= 0); if (entry1->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf1, args->index); namelen1 = name_loc->namelen; name1 = (char *)name_loc->nameval; } else { name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); namelen1 = name_rmt->namelen; name1 = (char *)name_rmt->name; } if (entry2->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2); namelen2 = name_loc->namelen; name2 = (char *)name_loc->nameval; } else { name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); namelen2 = name_rmt->namelen; name2 = (char *)name_rmt->name; } ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval)); ASSERT(namelen1 == namelen2); ASSERT(memcmp(name1, name2, namelen1) == 0); #endif /* DEBUG */ ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE); ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0); entry1->flags &= ~XFS_ATTR_INCOMPLETE; xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); if (args->rmtblkno) { ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } entry2->flags |= XFS_ATTR_INCOMPLETE; xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); name_rmt->valueblk = 0; name_rmt->valuelen = 0; xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); } return 0; }
115 5 111 115 82 77 4 4 1 41 41 41 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_proto_tcp.c: TCP load balancing support for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * * Network name space (netns) aware. * Global data moved to netns i.e struct netns_ipvs * tcp_timeouts table has copy per netns in a hash table per * protocol ip_vs_proto_data and is handled by netns */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/kernel.h> #include <linux/ip.h> #include <linux/tcp.h> /* for tcphdr */ #include <net/ip.h> #include <net/tcp.h> /* for csum_tcpudp_magic */ #include <net/ip6_checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/indirect_call_wrapper.h> #include <net/ip_vs.h> static int tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); static int tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { struct ip_vs_service *svc; struct tcphdr _tcph, *th; __be16 _ports[2], *ports = NULL; /* In the event of icmp, we're only guaranteed to have the first 8 * bytes of the transport header, so we only check the rest of the * TCP packet for non-ICMP packets */ if (likely(!ip_vs_iph_icmp(iph))) { th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); if (th) { if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn)) return 1; ports = &th->source; } } else { ports = skb_header_pointer( skb, iph->len, sizeof(_ports), &_ports); } if (!ports) { *verdict = NF_DROP; return 0; } /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->daddr, ports[1]); else svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->saddr, ports[0]); if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ *verdict = NF_DROP; return 0; } /* * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; return 0; } } /* NF_ACCEPT */ return 1; } static inline void tcp_fast_csum_update(int af, struct tcphdr *tcph, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldport, __be16 newport) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldport, newport, ~csum_unfold(tcph->check)))); else #endif tcph->check = csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldport, newport, ~csum_unfold(tcph->check)))); } static inline void tcp_partial_csum_update(int af, struct tcphdr *tcph, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldlen, __be16 newlen) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, csum_unfold(tcph->check)))); else #endif tcph->check = ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, csum_unfold(tcph->check)))); } INDIRECT_CALLABLE_SCOPE int tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; unsigned int tcphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!tcp_csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - tcphoff; else payload_csum = true; } tcph = (void *)skb_network_header(skb) + tcphoff; tcph->source = cp->vport; /* Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - tcphoff)); } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) tcph->check = csum_ipv6_magic(&cp->vaddr.in6, &cp->caddr.in6, skb->len - tcphoff, cp->protocol, skb->csum); else #endif tcph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, tcph->check, (char*)&(tcph->check) - (char*)tcph); } return 1; } static int tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; unsigned int tcphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!tcp_csum_check(cp->af, skb, pp)) return 0; /* * Attempt ip_vs_app call. * It will fix ip_vs_conn and iph ack_seq stuff */ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - tcphoff; else payload_csum = true; } tcph = (void *)skb_network_header(skb) + tcphoff; tcph->dest = cp->dport; /* * Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - tcphoff)); } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) tcph->check = csum_ipv6_magic(&cp->caddr.in6, &cp->daddr.in6, skb->len - tcphoff, cp->protocol, skb->csum); else #endif tcph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { unsigned int tcphoff; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcphoff = sizeof(struct ipv6hdr); else #endif tcphoff = ip_hdrlen(skb); switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); fallthrough; case CHECKSUM_COMPLETE: #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - tcphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } } else #endif if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - tcphoff, ip_hdr(skb)->protocol, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } break; default: /* No need to checksum. */ break; } return 1; } #define TCP_DIR_INPUT 0 #define TCP_DIR_OUTPUT 4 #define TCP_DIR_INPUT_ONLY 8 static const int tcp_state_off[IP_VS_DIR_LAST] = { [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, }; /* * Timeout table[state] */ static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = 2*HZ, [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, [IP_VS_TCP_S_CLOSE] = 10*HZ, [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, [IP_VS_TCP_S_LAST_ACK] = 30*HZ, [IP_VS_TCP_S_LISTEN] = 2*60*HZ, [IP_VS_TCP_S_SYNACK] = 120*HZ, [IP_VS_TCP_S_LAST] = 2*HZ, }; static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = "NONE", [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", [IP_VS_TCP_S_CLOSE] = "CLOSE", [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", [IP_VS_TCP_S_LISTEN] = "LISTEN", [IP_VS_TCP_S_SYNACK] = "SYNACK", [IP_VS_TCP_S_LAST] = "BUG!", }; static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = { [IP_VS_TCP_S_NONE] = false, [IP_VS_TCP_S_ESTABLISHED] = true, [IP_VS_TCP_S_SYN_SENT] = true, [IP_VS_TCP_S_SYN_RECV] = true, [IP_VS_TCP_S_FIN_WAIT] = false, [IP_VS_TCP_S_TIME_WAIT] = false, [IP_VS_TCP_S_CLOSE] = false, [IP_VS_TCP_S_CLOSE_WAIT] = false, [IP_VS_TCP_S_LAST_ACK] = false, [IP_VS_TCP_S_LISTEN] = false, [IP_VS_TCP_S_SYNACK] = true, }; #define sNO IP_VS_TCP_S_NONE #define sES IP_VS_TCP_S_ESTABLISHED #define sSS IP_VS_TCP_S_SYN_SENT #define sSR IP_VS_TCP_S_SYN_RECV #define sFW IP_VS_TCP_S_FIN_WAIT #define sTW IP_VS_TCP_S_TIME_WAIT #define sCL IP_VS_TCP_S_CLOSE #define sCW IP_VS_TCP_S_CLOSE_WAIT #define sLA IP_VS_TCP_S_LAST_ACK #define sLI IP_VS_TCP_S_LISTEN #define sSA IP_VS_TCP_S_SYNACK struct tcp_states_t { int next_state[IP_VS_TCP_S_LAST]; }; static const char * tcp_state_name(int state) { if (state >= IP_VS_TCP_S_LAST) return "ERR!"; return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; } static bool tcp_state_active(int state) { if (state >= IP_VS_TCP_S_LAST) return false; return tcp_state_active_table[state]; } static struct tcp_states_t tcp_states[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, /* OUTPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, /* INPUT-ONLY */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; static struct tcp_states_t tcp_states_dos[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, /*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, /* OUTPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, /* INPUT-ONLY */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) { int on = (flags & 1); /* secure_tcp */ /* ** FIXME: change secure_tcp to independent sysctl var ** or make it per-service or per-app because it is valid ** for most if not for all of the applications. Something ** like "capabilities" (flags) for each object. */ pd->tcp_state_table = (on ? tcp_states_dos : tcp_states); } static inline int tcp_state_idx(struct tcphdr *th) { if (th->rst) return 3; if (th->syn) return 0; if (th->fin) return 1; if (th->ack) return 2; return -1; } static inline void set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, struct tcphdr *th) { int state_idx; int new_state = IP_VS_TCP_S_CLOSE; int state_off = tcp_state_off[direction]; /* * Update state offset to INPUT_ONLY if necessary * or delete NO_OUTPUT flag if output packet detected */ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { if (state_off == TCP_DIR_OUTPUT) cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; else state_off = TCP_DIR_INPUT_ONLY; } if ((state_idx = tcp_state_idx(th)) < 0) { IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); goto tcp_state_out; } new_state = pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; tcp_state_out: if (new_state != cp->state) { struct ip_vs_dest *dest = cp->dest; IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d " "d:%s:%d state: %s->%s conn->refcnt:%d\n", pd->pp->name, ((state_off == TCP_DIR_OUTPUT) ? "output " : "input "), th->syn ? 'S' : '.', th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 'R' : '.', IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), tcp_state_name(cp->state), tcp_state_name(new_state), refcount_read(&cp->refcnt)); if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && !tcp_state_active(new_state)) { atomic_dec(&dest->activeconns); atomic_inc(&dest->inactconns); cp->flags |= IP_VS_CONN_F_INACTIVE; } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && tcp_state_active(new_state)) { atomic_inc(&dest->activeconns); atomic_dec(&dest->inactconns); cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } if (new_state == IP_VS_TCP_S_ESTABLISHED) ip_vs_control_assure_ct(cp); } if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = new_state]; else /* What to do ? */ cp->timeout = tcp_timeouts[cp->state = new_state]; } /* * Handle state transitions */ static void tcp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { struct tcphdr _tcph, *th; #ifdef CONFIG_IP_VS_IPV6 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); #else int ihl = ip_hdrlen(skb); #endif th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); if (th == NULL) return; spin_lock_bh(&cp->lock); set_tcp_state(pd, cp, direction, th); spin_unlock_bh(&cp->lock); } static inline __u16 tcp_app_hashkey(__be16 port) { return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) & TCP_APP_TAB_MASK; } static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); hash = tcp_app_hashkey(port); list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); atomic_inc(&pd->appcnt); out: return ret; } static void tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); } static int tcp_app_conn_bind(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; /* Default binding: bind app only for NAT */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return 0; /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", __func__, IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); break; } } return result; } /* * Set LISTEN timeout. (ip_vs_conn_put will setup timer) */ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP); spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] : tcp_timeouts[IP_VS_TCP_S_LISTEN]); spin_unlock_bh(&cp->lock); } /* --------------------------------------------- * timeouts is netns related now. * --------------------------------------------- */ static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); if (!pd->timeout_table) return -ENOMEM; pd->tcp_state_table = tcp_states; return 0; } static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } struct ip_vs_protocol ip_vs_protocol_tcp = { .name = "TCP", .protocol = IPPROTO_TCP, .num_states = IP_VS_TCP_S_LAST, .dont_defrag = 0, .init = NULL, .exit = NULL, .init_netns = __ip_vs_tcp_init, .exit_netns = __ip_vs_tcp_exit, .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, .conn_in_get = ip_vs_conn_in_get_proto, .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = tcp_snat_handler, .dnat_handler = tcp_dnat_handler, .state_name = tcp_state_name, .state_transition = tcp_state_transition, .app_conn_bind = tcp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = tcp_timeout_change, };
29 23 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 #ifndef LLC_H #define LLC_H /* * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rculist_nulls.h> #include <linux/hash.h> #include <linux/jhash.h> #include <linux/atomic.h> struct net_device; struct packet_type; struct sk_buff; struct llc_addr { unsigned char lsap; unsigned char mac[IFHWADDRLEN]; }; #define LLC_SAP_STATE_INACTIVE 1 #define LLC_SAP_STATE_ACTIVE 2 #define LLC_SK_DEV_HASH_BITS 6 #define LLC_SK_DEV_HASH_ENTRIES (1<<LLC_SK_DEV_HASH_BITS) #define LLC_SK_LADDR_HASH_BITS 6 #define LLC_SK_LADDR_HASH_ENTRIES (1<<LLC_SK_LADDR_HASH_BITS) /** * struct llc_sap - Defines the SAP component * * @station - station this sap belongs to * @state - sap state * @p_bit - only lowest-order bit used * @f_bit - only lowest-order bit used * @laddr - SAP value in this 'lsap' * @node - entry in station sap_list * @sk_list - LLC sockets this one manages */ struct llc_sap { unsigned char state; unsigned char p_bit; unsigned char f_bit; refcount_t refcnt; int (*rcv_func)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); struct llc_addr laddr; struct list_head node; spinlock_t sk_lock; int sk_count; struct hlist_nulls_head sk_laddr_hash[LLC_SK_LADDR_HASH_ENTRIES]; struct hlist_head sk_dev_hash[LLC_SK_DEV_HASH_ENTRIES]; struct rcu_head rcu; }; static inline struct hlist_head *llc_sk_dev_hash(struct llc_sap *sap, int ifindex) { u32 bucket = hash_32(ifindex, LLC_SK_DEV_HASH_BITS); return &sap->sk_dev_hash[bucket]; } static inline u32 llc_sk_laddr_hashfn(struct llc_sap *sap, const struct llc_addr *laddr) { return hash_32(jhash(laddr->mac, sizeof(laddr->mac), 0), LLC_SK_LADDR_HASH_BITS); } static inline struct hlist_nulls_head *llc_sk_laddr_hash(struct llc_sap *sap, const struct llc_addr *laddr) { return &sap->sk_laddr_hash[llc_sk_laddr_hashfn(sap, laddr)]; } #define LLC_DEST_INVALID 0 /* Invalid LLC PDU type */ #define LLC_DEST_SAP 1 /* Type 1 goes here */ #define LLC_DEST_CONN 2 /* Type 2 goes here */ extern struct list_head llc_sap_list; int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); int llc_mac_hdr_init(struct sk_buff *skb, const unsigned char *sa, const unsigned char *da); void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)); void llc_remove_pack(int type); void llc_set_station_handler(void (*handler)(struct sk_buff *skb)); struct llc_sap *llc_sap_open(unsigned char lsap, int (*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)); static inline void llc_sap_hold(struct llc_sap *sap) { refcount_inc(&sap->refcnt); } static inline bool llc_sap_hold_safe(struct llc_sap *sap) { return refcount_inc_not_zero(&sap->refcnt); } void llc_sap_close(struct llc_sap *sap); static inline void llc_sap_put(struct llc_sap *sap) { if (refcount_dec_and_test(&sap->refcnt)) llc_sap_close(sap); } struct llc_sap *llc_sap_find(unsigned char sap_value); int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, const unsigned char *dmac, unsigned char dsap); void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_station_init(void); void llc_station_exit(void); #ifdef CONFIG_PROC_FS int llc_proc_init(void); void llc_proc_exit(void); #else #define llc_proc_init() (0) #define llc_proc_exit() do { } while(0) #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL int llc_sysctl_init(void); void llc_sysctl_exit(void); extern int sysctl_llc2_ack_timeout; extern int sysctl_llc2_busy_timeout; extern int sysctl_llc2_p_timeout; extern int sysctl_llc2_rej_timeout; #else #define llc_sysctl_init() (0) #define llc_sysctl_exit() do { } while(0) #endif /* CONFIG_SYSCTL */ #endif /* LLC_H */
22 13 9 15 9 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 // SPDX-License-Identifier: GPL-2.0 /* dvb-usb-i2c.c is part of the DVB USB library. * * Copyright (C) 2004-6 Patrick Boettcher (patrick.boettcher@posteo.de) * see dvb-usb-init.c for copyright information. * * This file contains functions for (de-)initializing an I2C adapter. */ #include "dvb-usb-common.h" int dvb_usb_i2c_init(struct dvb_usb_device *d) { int ret = 0; if (!(d->props.caps & DVB_USB_IS_AN_I2C_ADAPTER)) return 0; if (d->props.i2c_algo == NULL) { err("no i2c algorithm specified"); ret = -EINVAL; goto err; } strscpy(d->i2c_adap.name, d->desc->name, sizeof(d->i2c_adap.name)); d->i2c_adap.algo = d->props.i2c_algo; d->i2c_adap.algo_data = NULL; d->i2c_adap.dev.parent = &d->udev->dev; i2c_set_adapdata(&d->i2c_adap, d); ret = i2c_add_adapter(&d->i2c_adap); if (ret < 0) { err("could not add i2c adapter"); goto err; } d->state |= DVB_USB_STATE_I2C; err: return ret; } int dvb_usb_i2c_exit(struct dvb_usb_device *d) { if (d->state & DVB_USB_STATE_I2C) i2c_del_adapter(&d->i2c_adap); d->state &= ~DVB_USB_STATE_I2C; return 0; }
3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2008-2010, 2013 Dave Chinner * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_icreate_item.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" #include "xfs_ialloc.h" #include "xfs_trace.h" struct kmem_cache *xfs_icreate_cache; /* inode create item */ static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_icreate_item, ic_item); } /* * This returns the number of iovecs needed to log the given inode item. * * We only need one iovec for the icreate log structure. */ STATIC void xfs_icreate_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { *nvecs += 1; *nbytes += sizeof(struct xfs_icreate_log); } /* * This is called to fill in the vector of log iovecs for the * given inode create log item. */ STATIC void xfs_icreate_item_format( struct xfs_log_item *lip, struct xfs_log_vec *lv) { struct xfs_icreate_item *icp = ICR_ITEM(lip); struct xfs_log_iovec *vecp = NULL; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE, &icp->ic_format, sizeof(struct xfs_icreate_log)); } STATIC void xfs_icreate_item_release( struct xfs_log_item *lip) { kvfree(ICR_ITEM(lip)->ic_item.li_lv_shadow); kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip)); } static const struct xfs_item_ops xfs_icreate_item_ops = { .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, .iop_size = xfs_icreate_item_size, .iop_format = xfs_icreate_item_format, .iop_release = xfs_icreate_item_release, }; /* * Initialize the inode log item for a newly allocated (in-core) inode. * * Inode extents can only reside within an AG. Hence specify the starting * block for the inode chunk by offset within an AG as well as the * length of the allocated extent. * * This joins the item to the transaction and marks it dirty so * that we don't need a separate call to do this, nor does the * caller need to know anything about the icreate item. */ void xfs_icreate_log( struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, unsigned int count, unsigned int inode_size, xfs_agblock_t length, unsigned int generation) { struct xfs_icreate_item *icp; icp = kmem_cache_zalloc(xfs_icreate_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, &xfs_icreate_item_ops); icp->ic_format.icl_type = XFS_LI_ICREATE; icp->ic_format.icl_size = 1; /* single vector */ icp->ic_format.icl_ag = cpu_to_be32(agno); icp->ic_format.icl_agbno = cpu_to_be32(agbno); icp->ic_format.icl_count = cpu_to_be32(count); icp->ic_format.icl_isize = cpu_to_be32(inode_size); icp->ic_format.icl_length = cpu_to_be32(length); icp->ic_format.icl_gen = cpu_to_be32(generation); xfs_trans_add_item(tp, &icp->ic_item); tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags); } static enum xlog_recover_reorder xlog_recover_icreate_reorder( struct xlog_recover_item *item) { /* * Inode allocation buffers must be replayed before subsequent inode * items try to modify those buffers. ICREATE items are the logical * equivalent of logging a newly initialized inode buffer, so recover * these at the same time that we recover logged buffers. */ return XLOG_REORDER_BUFFER_LIST; } /* * This routine is called when an inode create format structure is found in a * committed transaction in the log. It's purpose is to initialise the inodes * being allocated on disk. This requires us to get inode cluster buffers that * match the range to be initialised, stamped with inode templates and written * by delayed write so that subsequent modifications will hit the cached buffer * and only need writing out at the end of recovery. */ STATIC int xlog_recover_icreate_commit_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item, xfs_lsn_t lsn) { struct xfs_mount *mp = log->l_mp; struct xfs_icreate_log *icl; struct xfs_ino_geometry *igeo = M_IGEO(mp); xfs_agnumber_t agno; xfs_agblock_t agbno; unsigned int count; unsigned int isize; xfs_agblock_t length; int bb_per_cluster; int cancel_count; int nbufs; int i; icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; if (icl->icl_type != XFS_LI_ICREATE) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); return -EINVAL; } if (icl->icl_size != 1) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); return -EINVAL; } agno = be32_to_cpu(icl->icl_ag); if (agno >= mp->m_sb.sb_agcount) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); return -EINVAL; } agbno = be32_to_cpu(icl->icl_agbno); if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); return -EINVAL; } isize = be32_to_cpu(icl->icl_isize); if (isize != mp->m_sb.sb_inodesize) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); return -EINVAL; } count = be32_to_cpu(icl->icl_count); if (!count) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); return -EINVAL; } length = be32_to_cpu(icl->icl_length); if (!length || length >= mp->m_sb.sb_agblocks) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); return -EINVAL; } /* * The inode chunk is either full or sparse and we only support * m_ino_geo.ialloc_min_blks sized sparse allocations at this time. */ if (length != igeo->ialloc_blks && length != igeo->ialloc_min_blks) { xfs_warn(log->l_mp, "%s: unsupported chunk length", __func__); return -EINVAL; } /* verify inode count is consistent with extent length */ if ((count >> mp->m_sb.sb_inopblog) != length) { xfs_warn(log->l_mp, "%s: inconsistent inode count and chunk length", __func__); return -EINVAL; } /* * The icreate transaction can cover multiple cluster buffers and these * buffers could have been freed and reused. Check the individual * buffers for cancellation so we don't overwrite anything written after * a cancellation. */ bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); nbufs = length / igeo->blocks_per_cluster; for (i = 0, cancel_count = 0; i < nbufs; i++) { xfs_daddr_t daddr; daddr = XFS_AGB_TO_DADDR(mp, agno, agbno + i * igeo->blocks_per_cluster); if (xlog_is_buffer_cancelled(log, daddr, bb_per_cluster)) cancel_count++; } /* * We currently only use icreate for a single allocation at a time. This * means we should expect either all or none of the buffers to be * cancelled. Be conservative and skip replay if at least one buffer is * cancelled, but warn the user that something is awry if the buffers * are not consistent. * * XXX: This must be refined to only skip cancelled clusters once we use * icreate for multiple chunk allocations. */ ASSERT(!cancel_count || cancel_count == nbufs); if (cancel_count) { if (cancel_count != nbufs) xfs_warn(mp, "WARNING: partial inode chunk cancellation, skipped icreate."); trace_xfs_log_recover_icreate_cancel(log, icl); return 0; } trace_xfs_log_recover_icreate_recover(log, icl); return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length, be32_to_cpu(icl->icl_gen)); } const struct xlog_recover_item_ops xlog_icreate_item_ops = { .item_type = XFS_LI_ICREATE, .reorder = xlog_recover_icreate_reorder, .commit_pass2 = xlog_recover_icreate_commit_pass2, };
364 462 167 65 183 182 80 80 589 500 8 1 8 8 8 19 20 12 20 25 32 25 32 12 12 3 2 2 3 2 3 2 3 3 3 1 2 3 3 1 2 3 3 3 3 3 79 3 82 81 1 80 80 81 81 80 81 79 80 78 79 81 81 190 190 8 8 16 203 190 191 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 Jens Axboe <jens.axboe@oracle.com> * * Scatterlist handling helpers. */ #include <linux/export.h> #include <linux/slab.h> #include <linux/scatterlist.h> #include <linux/highmem.h> #include <linux/kmemleak.h> #include <linux/bvec.h> #include <linux/uio.h> #include <linux/folio_queue.h> /** * sg_next - return the next scatterlist entry in a list * @sg: The current sg entry * * Description: * Usually the next entry will be @sg@ + 1, but if this sg element is part * of a chained scatterlist, it could jump to the start of a new * scatterlist array. * **/ struct scatterlist *sg_next(struct scatterlist *sg) { if (sg_is_last(sg)) return NULL; sg++; if (unlikely(sg_is_chain(sg))) sg = sg_chain_ptr(sg); return sg; } EXPORT_SYMBOL(sg_next); /** * sg_nents - return total count of entries in scatterlist * @sg: The scatterlist * * Description: * Allows to know how many entries are in sg, taking into account * chaining as well * **/ int sg_nents(struct scatterlist *sg) { int nents; for (nents = 0; sg; sg = sg_next(sg)) nents++; return nents; } EXPORT_SYMBOL(sg_nents); /** * sg_nents_for_len - return total count of entries in scatterlist * needed to satisfy the supplied length * @sg: The scatterlist * @len: The total required length * * Description: * Determines the number of entries in sg that are required to meet * the supplied length, taking into account chaining as well * * Returns: * the number of sg entries needed, negative error on failure * **/ int sg_nents_for_len(struct scatterlist *sg, u64 len) { int nents; u64 total; if (!len) return 0; for (nents = 0, total = 0; sg; sg = sg_next(sg)) { nents++; total += sg->length; if (total >= len) return nents; } return -EINVAL; } EXPORT_SYMBOL(sg_nents_for_len); /** * sg_last - return the last scatterlist entry in a list * @sgl: First entry in the scatterlist * @nents: Number of entries in the scatterlist * * Description: * Should only be used casually, it (currently) scans the entire list * to get the last entry. * * Note that the @sgl@ pointer passed in need not be the first one, * the important bit is that @nents@ denotes the number of entries that * exist from @sgl@. * **/ struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents) { struct scatterlist *sg, *ret = NULL; unsigned int i; for_each_sg(sgl, sg, nents, i) ret = sg; BUG_ON(!sg_is_last(ret)); return ret; } EXPORT_SYMBOL(sg_last); /** * sg_init_table - Initialize SG table * @sgl: The SG table * @nents: Number of entries in table * * Notes: * If this is part of a chained sg table, sg_mark_end() should be * used only on the last table part. * **/ void sg_init_table(struct scatterlist *sgl, unsigned int nents) { memset(sgl, 0, sizeof(*sgl) * nents); sg_init_marker(sgl, nents); } EXPORT_SYMBOL(sg_init_table); /** * sg_init_one - Initialize a single entry sg list * @sg: SG entry * @buf: Virtual address for IO * @buflen: IO length * **/ void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen) { sg_init_table(sg, 1); sg_set_buf(sg, buf, buflen); } EXPORT_SYMBOL(sg_init_one); /* * The default behaviour of sg_alloc_table() is to use these kmalloc/kfree * helpers. */ static struct scatterlist *sg_kmalloc(unsigned int nents, gfp_t gfp_mask) { if (nents == SG_MAX_SINGLE_ALLOC) { /* * Kmemleak doesn't track page allocations as they are not * commonly used (in a raw form) for kernel data structures. * As we chain together a list of pages and then a normal * kmalloc (tracked by kmemleak), in order to for that last * allocation not to become decoupled (and thus a * false-positive) we need to inform kmemleak of all the * intermediate allocations. */ void *ptr = (void *) __get_free_page(gfp_mask); kmemleak_alloc(ptr, PAGE_SIZE, 1, gfp_mask); return ptr; } else return kmalloc_array(nents, sizeof(struct scatterlist), gfp_mask); } static void sg_kfree(struct scatterlist *sg, unsigned int nents) { if (nents == SG_MAX_SINGLE_ALLOC) { kmemleak_free(sg); free_page((unsigned long) sg); } else kfree(sg); } /** * __sg_free_table - Free a previously mapped sg table * @table: The sg table header to use * @max_ents: The maximum number of entries per single scatterlist * @nents_first_chunk: Number of entries int the (preallocated) first * scatterlist chunk, 0 means no such preallocated first chunk * @free_fn: Free function * @num_ents: Number of entries in the table * * Description: * Free an sg table previously allocated and setup with * __sg_alloc_table(). The @max_ents value must be identical to * that previously used with __sg_alloc_table(). * **/ void __sg_free_table(struct sg_table *table, unsigned int max_ents, unsigned int nents_first_chunk, sg_free_fn *free_fn, unsigned int num_ents) { struct scatterlist *sgl, *next; unsigned curr_max_ents = nents_first_chunk ?: max_ents; if (unlikely(!table->sgl)) return; sgl = table->sgl; while (num_ents) { unsigned int alloc_size = num_ents; unsigned int sg_size; /* * If we have more than max_ents segments left, * then assign 'next' to the sg table after the current one. * sg_size is then one less than alloc size, since the last * element is the chain pointer. */ if (alloc_size > curr_max_ents) { next = sg_chain_ptr(&sgl[curr_max_ents - 1]); alloc_size = curr_max_ents; sg_size = alloc_size - 1; } else { sg_size = alloc_size; next = NULL; } num_ents -= sg_size; if (nents_first_chunk) nents_first_chunk = 0; else free_fn(sgl, alloc_size); sgl = next; curr_max_ents = max_ents; } table->sgl = NULL; } EXPORT_SYMBOL(__sg_free_table); /** * sg_free_append_table - Free a previously allocated append sg table. * @table: The mapped sg append table header * **/ void sg_free_append_table(struct sg_append_table *table) { __sg_free_table(&table->sgt, SG_MAX_SINGLE_ALLOC, 0, sg_kfree, table->total_nents); } EXPORT_SYMBOL(sg_free_append_table); /** * sg_free_table - Free a previously allocated sg table * @table: The mapped sg table header * **/ void sg_free_table(struct sg_table *table) { __sg_free_table(table, SG_MAX_SINGLE_ALLOC, 0, sg_kfree, table->orig_nents); } EXPORT_SYMBOL(sg_free_table); /** * __sg_alloc_table - Allocate and initialize an sg table with given allocator * @table: The sg table header to use * @nents: Number of entries in sg list * @max_ents: The maximum number of entries the allocator returns per call * @first_chunk: first SGL if preallocated (may be %NULL) * @nents_first_chunk: Number of entries in the (preallocated) first * scatterlist chunk, 0 means no such preallocated chunk provided by user * @gfp_mask: GFP allocation mask * @alloc_fn: Allocator to use * * Description: * This function returns a @table @nents long. The allocator is * defined to return scatterlist chunks of maximum size @max_ents. * Thus if @nents is bigger than @max_ents, the scatterlists will be * chained in units of @max_ents. * * Notes: * If this function returns non-0 (eg failure), the caller must call * __sg_free_table() to cleanup any leftover allocations. * **/ int __sg_alloc_table(struct sg_table *table, unsigned int nents, unsigned int max_ents, struct scatterlist *first_chunk, unsigned int nents_first_chunk, gfp_t gfp_mask, sg_alloc_fn *alloc_fn) { struct scatterlist *sg, *prv; unsigned int left; unsigned curr_max_ents = nents_first_chunk ?: max_ents; unsigned prv_max_ents; memset(table, 0, sizeof(*table)); if (nents == 0) return -EINVAL; #ifdef CONFIG_ARCH_NO_SG_CHAIN if (WARN_ON_ONCE(nents > max_ents)) return -EINVAL; #endif left = nents; prv = NULL; do { unsigned int sg_size, alloc_size = left; if (alloc_size > curr_max_ents) { alloc_size = curr_max_ents; sg_size = alloc_size - 1; } else sg_size = alloc_size; left -= sg_size; if (first_chunk) { sg = first_chunk; first_chunk = NULL; } else { sg = alloc_fn(alloc_size, gfp_mask); } if (unlikely(!sg)) { /* * Adjust entry count to reflect that the last * entry of the previous table won't be used for * linkage. Without this, sg_kfree() may get * confused. */ if (prv) table->nents = ++table->orig_nents; return -ENOMEM; } sg_init_table(sg, alloc_size); table->nents = table->orig_nents += sg_size; /* * If this is the first mapping, assign the sg table header. * If this is not the first mapping, chain previous part. */ if (prv) sg_chain(prv, prv_max_ents, sg); else table->sgl = sg; /* * If no more entries after this one, mark the end */ if (!left) sg_mark_end(&sg[sg_size - 1]); prv = sg; prv_max_ents = curr_max_ents; curr_max_ents = max_ents; } while (left); return 0; } EXPORT_SYMBOL(__sg_alloc_table); /** * sg_alloc_table - Allocate and initialize an sg table * @table: The sg table header to use * @nents: Number of entries in sg list * @gfp_mask: GFP allocation mask * * Description: * Allocate and initialize an sg table. If @nents@ is larger than * SG_MAX_SINGLE_ALLOC a chained sg table will be setup. * **/ int sg_alloc_table(struct sg_table *table, unsigned int nents, gfp_t gfp_mask) { int ret; ret = __sg_alloc_table(table, nents, SG_MAX_SINGLE_ALLOC, NULL, 0, gfp_mask, sg_kmalloc); if (unlikely(ret)) sg_free_table(table); return ret; } EXPORT_SYMBOL(sg_alloc_table); static struct scatterlist *get_next_sg(struct sg_append_table *table, struct scatterlist *cur, unsigned long needed_sges, gfp_t gfp_mask) { struct scatterlist *new_sg, *next_sg; unsigned int alloc_size; if (cur) { next_sg = sg_next(cur); /* Check if last entry should be keeped for chainning */ if (!sg_is_last(next_sg) || needed_sges == 1) return next_sg; } alloc_size = min_t(unsigned long, needed_sges, SG_MAX_SINGLE_ALLOC); new_sg = sg_kmalloc(alloc_size, gfp_mask); if (!new_sg) return ERR_PTR(-ENOMEM); sg_init_table(new_sg, alloc_size); if (cur) { table->total_nents += alloc_size - 1; __sg_chain(next_sg, new_sg); } else { table->sgt.sgl = new_sg; table->total_nents = alloc_size; } return new_sg; } static bool pages_are_mergeable(struct page *a, struct page *b) { if (page_to_pfn(a) != page_to_pfn(b) + 1) return false; if (!zone_device_pages_have_same_pgmap(a, b)) return false; return true; } /** * sg_alloc_append_table_from_pages - Allocate and initialize an append sg * table from an array of pages * @sgt_append: The sg append table to use * @pages: Pointer to an array of page pointers * @n_pages: Number of pages in the pages array * @offset: Offset from start of the first page to the start of a buffer * @size: Number of valid bytes in the buffer (after offset) * @max_segment: Maximum size of a scatterlist element in bytes * @left_pages: Left pages caller have to set after this call * @gfp_mask: GFP allocation mask * * Description: * In the first call it allocate and initialize an sg table from a list of * pages, else reuse the scatterlist from sgt_append. Contiguous ranges of * the pages are squashed into a single scatterlist entry up to the maximum * size specified in @max_segment. A user may provide an offset at a start * and a size of valid data in a buffer specified by the page array. The * returned sg table is released by sg_free_append_table * * Returns: * 0 on success, negative error on failure * * Notes: * If this function returns non-0 (eg failure), the caller must call * sg_free_append_table() to cleanup any leftover allocations. * * In the fist call, sgt_append must by initialized. */ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, unsigned int left_pages, gfp_t gfp_mask) { unsigned int chunks, cur_page, seg_len, i, prv_len = 0; unsigned int added_nents = 0; struct scatterlist *s = sgt_append->prv; struct page *last_pg; /* * The algorithm below requires max_segment to be aligned to PAGE_SIZE * otherwise it can overshoot. */ max_segment = ALIGN_DOWN(max_segment, PAGE_SIZE); if (WARN_ON(max_segment < PAGE_SIZE)) return -EINVAL; if (IS_ENABLED(CONFIG_ARCH_NO_SG_CHAIN) && sgt_append->prv) return -EOPNOTSUPP; if (sgt_append->prv) { unsigned long next_pfn; if (WARN_ON(offset)) return -EINVAL; /* Merge contiguous pages into the last SG */ prv_len = sgt_append->prv->length; next_pfn = (sg_phys(sgt_append->prv) + prv_len) / PAGE_SIZE; if (page_to_pfn(pages[0]) == next_pfn) { last_pg = pfn_to_page(next_pfn - 1); while (n_pages && pages_are_mergeable(pages[0], last_pg)) { if (sgt_append->prv->length + PAGE_SIZE > max_segment) break; sgt_append->prv->length += PAGE_SIZE; last_pg = pages[0]; pages++; n_pages--; } if (!n_pages) goto out; } } /* compute number of contiguous chunks */ chunks = 1; seg_len = 0; for (i = 1; i < n_pages; i++) { seg_len += PAGE_SIZE; if (seg_len >= max_segment || !pages_are_mergeable(pages[i], pages[i - 1])) { chunks++; seg_len = 0; } } /* merging chunks and putting them into the scatterlist */ cur_page = 0; for (i = 0; i < chunks; i++) { unsigned int j, chunk_size; /* look for the end of the current chunk */ seg_len = 0; for (j = cur_page + 1; j < n_pages; j++) { seg_len += PAGE_SIZE; if (seg_len >= max_segment || !pages_are_mergeable(pages[j], pages[j - 1])) break; } /* Pass how many chunks might be left */ s = get_next_sg(sgt_append, s, chunks - i + left_pages, gfp_mask); if (IS_ERR(s)) { /* * Adjust entry length to be as before function was * called. */ if (sgt_append->prv) sgt_append->prv->length = prv_len; return PTR_ERR(s); } chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset; sg_set_page(s, pages[cur_page], min_t(unsigned long, size, chunk_size), offset); added_nents++; size -= chunk_size; offset = 0; cur_page = j; } sgt_append->sgt.nents += added_nents; sgt_append->sgt.orig_nents = sgt_append->sgt.nents; sgt_append->prv = s; out: if (!left_pages) sg_mark_end(s); return 0; } EXPORT_SYMBOL(sg_alloc_append_table_from_pages); /** * sg_alloc_table_from_pages_segment - Allocate and initialize an sg table from * an array of pages and given maximum * segment. * @sgt: The sg table header to use * @pages: Pointer to an array of page pointers * @n_pages: Number of pages in the pages array * @offset: Offset from start of the first page to the start of a buffer * @size: Number of valid bytes in the buffer (after offset) * @max_segment: Maximum size of a scatterlist element in bytes * @gfp_mask: GFP allocation mask * * Description: * Allocate and initialize an sg table from a list of pages. Contiguous * ranges of the pages are squashed into a single scatterlist node up to the * maximum size specified in @max_segment. A user may provide an offset at a * start and a size of valid data in a buffer specified by the page array. * * The returned sg table is released by sg_free_table. * * Returns: * 0 on success, negative error on failure */ int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, gfp_t gfp_mask) { struct sg_append_table append = {}; int err; err = sg_alloc_append_table_from_pages(&append, pages, n_pages, offset, size, max_segment, 0, gfp_mask); if (err) { sg_free_append_table(&append); return err; } memcpy(sgt, &append.sgt, sizeof(*sgt)); WARN_ON(append.total_nents != sgt->orig_nents); return 0; } EXPORT_SYMBOL(sg_alloc_table_from_pages_segment); #ifdef CONFIG_SGL_ALLOC /** * sgl_alloc_order - allocate a scatterlist and its pages * @length: Length in bytes of the scatterlist. Must be at least one * @order: Second argument for alloc_pages() * @chainable: Whether or not to allocate an extra element in the scatterlist * for scatterlist chaining purposes * @gfp: Memory allocation flags * @nent_p: [out] Number of entries in the scatterlist that have pages * * Returns: A pointer to an initialized scatterlist or %NULL upon failure. */ struct scatterlist *sgl_alloc_order(unsigned long long length, unsigned int order, bool chainable, gfp_t gfp, unsigned int *nent_p) { struct scatterlist *sgl, *sg; struct page *page; unsigned int nent, nalloc; u32 elem_len; nent = round_up(length, PAGE_SIZE << order) >> (PAGE_SHIFT + order); /* Check for integer overflow */ if (length > (nent << (PAGE_SHIFT + order))) return NULL; nalloc = nent; if (chainable) { /* Check for integer overflow */ if (nalloc + 1 < nalloc) return NULL; nalloc++; } sgl = kmalloc_array(nalloc, sizeof(struct scatterlist), gfp & ~GFP_DMA); if (!sgl) return NULL; sg_init_table(sgl, nalloc); sg = sgl; while (length) { elem_len = min_t(u64, length, PAGE_SIZE << order); page = alloc_pages(gfp, order); if (!page) { sgl_free_order(sgl, order); return NULL; } sg_set_page(sg, page, elem_len, 0); length -= elem_len; sg = sg_next(sg); } WARN_ONCE(length, "length = %lld\n", length); if (nent_p) *nent_p = nent; return sgl; } EXPORT_SYMBOL(sgl_alloc_order); /** * sgl_alloc - allocate a scatterlist and its pages * @length: Length in bytes of the scatterlist * @gfp: Memory allocation flags * @nent_p: [out] Number of entries in the scatterlist * * Returns: A pointer to an initialized scatterlist or %NULL upon failure. */ struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, unsigned int *nent_p) { return sgl_alloc_order(length, 0, false, gfp, nent_p); } EXPORT_SYMBOL(sgl_alloc); /** * sgl_free_n_order - free a scatterlist and its pages * @sgl: Scatterlist with one or more elements * @nents: Maximum number of elements to free * @order: Second argument for __free_pages() * * Notes: * - If several scatterlists have been chained and each chain element is * freed separately then it's essential to set nents correctly to avoid that a * page would get freed twice. * - All pages in a chained scatterlist can be freed at once by setting @nents * to a high number. */ void sgl_free_n_order(struct scatterlist *sgl, int nents, int order) { struct scatterlist *sg; struct page *page; int i; for_each_sg(sgl, sg, nents, i) { if (!sg) break; page = sg_page(sg); if (page) __free_pages(page, order); } kfree(sgl); } EXPORT_SYMBOL(sgl_free_n_order); /** * sgl_free_order - free a scatterlist and its pages * @sgl: Scatterlist with one or more elements * @order: Second argument for __free_pages() */ void sgl_free_order(struct scatterlist *sgl, int order) { sgl_free_n_order(sgl, INT_MAX, order); } EXPORT_SYMBOL(sgl_free_order); /** * sgl_free - free a scatterlist and its pages * @sgl: Scatterlist with one or more elements */ void sgl_free(struct scatterlist *sgl) { sgl_free_order(sgl, 0); } EXPORT_SYMBOL(sgl_free); #endif /* CONFIG_SGL_ALLOC */ void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset) { piter->__pg_advance = 0; piter->__nents = nents; piter->sg = sglist; piter->sg_pgoffset = pgoffset; } EXPORT_SYMBOL(__sg_page_iter_start); static int sg_page_count(struct scatterlist *sg) { return PAGE_ALIGN(sg->offset + sg->length) >> PAGE_SHIFT; } bool __sg_page_iter_next(struct sg_page_iter *piter) { if (!piter->__nents || !piter->sg) return false; piter->sg_pgoffset += piter->__pg_advance; piter->__pg_advance = 1; while (piter->sg_pgoffset >= sg_page_count(piter->sg)) { piter->sg_pgoffset -= sg_page_count(piter->sg); piter->sg = sg_next(piter->sg); if (!--piter->__nents || !piter->sg) return false; } return true; } EXPORT_SYMBOL(__sg_page_iter_next); static int sg_dma_page_count(struct scatterlist *sg) { return PAGE_ALIGN(sg->offset + sg_dma_len(sg)) >> PAGE_SHIFT; } bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter) { struct sg_page_iter *piter = &dma_iter->base; if (!piter->__nents || !piter->sg) return false; piter->sg_pgoffset += piter->__pg_advance; piter->__pg_advance = 1; while (piter->sg_pgoffset >= sg_dma_page_count(piter->sg)) { piter->sg_pgoffset -= sg_dma_page_count(piter->sg); piter->sg = sg_next(piter->sg); if (!--piter->__nents || !piter->sg) return false; } return true; } EXPORT_SYMBOL(__sg_page_iter_dma_next); /** * sg_miter_start - start mapping iteration over a sg list * @miter: sg mapping iter to be started * @sgl: sg list to iterate over * @nents: number of sg entries * @flags: sg iterator flags * * Description: * Starts mapping iterator @miter. * * Context: * Don't care. */ void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl, unsigned int nents, unsigned int flags) { memset(miter, 0, sizeof(struct sg_mapping_iter)); __sg_page_iter_start(&miter->piter, sgl, nents, 0); WARN_ON(!(flags & (SG_MITER_TO_SG | SG_MITER_FROM_SG))); miter->__flags = flags; } EXPORT_SYMBOL(sg_miter_start); static bool sg_miter_get_next_page(struct sg_mapping_iter *miter) { if (!miter->__remaining) { struct scatterlist *sg; if (!__sg_page_iter_next(&miter->piter)) return false; sg = miter->piter.sg; miter->__offset = miter->piter.sg_pgoffset ? 0 : sg->offset; miter->piter.sg_pgoffset += miter->__offset >> PAGE_SHIFT; miter->__offset &= PAGE_SIZE - 1; miter->__remaining = sg->offset + sg->length - (miter->piter.sg_pgoffset << PAGE_SHIFT) - miter->__offset; miter->__remaining = min_t(unsigned long, miter->__remaining, PAGE_SIZE - miter->__offset); } return true; } /** * sg_miter_skip - reposition mapping iterator * @miter: sg mapping iter to be skipped * @offset: number of bytes to plus the current location * * Description: * Sets the offset of @miter to its current location plus @offset bytes. * If mapping iterator @miter has been proceeded by sg_miter_next(), this * stops @miter. * * Context: * Don't care. * * Returns: * true if @miter contains the valid mapping. false if end of sg * list is reached. */ bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset) { sg_miter_stop(miter); while (offset) { off_t consumed; if (!sg_miter_get_next_page(miter)) return false; consumed = min_t(off_t, offset, miter->__remaining); miter->__offset += consumed; miter->__remaining -= consumed; offset -= consumed; } return true; } EXPORT_SYMBOL(sg_miter_skip); /** * sg_miter_next - proceed mapping iterator to the next mapping * @miter: sg mapping iter to proceed * * Description: * Proceeds @miter to the next mapping. @miter should have been started * using sg_miter_start(). On successful return, @miter->page, * @miter->addr and @miter->length point to the current mapping. * * Context: * May sleep if !SG_MITER_ATOMIC. * * Returns: * true if @miter contains the next mapping. false if end of sg * list is reached. */ bool sg_miter_next(struct sg_mapping_iter *miter) { sg_miter_stop(miter); /* * Get to the next page if necessary. * __remaining, __offset is adjusted by sg_miter_stop */ if (!sg_miter_get_next_page(miter)) return false; miter->page = sg_page_iter_page(&miter->piter); miter->consumed = miter->length = miter->__remaining; if (miter->__flags & SG_MITER_ATOMIC) miter->addr = kmap_atomic(miter->page) + miter->__offset; else miter->addr = kmap(miter->page) + miter->__offset; return true; } EXPORT_SYMBOL(sg_miter_next); /** * sg_miter_stop - stop mapping iteration * @miter: sg mapping iter to be stopped * * Description: * Stops mapping iterator @miter. @miter should have been started * using sg_miter_start(). A stopped iteration can be resumed by * calling sg_miter_next() on it. This is useful when resources (kmap) * need to be released during iteration. * * Context: * Don't care otherwise. */ void sg_miter_stop(struct sg_mapping_iter *miter) { WARN_ON(miter->consumed > miter->length); /* drop resources from the last iteration */ if (miter->addr) { miter->__offset += miter->consumed; miter->__remaining -= miter->consumed; if (miter->__flags & SG_MITER_TO_SG) flush_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { WARN_ON_ONCE(!pagefault_disabled()); kunmap_atomic(miter->addr); } else kunmap(miter->page); miter->page = NULL; miter->addr = NULL; miter->length = 0; miter->consumed = 0; } } EXPORT_SYMBOL(sg_miter_stop); /** * sg_copy_buffer - Copy data between a linear buffer and an SG list * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy from * @buflen: The number of bytes to copy * @skip: Number of bytes to skip before copying * @to_buffer: transfer direction (true == from an sg list to a * buffer, false == from a buffer to an sg list) * * Returns the number of copied bytes. * **/ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer) { unsigned int offset = 0; struct sg_mapping_iter miter; unsigned int sg_flags = SG_MITER_ATOMIC; if (to_buffer) sg_flags |= SG_MITER_FROM_SG; else sg_flags |= SG_MITER_TO_SG; sg_miter_start(&miter, sgl, nents, sg_flags); if (!sg_miter_skip(&miter, skip)) return 0; while ((offset < buflen) && sg_miter_next(&miter)) { unsigned int len; len = min(miter.length, buflen - offset); if (to_buffer) memcpy(buf + offset, miter.addr, len); else memcpy(miter.addr, buf + offset, len); offset += len; } sg_miter_stop(&miter); return offset; } EXPORT_SYMBOL(sg_copy_buffer); /** * sg_copy_from_buffer - Copy from a linear buffer to an SG list * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy from * @buflen: The number of bytes to copy * * Returns the number of copied bytes. * **/ size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen) { return sg_copy_buffer(sgl, nents, (void *)buf, buflen, 0, false); } EXPORT_SYMBOL(sg_copy_from_buffer); /** * sg_copy_to_buffer - Copy from an SG list to a linear buffer * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy to * @buflen: The number of bytes to copy * * Returns the number of copied bytes. * **/ size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen) { return sg_copy_buffer(sgl, nents, buf, buflen, 0, true); } EXPORT_SYMBOL(sg_copy_to_buffer); /** * sg_pcopy_from_buffer - Copy from a linear buffer to an SG list * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy from * @buflen: The number of bytes to copy * @skip: Number of bytes to skip before copying * * Returns the number of copied bytes. * **/ size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen, off_t skip) { return sg_copy_buffer(sgl, nents, (void *)buf, buflen, skip, false); } EXPORT_SYMBOL(sg_pcopy_from_buffer); /** * sg_pcopy_to_buffer - Copy from an SG list to a linear buffer * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy to * @buflen: The number of bytes to copy * @skip: Number of bytes to skip before copying * * Returns the number of copied bytes. * **/ size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip) { return sg_copy_buffer(sgl, nents, buf, buflen, skip, true); } EXPORT_SYMBOL(sg_pcopy_to_buffer); /** * sg_zero_buffer - Zero-out a part of a SG list * @sgl: The SG list * @nents: Number of SG entries * @buflen: The number of bytes to zero out * @skip: Number of bytes to skip before zeroing * * Returns the number of bytes zeroed. **/ size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, size_t buflen, off_t skip) { unsigned int offset = 0; struct sg_mapping_iter miter; unsigned int sg_flags = SG_MITER_ATOMIC | SG_MITER_TO_SG; sg_miter_start(&miter, sgl, nents, sg_flags); if (!sg_miter_skip(&miter, skip)) return false; while (offset < buflen && sg_miter_next(&miter)) { unsigned int len; len = min(miter.length, buflen - offset); memset(miter.addr, 0, len); offset += len; } sg_miter_stop(&miter); return offset; } EXPORT_SYMBOL(sg_zero_buffer); /* * Extract and pin a list of up to sg_max pages from UBUF- or IOVEC-class * iterators, and add them to the scatterlist. */ static ssize_t extract_user_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { struct scatterlist *sg = sgtable->sgl + sgtable->nents; struct page **pages; unsigned int npages; ssize_t ret = 0, res; size_t len, off; /* We decant the page list into the tail of the scatterlist */ pages = (void *)sgtable->sgl + array_size(sg_max, sizeof(struct scatterlist)); pages -= sg_max; do { res = iov_iter_extract_pages(iter, &pages, maxsize, sg_max, extraction_flags, &off); if (res <= 0) goto failed; len = res; maxsize -= len; ret += len; npages = DIV_ROUND_UP(off + len, PAGE_SIZE); sg_max -= npages; for (; npages > 0; npages--) { struct page *page = *pages; size_t seg = min_t(size_t, PAGE_SIZE - off, len); *pages++ = NULL; sg_set_page(sg, page, seg, off); sgtable->nents++; sg++; len -= seg; off = 0; } } while (maxsize > 0 && sg_max > 0); return ret; failed: while (sgtable->nents > sgtable->orig_nents) unpin_user_page(sg_page(&sgtable->sgl[--sgtable->nents])); return res; } /* * Extract up to sg_max pages from a BVEC-type iterator and add them to the * scatterlist. The pages are not pinned. */ static ssize_t extract_bvec_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { const struct bio_vec *bv = iter->bvec; struct scatterlist *sg = sgtable->sgl + sgtable->nents; unsigned long start = iter->iov_offset; unsigned int i; ssize_t ret = 0; for (i = 0; i < iter->nr_segs; i++) { size_t off, len; len = bv[i].bv_len; if (start >= len) { start -= len; continue; } len = min_t(size_t, maxsize, len - start); off = bv[i].bv_offset + start; sg_set_page(sg, bv[i].bv_page, len, off); sgtable->nents++; sg++; sg_max--; ret += len; maxsize -= len; if (maxsize <= 0 || sg_max == 0) break; start = 0; } if (ret > 0) iov_iter_advance(iter, ret); return ret; } /* * Extract up to sg_max pages from a KVEC-type iterator and add them to the * scatterlist. This can deal with vmalloc'd buffers as well as kmalloc'd or * static buffers. The pages are not pinned. */ static ssize_t extract_kvec_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { const struct kvec *kv = iter->kvec; struct scatterlist *sg = sgtable->sgl + sgtable->nents; unsigned long start = iter->iov_offset; unsigned int i; ssize_t ret = 0; for (i = 0; i < iter->nr_segs; i++) { struct page *page; unsigned long kaddr; size_t off, len, seg; len = kv[i].iov_len; if (start >= len) { start -= len; continue; } kaddr = (unsigned long)kv[i].iov_base + start; off = kaddr & ~PAGE_MASK; len = min_t(size_t, maxsize, len - start); kaddr &= PAGE_MASK; maxsize -= len; ret += len; do { seg = min_t(size_t, len, PAGE_SIZE - off); if (is_vmalloc_or_module_addr((void *)kaddr)) page = vmalloc_to_page((void *)kaddr); else page = virt_to_page((void *)kaddr); sg_set_page(sg, page, len, off); sgtable->nents++; sg++; sg_max--; len -= seg; kaddr += PAGE_SIZE; off = 0; } while (len > 0 && sg_max > 0); if (maxsize <= 0 || sg_max == 0) break; start = 0; } if (ret > 0) iov_iter_advance(iter, ret); return ret; } /* * Extract up to sg_max folios from an FOLIOQ-type iterator and add them to * the scatterlist. The pages are not pinned. */ static ssize_t extract_folioq_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { const struct folio_queue *folioq = iter->folioq; struct scatterlist *sg = sgtable->sgl + sgtable->nents; unsigned int slot = iter->folioq_slot; ssize_t ret = 0; size_t offset = iter->iov_offset; BUG_ON(!folioq); if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; if (WARN_ON_ONCE(!folioq)) return 0; slot = 0; } do { struct folio *folio = folioq_folio(folioq, slot); size_t fsize = folioq_folio_size(folioq, slot); if (offset < fsize) { size_t part = umin(maxsize - ret, fsize - offset); sg_set_page(sg, folio_page(folio, 0), part, offset); sgtable->nents++; sg++; sg_max--; offset += part; ret += part; } if (offset >= fsize) { offset = 0; slot++; if (slot >= folioq_nr_slots(folioq)) { if (!folioq->next) { WARN_ON_ONCE(ret < iter->count); break; } folioq = folioq->next; slot = 0; } } } while (sg_max > 0 && ret < maxsize); iter->folioq = folioq; iter->folioq_slot = slot; iter->iov_offset = offset; iter->count -= ret; return ret; } /* * Extract up to sg_max folios from an XARRAY-type iterator and add them to * the scatterlist. The pages are not pinned. */ static ssize_t extract_xarray_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { struct scatterlist *sg = sgtable->sgl + sgtable->nents; struct xarray *xa = iter->xarray; struct folio *folio; loff_t start = iter->xarray_start + iter->iov_offset; pgoff_t index = start / PAGE_SIZE; ssize_t ret = 0; size_t offset, len; XA_STATE(xas, xa, index); rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { if (xas_retry(&xas, folio)) continue; if (WARN_ON(xa_is_value(folio))) break; if (WARN_ON(folio_test_hugetlb(folio))) break; offset = offset_in_folio(folio, start); len = min_t(size_t, maxsize, folio_size(folio) - offset); sg_set_page(sg, folio_page(folio, 0), len, offset); sgtable->nents++; sg++; sg_max--; maxsize -= len; ret += len; if (maxsize <= 0 || sg_max == 0) break; } rcu_read_unlock(); if (ret > 0) iov_iter_advance(iter, ret); return ret; } /** * extract_iter_to_sg - Extract pages from an iterator and add to an sglist * @iter: The iterator to extract from * @maxsize: The amount of iterator to copy * @sgtable: The scatterlist table to fill in * @sg_max: Maximum number of elements in @sgtable that may be filled * @extraction_flags: Flags to qualify the request * * Extract the page fragments from the given amount of the source iterator and * add them to a scatterlist that refers to all of those bits, to a maximum * addition of @sg_max elements. * * The pages referred to by UBUF- and IOVEC-type iterators are extracted and * pinned; BVEC-, KVEC-, FOLIOQ- and XARRAY-type are extracted but aren't * pinned; DISCARD-type is not supported. * * No end mark is placed on the scatterlist; that's left to the caller. * * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA * be allowed on the pages extracted. * * If successful, @sgtable->nents is updated to include the number of elements * added and the number of bytes added is returned. @sgtable->orig_nents is * left unaltered. * * The iov_iter_extract_mode() function should be used to query how cleanup * should be performed. */ ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { if (maxsize == 0) return 0; switch (iov_iter_type(iter)) { case ITER_UBUF: case ITER_IOVEC: return extract_user_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); case ITER_BVEC: return extract_bvec_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); case ITER_KVEC: return extract_kvec_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); case ITER_FOLIOQ: return extract_folioq_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); case ITER_XARRAY: return extract_xarray_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); default: pr_err("%s(%u) unsupported\n", __func__, iov_iter_type(iter)); WARN_ON_ONCE(1); return -EIO; } } EXPORT_SYMBOL_GPL(extract_iter_to_sg);
10 10 10 1 2 5 2 3 4 10 4 3 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 // SPDX-License-Identifier: GPL-2.0 /* * xfrm4_input.c * * Changes: * YOSHIFUJI Hideaki @USAGI * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> * Add Encapsulation support * */ #include <linux/slab.h> #include <linux/module.h> #include <linux/string.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/protocol.h> #include <net/gro.h> static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { return dst_input(skb); } static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { if (!skb_dst(skb)) { const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), skb->dev)) goto drop; } if (xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2)) goto drop; return 0; drop: kfree_skb(skb); return NET_RX_DROP; } int xfrm4_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); struct iphdr *iph = ip_hdr(skb); iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; #ifndef CONFIG_NETFILTER if (!async) return -iph->protocol; #endif __skb_push(skb, -skb_network_offset(skb)); iph->tot_len = htons(skb->len); ip_send_check(iph); if (xo && (xo->flags & XFRM_GRO)) { /* The full l2 header needs to be preserved so that re-injecting the packet at l2 * works correctly in the presence of vlan tags. */ skb_mac_header_rebuild_full(skb, xo->orig_mac_len); skb_reset_network_header(skb); skb_reset_transport_header(skb); return 0; } NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm4_rcv_encap_finish); return 0; } static int __xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull) { struct udp_sock *up = udp_sk(sk); struct udphdr *uh; struct iphdr *iph; int iphlen, len; __u8 *udpdata; __be32 *udpdata32; u16 encap_type; encap_type = READ_ONCE(up->encap_type); /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; /* If this is a paged skb, make sure we pull up * whatever data we need to look at. */ len = skb->len - sizeof(struct udphdr); if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) return 1; /* Now we can get the pointers */ uh = udp_hdr(skb); udpdata = (__u8 *)uh + sizeof(struct udphdr); udpdata32 = (__be32 *)udpdata; switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: /* Check if this is a keepalive packet. If so, eat it. */ if (len == 1 && udpdata[0] == 0xff) { return -EINVAL; } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { /* ESP Packet without Non-ESP header */ len = sizeof(struct udphdr); } else /* Must be an IKE packet.. pass it through */ return 1; break; } /* At this point we are sure that this is an ESPinUDP packet, * so we need to remove 'len' bytes from the packet (the UDP * header and optional ESP marker bytes) and then modify the * protocol to ESP, and then call into the transform receiver. */ if (skb_unclone(skb, GFP_ATOMIC)) return -EINVAL; /* Now we can update and verify the packet length... */ iph = ip_hdr(skb); iphlen = iph->ihl << 2; iph->tot_len = htons(ntohs(iph->tot_len) - len); if (skb->len < iphlen + len) { /* packet is too small!?! */ return -EINVAL; } /* pull the data buffer up to the ESP header and set the * transport header to point to ESP. Keep UDP on the stack * for later. */ if (pull) { __skb_pull(skb, len); skb_reset_transport_header(skb); } else { skb_set_transport_header(skb, len); } /* process ESP */ return 0; } /* If it's a keepalive packet, then just eat it. * If it's an encapsulated packet, then pass it to the * IPsec xfrm input. * Returns 0 if skb passed to xfrm or was dropped. * Returns >0 if skb should be passed to UDP. * Returns <0 if skb should be resubmitted (-ret is protocol) */ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) { int ret; ret = __xfrm4_udp_encap_rcv(sk, skb, true); if (!ret) return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, udp_sk(sk)->encap_type); if (ret < 0) { kfree_skb(skb); return 0; } return ret; } EXPORT_SYMBOL(xfrm4_udp_encap_rcv); struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); const struct net_offload *ops; struct sk_buff *pp = NULL; int ret; offset = offset - sizeof(struct udphdr); if (!pskb_pull(skb, offset)) return NULL; rcu_read_lock(); ops = rcu_dereference(inet_offloads[IPPROTO_ESP]); if (!ops || !ops->callbacks.gro_receive) goto out; ret = __xfrm4_udp_encap_rcv(sk, skb, false); if (ret) goto out; skb_push(skb, offset); NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); rcu_read_unlock(); return pp; out: rcu_read_unlock(); skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } EXPORT_SYMBOL(xfrm4_gro_udp_encap_rcv); int xfrm4_rcv(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0); } EXPORT_SYMBOL(xfrm4_rcv);
2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 // SPDX-License-Identifier: GPL-2.0-or-later /* * Driver for the Conexant CX2584x Audio/Video decoder chip and related cores * * Integrated Consumer Infrared Controller * * Copyright (C) 2010 Andy Walls <awalls@md.metrocast.net> */ #include <linux/slab.h> #include <linux/kfifo.h> #include <linux/module.h> #include <media/drv-intf/cx25840.h> #include <media/rc-core.h> #include "cx25840-core.h" static unsigned int ir_debug; module_param(ir_debug, int, 0644); MODULE_PARM_DESC(ir_debug, "enable integrated IR debug messages"); #define CX25840_IR_REG_BASE 0x200 #define CX25840_IR_CNTRL_REG 0x200 #define CNTRL_WIN_3_3 0x00000000 #define CNTRL_WIN_4_3 0x00000001 #define CNTRL_WIN_3_4 0x00000002 #define CNTRL_WIN_4_4 0x00000003 #define CNTRL_WIN 0x00000003 #define CNTRL_EDG_NONE 0x00000000 #define CNTRL_EDG_FALL 0x00000004 #define CNTRL_EDG_RISE 0x00000008 #define CNTRL_EDG_BOTH 0x0000000C #define CNTRL_EDG 0x0000000C #define CNTRL_DMD 0x00000010 #define CNTRL_MOD 0x00000020 #define CNTRL_RFE 0x00000040 #define CNTRL_TFE 0x00000080 #define CNTRL_RXE 0x00000100 #define CNTRL_TXE 0x00000200 #define CNTRL_RIC 0x00000400 #define CNTRL_TIC 0x00000800 #define CNTRL_CPL 0x00001000 #define CNTRL_LBM 0x00002000 #define CNTRL_R 0x00004000 #define CX25840_IR_TXCLK_REG 0x204 #define TXCLK_TCD 0x0000FFFF #define CX25840_IR_RXCLK_REG 0x208 #define RXCLK_RCD 0x0000FFFF #define CX25840_IR_CDUTY_REG 0x20C #define CDUTY_CDC 0x0000000F #define CX25840_IR_STATS_REG 0x210 #define STATS_RTO 0x00000001 #define STATS_ROR 0x00000002 #define STATS_RBY 0x00000004 #define STATS_TBY 0x00000008 #define STATS_RSR 0x00000010 #define STATS_TSR 0x00000020 #define CX25840_IR_IRQEN_REG 0x214 #define IRQEN_RTE 0x00000001 #define IRQEN_ROE 0x00000002 #define IRQEN_RSE 0x00000010 #define IRQEN_TSE 0x00000020 #define IRQEN_MSK 0x00000033 #define CX25840_IR_FILTR_REG 0x218 #define FILTR_LPF 0x0000FFFF #define CX25840_IR_FIFO_REG 0x23C #define FIFO_RXTX 0x0000FFFF #define FIFO_RXTX_LVL 0x00010000 #define FIFO_RXTX_RTO 0x0001FFFF #define FIFO_RX_NDV 0x00020000 #define FIFO_RX_DEPTH 8 #define FIFO_TX_DEPTH 8 #define CX25840_VIDCLK_FREQ 108000000 /* 108 MHz, BT.656 */ #define CX25840_IR_REFCLK_FREQ (CX25840_VIDCLK_FREQ / 2) /* * We use this union internally for convenience, but callers to tx_write * and rx_read will be expecting records of type struct ir_raw_event. * Always ensure the size of this union is dictated by struct ir_raw_event. */ union cx25840_ir_fifo_rec { u32 hw_fifo_data; struct ir_raw_event ir_core_data; }; #define CX25840_IR_RX_KFIFO_SIZE (256 * sizeof(union cx25840_ir_fifo_rec)) #define CX25840_IR_TX_KFIFO_SIZE (256 * sizeof(union cx25840_ir_fifo_rec)) struct cx25840_ir_state { struct i2c_client *c; struct v4l2_subdev_ir_parameters rx_params; struct mutex rx_params_lock; /* protects Rx parameter settings cache */ atomic_t rxclk_divider; atomic_t rx_invert; struct kfifo rx_kfifo; spinlock_t rx_kfifo_lock; /* protect Rx data kfifo */ struct v4l2_subdev_ir_parameters tx_params; struct mutex tx_params_lock; /* protects Tx parameter settings cache */ atomic_t txclk_divider; }; static inline struct cx25840_ir_state *to_ir_state(struct v4l2_subdev *sd) { struct cx25840_state *state = to_state(sd); return state ? state->ir_state : NULL; } /* * Rx and Tx Clock Divider register computations * * Note the largest clock divider value of 0xffff corresponds to: * (0xffff + 1) * 1000 / 108/2 MHz = 1,213,629.629... ns * which fits in 21 bits, so we'll use unsigned int for time arguments. */ static inline u16 count_to_clock_divider(unsigned int d) { if (d > RXCLK_RCD + 1) d = RXCLK_RCD; else if (d < 2) d = 1; else d--; return (u16) d; } static inline u16 carrier_freq_to_clock_divider(unsigned int freq) { return count_to_clock_divider( DIV_ROUND_CLOSEST(CX25840_IR_REFCLK_FREQ, freq * 16)); } static inline unsigned int clock_divider_to_carrier_freq(unsigned int divider) { return DIV_ROUND_CLOSEST(CX25840_IR_REFCLK_FREQ, (divider + 1) * 16); } static inline unsigned int clock_divider_to_freq(unsigned int divider, unsigned int rollovers) { return DIV_ROUND_CLOSEST(CX25840_IR_REFCLK_FREQ, (divider + 1) * rollovers); } /* * Low Pass Filter register calculations * * Note the largest count value of 0xffff corresponds to: * 0xffff * 1000 / 108/2 MHz = 1,213,611.11... ns * which fits in 21 bits, so we'll use unsigned int for time arguments. */ static inline u16 count_to_lpf_count(unsigned int d) { if (d > FILTR_LPF) d = FILTR_LPF; else if (d < 4) d = 0; return (u16) d; } static inline u16 ns_to_lpf_count(unsigned int ns) { return count_to_lpf_count( DIV_ROUND_CLOSEST(CX25840_IR_REFCLK_FREQ / 1000000 * ns, 1000)); } static inline unsigned int lpf_count_to_ns(unsigned int count) { /* Duration of the Low Pass Filter rejection window in ns */ return DIV_ROUND_CLOSEST(count * 1000, CX25840_IR_REFCLK_FREQ / 1000000); } static inline unsigned int lpf_count_to_us(unsigned int count) { /* Duration of the Low Pass Filter rejection window in us */ return DIV_ROUND_CLOSEST(count, CX25840_IR_REFCLK_FREQ / 1000000); } /* * FIFO register pulse width count computations */ static u32 clock_divider_to_resolution(u16 divider) { /* * Resolution is the duration of 1 tick of the readable portion of * the pulse width counter as read from the FIFO. The two lsb's are * not readable, hence the << 2. This function returns ns. */ return DIV_ROUND_CLOSEST((1 << 2) * ((u32) divider + 1) * 1000, CX25840_IR_REFCLK_FREQ / 1000000); } static u64 pulse_width_count_to_ns(u16 count, u16 divider) { u64 n; u32 rem; /* * The 2 lsb's of the pulse width timer count are not readable, hence * the (count << 2) | 0x3 */ n = (((u64) count << 2) | 0x3) * (divider + 1) * 1000; /* millicycles */ rem = do_div(n, CX25840_IR_REFCLK_FREQ / 1000000); /* / MHz => ns */ if (rem >= CX25840_IR_REFCLK_FREQ / 1000000 / 2) n++; return n; } #if 0 /* Keep as we will need this for Transmit functionality */ static u16 ns_to_pulse_width_count(u32 ns, u16 divider) { u64 n; u32 d; u32 rem; /* * The 2 lsb's of the pulse width timer count are not accessible, hence * the (1 << 2) */ n = ((u64) ns) * CX25840_IR_REFCLK_FREQ / 1000000; /* millicycles */ d = (1 << 2) * ((u32) divider + 1) * 1000; /* millicycles/count */ rem = do_div(n, d); if (rem >= d / 2) n++; if (n > FIFO_RXTX) n = FIFO_RXTX; else if (n == 0) n = 1; return (u16) n; } #endif static unsigned int pulse_width_count_to_us(u16 count, u16 divider) { u64 n; u32 rem; /* * The 2 lsb's of the pulse width timer count are not readable, hence * the (count << 2) | 0x3 */ n = (((u64) count << 2) | 0x3) * (divider + 1); /* cycles */ rem = do_div(n, CX25840_IR_REFCLK_FREQ / 1000000); /* / MHz => us */ if (rem >= CX25840_IR_REFCLK_FREQ / 1000000 / 2) n++; return (unsigned int) n; } /* * Pulse Clocks computations: Combined Pulse Width Count & Rx Clock Counts * * The total pulse clock count is an 18 bit pulse width timer count as the most * significant part and (up to) 16 bit clock divider count as a modulus. * When the Rx clock divider ticks down to 0, it increments the 18 bit pulse * width timer count's least significant bit. */ static u64 ns_to_pulse_clocks(u32 ns) { u64 clocks; u32 rem; clocks = CX25840_IR_REFCLK_FREQ / 1000000 * (u64) ns; /* millicycles */ rem = do_div(clocks, 1000); /* /1000 = cycles */ if (rem >= 1000 / 2) clocks++; return clocks; } static u16 pulse_clocks_to_clock_divider(u64 count) { do_div(count, (FIFO_RXTX << 2) | 0x3); /* net result needs to be rounded down and decremented by 1 */ if (count > RXCLK_RCD + 1) count = RXCLK_RCD; else if (count < 2) count = 1; else count--; return (u16) count; } /* * IR Control Register helpers */ enum tx_fifo_watermark { TX_FIFO_HALF_EMPTY = 0, TX_FIFO_EMPTY = CNTRL_TIC, }; enum rx_fifo_watermark { RX_FIFO_HALF_FULL = 0, RX_FIFO_NOT_EMPTY = CNTRL_RIC, }; static inline void control_tx_irq_watermark(struct i2c_client *c, enum tx_fifo_watermark level) { cx25840_and_or4(c, CX25840_IR_CNTRL_REG, ~CNTRL_TIC, level); } static inline void control_rx_irq_watermark(struct i2c_client *c, enum rx_fifo_watermark level) { cx25840_and_or4(c, CX25840_IR_CNTRL_REG, ~CNTRL_RIC, level); } static inline void control_tx_enable(struct i2c_client *c, bool enable) { cx25840_and_or4(c, CX25840_IR_CNTRL_REG, ~(CNTRL_TXE | CNTRL_TFE), enable ? (CNTRL_TXE | CNTRL_TFE) : 0); } static inline void control_rx_enable(struct i2c_client *c, bool enable) { cx25840_and_or4(c, CX25840_IR_CNTRL_REG, ~(CNTRL_RXE | CNTRL_RFE), enable ? (CNTRL_RXE | CNTRL_RFE) : 0); } static inline void control_tx_modulation_enable(struct i2c_client *c, bool enable) { cx25840_and_or4(c, CX25840_IR_CNTRL_REG, ~CNTRL_MOD, enable ? CNTRL_MOD : 0); } static inline void control_rx_demodulation_enable(struct i2c_client *c, bool enable) { cx25840_and_or4(c, CX25840_IR_CNTRL_REG, ~CNTRL_DMD, enable ? CNTRL_DMD : 0); } static inline void control_rx_s_edge_detection(struct i2c_client *c, u32 edge_types) { cx25840_and_or4(c, CX25840_IR_CNTRL_REG, ~CNTRL_EDG_BOTH, edge_types & CNTRL_EDG_BOTH); } static void control_rx_s_carrier_window(struct i2c_client *c, unsigned int carrier, unsigned int *carrier_range_low, unsigned int *carrier_range_high) { u32 v; unsigned int c16 = carrier * 16; if (*carrier_range_low < DIV_ROUND_CLOSEST(c16, 16 + 3)) { v = CNTRL_WIN_3_4; *carrier_range_low = DIV_ROUND_CLOSEST(c16, 16 + 4); } else { v = CNTRL_WIN_3_3; *carrier_range_low = DIV_ROUND_CLOSEST(c16, 16 + 3); } if (*carrier_range_high > DIV_ROUND_CLOSEST(c16, 16 - 3)) { v |= CNTRL_WIN_4_3; *carrier_range_high = DIV_ROUND_CLOSEST(c16, 16 - 4); } else { v |= CNTRL_WIN_3_3; *carrier_range_high = DIV_ROUND_CLOSEST(c16, 16 - 3); } cx25840_and_or4(c, CX25840_IR_CNTRL_REG, ~CNTRL_WIN, v); } static inline void control_tx_polarity_invert(struct i2c_client *c, bool invert) { cx25840_and_or4(c, CX25840_IR_CNTRL_REG, ~CNTRL_CPL, invert ? CNTRL_CPL : 0); } /* * IR Rx & Tx Clock Register helpers */ static unsigned int txclk_tx_s_carrier(struct i2c_client *c, unsigned int freq, u16 *divider) { *divider = carrier_freq_to_clock_divider(freq); cx25840_write4(c, CX25840_IR_TXCLK_REG, *divider); return clock_divider_to_carrier_freq(*divider); } static unsigned int rxclk_rx_s_carrier(struct i2c_client *c, unsigned int freq, u16 *divider) { *divider = carrier_freq_to_clock_divider(freq); cx25840_write4(c, CX25840_IR_RXCLK_REG, *divider); return clock_divider_to_carrier_freq(*divider); } static u32 txclk_tx_s_max_pulse_width(struct i2c_client *c, u32 ns, u16 *divider) { u64 pulse_clocks; if (ns > IR_MAX_DURATION) ns = IR_MAX_DURATION; pulse_clocks = ns_to_pulse_clocks(ns); *divider = pulse_clocks_to_clock_divider(pulse_clocks); cx25840_write4(c, CX25840_IR_TXCLK_REG, *divider); return (u32) pulse_width_count_to_ns(FIFO_RXTX, *divider); } static u32 rxclk_rx_s_max_pulse_width(struct i2c_client *c, u32 ns, u16 *divider) { u64 pulse_clocks; if (ns > IR_MAX_DURATION) ns = IR_MAX_DURATION; pulse_clocks = ns_to_pulse_clocks(ns); *divider = pulse_clocks_to_clock_divider(pulse_clocks); cx25840_write4(c, CX25840_IR_RXCLK_REG, *divider); return (u32) pulse_width_count_to_ns(FIFO_RXTX, *divider); } /* * IR Tx Carrier Duty Cycle register helpers */ static unsigned int cduty_tx_s_duty_cycle(struct i2c_client *c, unsigned int duty_cycle) { u32 n; n = DIV_ROUND_CLOSEST(duty_cycle * 100, 625); /* 16ths of 100% */ if (n != 0) n--; if (n > 15) n = 15; cx25840_write4(c, CX25840_IR_CDUTY_REG, n); return DIV_ROUND_CLOSEST((n + 1) * 100, 16); } /* * IR Filter Register helpers */ static u32 filter_rx_s_min_width(struct i2c_client *c, u32 min_width_ns) { u32 count = ns_to_lpf_count(min_width_ns); cx25840_write4(c, CX25840_IR_FILTR_REG, count); return lpf_count_to_ns(count); } /* * IR IRQ Enable Register helpers */ static inline void irqenable_rx(struct v4l2_subdev *sd, u32 mask) { struct cx25840_state *state = to_state(sd); if (is_cx23885(state) || is_cx23887(state)) mask ^= IRQEN_MSK; mask &= (IRQEN_RTE | IRQEN_ROE | IRQEN_RSE); cx25840_and_or4(state->c, CX25840_IR_IRQEN_REG, ~(IRQEN_RTE | IRQEN_ROE | IRQEN_RSE), mask); } static inline void irqenable_tx(struct v4l2_subdev *sd, u32 mask) { struct cx25840_state *state = to_state(sd); if (is_cx23885(state) || is_cx23887(state)) mask ^= IRQEN_MSK; mask &= IRQEN_TSE; cx25840_and_or4(state->c, CX25840_IR_IRQEN_REG, ~IRQEN_TSE, mask); } /* * V4L2 Subdevice IR Ops */ int cx25840_ir_irq_handler(struct v4l2_subdev *sd, u32 status, bool *handled) { struct cx25840_state *state = to_state(sd); struct cx25840_ir_state *ir_state = to_ir_state(sd); struct i2c_client *c = NULL; unsigned long flags; union cx25840_ir_fifo_rec rx_data[FIFO_RX_DEPTH]; unsigned int i, j, k; u32 events, v; int tsr, rsr, rto, ror, tse, rse, rte, roe, kror; u32 cntrl, irqen, stats; *handled = false; if (ir_state == NULL) return -ENODEV; c = ir_state->c; /* Only support the IR controller for the CX2388[57] AV Core for now */ if (!(is_cx23885(state) || is_cx23887(state))) return -ENODEV; cntrl = cx25840_read4(c, CX25840_IR_CNTRL_REG); irqen = cx25840_read4(c, CX25840_IR_IRQEN_REG); if (is_cx23885(state) || is_cx23887(state)) irqen ^= IRQEN_MSK; stats = cx25840_read4(c, CX25840_IR_STATS_REG); tsr = stats & STATS_TSR; /* Tx FIFO Service Request */ rsr = stats & STATS_RSR; /* Rx FIFO Service Request */ rto = stats & STATS_RTO; /* Rx Pulse Width Timer Time Out */ ror = stats & STATS_ROR; /* Rx FIFO Over Run */ tse = irqen & IRQEN_TSE; /* Tx FIFO Service Request IRQ Enable */ rse = irqen & IRQEN_RSE; /* Rx FIFO Service Request IRQ Enable */ rte = irqen & IRQEN_RTE; /* Rx Pulse Width Timer Time Out IRQ Enable */ roe = irqen & IRQEN_ROE; /* Rx FIFO Over Run IRQ Enable */ v4l2_dbg(2, ir_debug, sd, "IR IRQ Status: %s %s %s %s %s %s\n", tsr ? "tsr" : " ", rsr ? "rsr" : " ", rto ? "rto" : " ", ror ? "ror" : " ", stats & STATS_TBY ? "tby" : " ", stats & STATS_RBY ? "rby" : " "); v4l2_dbg(2, ir_debug, sd, "IR IRQ Enables: %s %s %s %s\n", tse ? "tse" : " ", rse ? "rse" : " ", rte ? "rte" : " ", roe ? "roe" : " "); /* * Transmitter interrupt service */ if (tse && tsr) { /* * TODO: * Check the watermark threshold setting * Pull FIFO_TX_DEPTH or FIFO_TX_DEPTH/2 entries from tx_kfifo * Push the data to the hardware FIFO. * If there was nothing more to send in the tx_kfifo, disable * the TSR IRQ and notify the v4l2_device. * If there was something in the tx_kfifo, check the tx_kfifo * level and notify the v4l2_device, if it is low. */ /* For now, inhibit TSR interrupt until Tx is implemented */ irqenable_tx(sd, 0); events = V4L2_SUBDEV_IR_TX_FIFO_SERVICE_REQ; v4l2_subdev_notify(sd, V4L2_SUBDEV_IR_TX_NOTIFY, &events); *handled = true; } /* * Receiver interrupt service */ kror = 0; if ((rse && rsr) || (rte && rto)) { /* * Receive data on RSR to clear the STATS_RSR. * Receive data on RTO, since we may not have yet hit the RSR * watermark when we receive the RTO. */ for (i = 0, v = FIFO_RX_NDV; (v & FIFO_RX_NDV) && !kror; i = 0) { for (j = 0; (v & FIFO_RX_NDV) && j < FIFO_RX_DEPTH; j++) { v = cx25840_read4(c, CX25840_IR_FIFO_REG); rx_data[i].hw_fifo_data = v & ~FIFO_RX_NDV; i++; } if (i == 0) break; j = i * sizeof(union cx25840_ir_fifo_rec); k = kfifo_in_locked(&ir_state->rx_kfifo, (unsigned char *) rx_data, j, &ir_state->rx_kfifo_lock); if (k != j) kror++; /* rx_kfifo over run */ } *handled = true; } events = 0; v = 0; if (kror) { events |= V4L2_SUBDEV_IR_RX_SW_FIFO_OVERRUN; v4l2_err(sd, "IR receiver software FIFO overrun\n"); } if (roe && ror) { /* * The RX FIFO Enable (CNTRL_RFE) must be toggled to clear * the Rx FIFO Over Run status (STATS_ROR) */ v |= CNTRL_RFE; events |= V4L2_SUBDEV_IR_RX_HW_FIFO_OVERRUN; v4l2_err(sd, "IR receiver hardware FIFO overrun\n"); } if (rte && rto) { /* * The IR Receiver Enable (CNTRL_RXE) must be toggled to clear * the Rx Pulse Width Timer Time Out (STATS_RTO) */ v |= CNTRL_RXE; events |= V4L2_SUBDEV_IR_RX_END_OF_RX_DETECTED; } if (v) { /* Clear STATS_ROR & STATS_RTO as needed by resetting hardware */ cx25840_write4(c, CX25840_IR_CNTRL_REG, cntrl & ~v); cx25840_write4(c, CX25840_IR_CNTRL_REG, cntrl); *handled = true; } spin_lock_irqsave(&ir_state->rx_kfifo_lock, flags); if (kfifo_len(&ir_state->rx_kfifo) >= CX25840_IR_RX_KFIFO_SIZE / 2) events |= V4L2_SUBDEV_IR_RX_FIFO_SERVICE_REQ; spin_unlock_irqrestore(&ir_state->rx_kfifo_lock, flags); if (events) v4l2_subdev_notify(sd, V4L2_SUBDEV_IR_RX_NOTIFY, &events); return 0; } /* Receiver */ static int cx25840_ir_rx_read(struct v4l2_subdev *sd, u8 *buf, size_t count, ssize_t *num) { struct cx25840_ir_state *ir_state = to_ir_state(sd); bool invert; u16 divider; unsigned int i, n; union cx25840_ir_fifo_rec *p; unsigned u, v, w; if (ir_state == NULL) return -ENODEV; invert = (bool) atomic_read(&ir_state->rx_invert); divider = (u16) atomic_read(&ir_state->rxclk_divider); n = count / sizeof(union cx25840_ir_fifo_rec) * sizeof(union cx25840_ir_fifo_rec); if (n == 0) { *num = 0; return 0; } n = kfifo_out_locked(&ir_state->rx_kfifo, buf, n, &ir_state->rx_kfifo_lock); n /= sizeof(union cx25840_ir_fifo_rec); *num = n * sizeof(union cx25840_ir_fifo_rec); for (p = (union cx25840_ir_fifo_rec *) buf, i = 0; i < n; p++, i++) { if ((p->hw_fifo_data & FIFO_RXTX_RTO) == FIFO_RXTX_RTO) { /* Assume RTO was because of no IR light input */ u = 0; w = 1; } else { u = (p->hw_fifo_data & FIFO_RXTX_LVL) ? 1 : 0; if (invert) u = u ? 0 : 1; w = 0; } v = (unsigned) pulse_width_count_to_ns( (u16)(p->hw_fifo_data & FIFO_RXTX), divider) / 1000; if (v > IR_MAX_DURATION) v = IR_MAX_DURATION; p->ir_core_data = (struct ir_raw_event) { .pulse = u, .duration = v, .timeout = w }; v4l2_dbg(2, ir_debug, sd, "rx read: %10u ns %s %s\n", v, u ? "mark" : "space", w ? "(timed out)" : ""); if (w) v4l2_dbg(2, ir_debug, sd, "rx read: end of rx\n"); } return 0; } static int cx25840_ir_rx_g_parameters(struct v4l2_subdev *sd, struct v4l2_subdev_ir_parameters *p) { struct cx25840_ir_state *ir_state = to_ir_state(sd); if (ir_state == NULL) return -ENODEV; mutex_lock(&ir_state->rx_params_lock); memcpy(p, &ir_state->rx_params, sizeof(struct v4l2_subdev_ir_parameters)); mutex_unlock(&ir_state->rx_params_lock); return 0; } static int cx25840_ir_rx_shutdown(struct v4l2_subdev *sd) { struct cx25840_ir_state *ir_state = to_ir_state(sd); struct i2c_client *c; if (ir_state == NULL) return -ENODEV; c = ir_state->c; mutex_lock(&ir_state->rx_params_lock); /* Disable or slow down all IR Rx circuits and counters */ irqenable_rx(sd, 0); control_rx_enable(c, false); control_rx_demodulation_enable(c, false); control_rx_s_edge_detection(c, CNTRL_EDG_NONE); filter_rx_s_min_width(c, 0); cx25840_write4(c, CX25840_IR_RXCLK_REG, RXCLK_RCD); ir_state->rx_params.shutdown = true; mutex_unlock(&ir_state->rx_params_lock); return 0; } static int cx25840_ir_rx_s_parameters(struct v4l2_subdev *sd, struct v4l2_subdev_ir_parameters *p) { struct cx25840_ir_state *ir_state = to_ir_state(sd); struct i2c_client *c; struct v4l2_subdev_ir_parameters *o; u16 rxclk_divider; if (ir_state == NULL) return -ENODEV; if (p->shutdown) return cx25840_ir_rx_shutdown(sd); if (p->mode != V4L2_SUBDEV_IR_MODE_PULSE_WIDTH) return -ENOSYS; c = ir_state->c; o = &ir_state->rx_params; mutex_lock(&ir_state->rx_params_lock); o->shutdown = p->shutdown; p->mode = V4L2_SUBDEV_IR_MODE_PULSE_WIDTH; o->mode = p->mode; p->bytes_per_data_element = sizeof(union cx25840_ir_fifo_rec); o->bytes_per_data_element = p->bytes_per_data_element; /* Before we tweak the hardware, we have to disable the receiver */ irqenable_rx(sd, 0); control_rx_enable(c, false); control_rx_demodulation_enable(c, p->modulation); o->modulation = p->modulation; if (p->modulation) { p->carrier_freq = rxclk_rx_s_carrier(c, p->carrier_freq, &rxclk_divider); o->carrier_freq = p->carrier_freq; p->duty_cycle = 50; o->duty_cycle = p->duty_cycle; control_rx_s_carrier_window(c, p->carrier_freq, &p->carrier_range_lower, &p->carrier_range_upper); o->carrier_range_lower = p->carrier_range_lower; o->carrier_range_upper = p->carrier_range_upper; p->max_pulse_width = (u32) pulse_width_count_to_ns(FIFO_RXTX, rxclk_divider); } else { p->max_pulse_width = rxclk_rx_s_max_pulse_width(c, p->max_pulse_width, &rxclk_divider); } o->max_pulse_width = p->max_pulse_width; atomic_set(&ir_state->rxclk_divider, rxclk_divider); p->noise_filter_min_width = filter_rx_s_min_width(c, p->noise_filter_min_width); o->noise_filter_min_width = p->noise_filter_min_width; p->resolution = clock_divider_to_resolution(rxclk_divider); o->resolution = p->resolution; /* FIXME - make this dependent on resolution for better performance */ control_rx_irq_watermark(c, RX_FIFO_HALF_FULL); control_rx_s_edge_detection(c, CNTRL_EDG_BOTH); o->invert_level = p->invert_level; atomic_set(&ir_state->rx_invert, p->invert_level); o->interrupt_enable = p->interrupt_enable; o->enable = p->enable; if (p->enable) { unsigned long flags; spin_lock_irqsave(&ir_state->rx_kfifo_lock, flags); kfifo_reset(&ir_state->rx_kfifo); spin_unlock_irqrestore(&ir_state->rx_kfifo_lock, flags); if (p->interrupt_enable) irqenable_rx(sd, IRQEN_RSE | IRQEN_RTE | IRQEN_ROE); control_rx_enable(c, p->enable); } mutex_unlock(&ir_state->rx_params_lock); return 0; } /* Transmitter */ static int cx25840_ir_tx_write(struct v4l2_subdev *sd, u8 *buf, size_t count, ssize_t *num) { struct cx25840_ir_state *ir_state = to_ir_state(sd); if (ir_state == NULL) return -ENODEV; #if 0 /* * FIXME - the code below is an incomplete and untested sketch of what * may need to be done. The critical part is to get 4 (or 8) pulses * from the tx_kfifo, or converted from ns to the proper units from the * input, and push them off to the hardware Tx FIFO right away, if the * HW TX fifo needs service. The rest can be pushed to the tx_kfifo in * a less critical timeframe. Also watch out for overruning the * tx_kfifo - don't let it happen and let the caller know not all his * pulses were written. */ u32 *ns_pulse = (u32 *) buf; unsigned int n; u32 fifo_pulse[FIFO_TX_DEPTH]; u32 mark; /* Compute how much we can fit in the tx kfifo */ n = CX25840_IR_TX_KFIFO_SIZE - kfifo_len(ir_state->tx_kfifo); n = min(n, (unsigned int) count); n /= sizeof(u32); /* FIXME - turn on Tx Fifo service interrupt * check hardware fifo level, and other stuff */ for (i = 0; i < n; ) { for (j = 0; j < FIFO_TX_DEPTH / 2 && i < n; j++) { mark = ns_pulse[i] & LEVEL_MASK; fifo_pulse[j] = ns_to_pulse_width_count( ns_pulse[i] & ~LEVEL_MASK, ir_state->txclk_divider); if (mark) fifo_pulse[j] &= FIFO_RXTX_LVL; i++; } kfifo_put(ir_state->tx_kfifo, (u8 *) fifo_pulse, j * sizeof(u32)); } *num = n * sizeof(u32); #else /* For now enable the Tx FIFO Service interrupt & pretend we did work */ irqenable_tx(sd, IRQEN_TSE); *num = count; #endif return 0; } static int cx25840_ir_tx_g_parameters(struct v4l2_subdev *sd, struct v4l2_subdev_ir_parameters *p) { struct cx25840_ir_state *ir_state = to_ir_state(sd); if (ir_state == NULL) return -ENODEV; mutex_lock(&ir_state->tx_params_lock); memcpy(p, &ir_state->tx_params, sizeof(struct v4l2_subdev_ir_parameters)); mutex_unlock(&ir_state->tx_params_lock); return 0; } static int cx25840_ir_tx_shutdown(struct v4l2_subdev *sd) { struct cx25840_ir_state *ir_state = to_ir_state(sd); struct i2c_client *c; if (ir_state == NULL) return -ENODEV; c = ir_state->c; mutex_lock(&ir_state->tx_params_lock); /* Disable or slow down all IR Tx circuits and counters */ irqenable_tx(sd, 0); control_tx_enable(c, false); control_tx_modulation_enable(c, false); cx25840_write4(c, CX25840_IR_TXCLK_REG, TXCLK_TCD); ir_state->tx_params.shutdown = true; mutex_unlock(&ir_state->tx_params_lock); return 0; } static int cx25840_ir_tx_s_parameters(struct v4l2_subdev *sd, struct v4l2_subdev_ir_parameters *p) { struct cx25840_ir_state *ir_state = to_ir_state(sd); struct i2c_client *c; struct v4l2_subdev_ir_parameters *o; u16 txclk_divider; if (ir_state == NULL) return -ENODEV; if (p->shutdown) return cx25840_ir_tx_shutdown(sd); if (p->mode != V4L2_SUBDEV_IR_MODE_PULSE_WIDTH) return -ENOSYS; c = ir_state->c; o = &ir_state->tx_params; mutex_lock(&ir_state->tx_params_lock); o->shutdown = p->shutdown; p->mode = V4L2_SUBDEV_IR_MODE_PULSE_WIDTH; o->mode = p->mode; p->bytes_per_data_element = sizeof(union cx25840_ir_fifo_rec); o->bytes_per_data_element = p->bytes_per_data_element; /* Before we tweak the hardware, we have to disable the transmitter */ irqenable_tx(sd, 0); control_tx_enable(c, false); control_tx_modulation_enable(c, p->modulation); o->modulation = p->modulation; if (p->modulation) { p->carrier_freq = txclk_tx_s_carrier(c, p->carrier_freq, &txclk_divider); o->carrier_freq = p->carrier_freq; p->duty_cycle = cduty_tx_s_duty_cycle(c, p->duty_cycle); o->duty_cycle = p->duty_cycle; p->max_pulse_width = (u32) pulse_width_count_to_ns(FIFO_RXTX, txclk_divider); } else { p->max_pulse_width = txclk_tx_s_max_pulse_width(c, p->max_pulse_width, &txclk_divider); } o->max_pulse_width = p->max_pulse_width; atomic_set(&ir_state->txclk_divider, txclk_divider); p->resolution = clock_divider_to_resolution(txclk_divider); o->resolution = p->resolution; /* FIXME - make this dependent on resolution for better performance */ control_tx_irq_watermark(c, TX_FIFO_HALF_EMPTY); control_tx_polarity_invert(c, p->invert_carrier_sense); o->invert_carrier_sense = p->invert_carrier_sense; /* * FIXME: we don't have hardware help for IO pin level inversion * here like we have on the CX23888. * Act on this with some mix of logical inversion of data levels, * carrier polarity, and carrier duty cycle. */ o->invert_level = p->invert_level; o->interrupt_enable = p->interrupt_enable; o->enable = p->enable; if (p->enable) { /* reset tx_fifo here */ if (p->interrupt_enable) irqenable_tx(sd, IRQEN_TSE); control_tx_enable(c, p->enable); } mutex_unlock(&ir_state->tx_params_lock); return 0; } /* * V4L2 Subdevice Core Ops support */ int cx25840_ir_log_status(struct v4l2_subdev *sd) { struct cx25840_state *state = to_state(sd); struct i2c_client *c = state->c; char *s; int i, j; u32 cntrl, txclk, rxclk, cduty, stats, irqen, filtr; /* The CX23888 chip doesn't have an IR controller on the A/V core */ if (is_cx23888(state)) return 0; cntrl = cx25840_read4(c, CX25840_IR_CNTRL_REG); txclk = cx25840_read4(c, CX25840_IR_TXCLK_REG) & TXCLK_TCD; rxclk = cx25840_read4(c, CX25840_IR_RXCLK_REG) & RXCLK_RCD; cduty = cx25840_read4(c, CX25840_IR_CDUTY_REG) & CDUTY_CDC; stats = cx25840_read4(c, CX25840_IR_STATS_REG); irqen = cx25840_read4(c, CX25840_IR_IRQEN_REG); if (is_cx23885(state) || is_cx23887(state)) irqen ^= IRQEN_MSK; filtr = cx25840_read4(c, CX25840_IR_FILTR_REG) & FILTR_LPF; v4l2_info(sd, "IR Receiver:\n"); v4l2_info(sd, "\tEnabled: %s\n", cntrl & CNTRL_RXE ? "yes" : "no"); v4l2_info(sd, "\tDemodulation from a carrier: %s\n", cntrl & CNTRL_DMD ? "enabled" : "disabled"); v4l2_info(sd, "\tFIFO: %s\n", cntrl & CNTRL_RFE ? "enabled" : "disabled"); switch (cntrl & CNTRL_EDG) { case CNTRL_EDG_NONE: s = "disabled"; break; case CNTRL_EDG_FALL: s = "falling edge"; break; case CNTRL_EDG_RISE: s = "rising edge"; break; case CNTRL_EDG_BOTH: s = "rising & falling edges"; break; default: s = "??? edge"; break; } v4l2_info(sd, "\tPulse timers' start/stop trigger: %s\n", s); v4l2_info(sd, "\tFIFO data on pulse timer overflow: %s\n", cntrl & CNTRL_R ? "not loaded" : "overflow marker"); v4l2_info(sd, "\tFIFO interrupt watermark: %s\n", cntrl & CNTRL_RIC ? "not empty" : "half full or greater"); v4l2_info(sd, "\tLoopback mode: %s\n", cntrl & CNTRL_LBM ? "loopback active" : "normal receive"); if (cntrl & CNTRL_DMD) { v4l2_info(sd, "\tExpected carrier (16 clocks): %u Hz\n", clock_divider_to_carrier_freq(rxclk)); switch (cntrl & CNTRL_WIN) { case CNTRL_WIN_3_3: i = 3; j = 3; break; case CNTRL_WIN_4_3: i = 4; j = 3; break; case CNTRL_WIN_3_4: i = 3; j = 4; break; case CNTRL_WIN_4_4: i = 4; j = 4; break; default: i = 0; j = 0; break; } v4l2_info(sd, "\tNext carrier edge window: 16 clocks -%1d/+%1d, %u to %u Hz\n", i, j, clock_divider_to_freq(rxclk, 16 + j), clock_divider_to_freq(rxclk, 16 - i)); } v4l2_info(sd, "\tMax measurable pulse width: %u us, %llu ns\n", pulse_width_count_to_us(FIFO_RXTX, rxclk), pulse_width_count_to_ns(FIFO_RXTX, rxclk)); v4l2_info(sd, "\tLow pass filter: %s\n", filtr ? "enabled" : "disabled"); if (filtr) v4l2_info(sd, "\tMin acceptable pulse width (LPF): %u us, %u ns\n", lpf_count_to_us(filtr), lpf_count_to_ns(filtr)); v4l2_info(sd, "\tPulse width timer timed-out: %s\n", stats & STATS_RTO ? "yes" : "no"); v4l2_info(sd, "\tPulse width timer time-out intr: %s\n", irqen & IRQEN_RTE ? "enabled" : "disabled"); v4l2_info(sd, "\tFIFO overrun: %s\n", stats & STATS_ROR ? "yes" : "no"); v4l2_info(sd, "\tFIFO overrun interrupt: %s\n", irqen & IRQEN_ROE ? "enabled" : "disabled"); v4l2_info(sd, "\tBusy: %s\n", stats & STATS_RBY ? "yes" : "no"); v4l2_info(sd, "\tFIFO service requested: %s\n", stats & STATS_RSR ? "yes" : "no"); v4l2_info(sd, "\tFIFO service request interrupt: %s\n", irqen & IRQEN_RSE ? "enabled" : "disabled"); v4l2_info(sd, "IR Transmitter:\n"); v4l2_info(sd, "\tEnabled: %s\n", cntrl & CNTRL_TXE ? "yes" : "no"); v4l2_info(sd, "\tModulation onto a carrier: %s\n", cntrl & CNTRL_MOD ? "enabled" : "disabled"); v4l2_info(sd, "\tFIFO: %s\n", cntrl & CNTRL_TFE ? "enabled" : "disabled"); v4l2_info(sd, "\tFIFO interrupt watermark: %s\n", cntrl & CNTRL_TIC ? "not empty" : "half full or less"); v4l2_info(sd, "\tCarrier polarity: %s\n", cntrl & CNTRL_CPL ? "space:burst mark:noburst" : "space:noburst mark:burst"); if (cntrl & CNTRL_MOD) { v4l2_info(sd, "\tCarrier (16 clocks): %u Hz\n", clock_divider_to_carrier_freq(txclk)); v4l2_info(sd, "\tCarrier duty cycle: %2u/16\n", cduty + 1); } v4l2_info(sd, "\tMax pulse width: %u us, %llu ns\n", pulse_width_count_to_us(FIFO_RXTX, txclk), pulse_width_count_to_ns(FIFO_RXTX, txclk)); v4l2_info(sd, "\tBusy: %s\n", stats & STATS_TBY ? "yes" : "no"); v4l2_info(sd, "\tFIFO service requested: %s\n", stats & STATS_TSR ? "yes" : "no"); v4l2_info(sd, "\tFIFO service request interrupt: %s\n", irqen & IRQEN_TSE ? "enabled" : "disabled"); return 0; } const struct v4l2_subdev_ir_ops cx25840_ir_ops = { .rx_read = cx25840_ir_rx_read, .rx_g_parameters = cx25840_ir_rx_g_parameters, .rx_s_parameters = cx25840_ir_rx_s_parameters, .tx_write = cx25840_ir_tx_write, .tx_g_parameters = cx25840_ir_tx_g_parameters, .tx_s_parameters = cx25840_ir_tx_s_parameters, }; static const struct v4l2_subdev_ir_parameters default_rx_params = { .bytes_per_data_element = sizeof(union cx25840_ir_fifo_rec), .mode = V4L2_SUBDEV_IR_MODE_PULSE_WIDTH, .enable = false, .interrupt_enable = false, .shutdown = true, .modulation = true, .carrier_freq = 36000, /* 36 kHz - RC-5, and RC-6 carrier */ /* RC-5: 666,667 ns = 1/36 kHz * 32 cycles * 1 mark * 0.75 */ /* RC-6: 333,333 ns = 1/36 kHz * 16 cycles * 1 mark * 0.75 */ .noise_filter_min_width = 333333, /* ns */ .carrier_range_lower = 35000, .carrier_range_upper = 37000, .invert_level = false, }; static const struct v4l2_subdev_ir_parameters default_tx_params = { .bytes_per_data_element = sizeof(union cx25840_ir_fifo_rec), .mode = V4L2_SUBDEV_IR_MODE_PULSE_WIDTH, .enable = false, .interrupt_enable = false, .shutdown = true, .modulation = true, .carrier_freq = 36000, /* 36 kHz - RC-5 carrier */ .duty_cycle = 25, /* 25 % - RC-5 carrier */ .invert_level = false, .invert_carrier_sense = false, }; int cx25840_ir_probe(struct v4l2_subdev *sd) { struct cx25840_state *state = to_state(sd); struct cx25840_ir_state *ir_state; struct v4l2_subdev_ir_parameters default_params; /* Only init the IR controller for the CX2388[57] AV Core for now */ if (!(is_cx23885(state) || is_cx23887(state))) return 0; ir_state = devm_kzalloc(&state->c->dev, sizeof(*ir_state), GFP_KERNEL); if (ir_state == NULL) return -ENOMEM; spin_lock_init(&ir_state->rx_kfifo_lock); if (kfifo_alloc(&ir_state->rx_kfifo, CX25840_IR_RX_KFIFO_SIZE, GFP_KERNEL)) return -ENOMEM; ir_state->c = state->c; state->ir_state = ir_state; /* Ensure no interrupts arrive yet */ if (is_cx23885(state) || is_cx23887(state)) cx25840_write4(ir_state->c, CX25840_IR_IRQEN_REG, IRQEN_MSK); else cx25840_write4(ir_state->c, CX25840_IR_IRQEN_REG, 0); mutex_init(&ir_state->rx_params_lock); default_params = default_rx_params; v4l2_subdev_call(sd, ir, rx_s_parameters, &default_params); mutex_init(&ir_state->tx_params_lock); default_params = default_tx_params; v4l2_subdev_call(sd, ir, tx_s_parameters, &default_params); return 0; } int cx25840_ir_remove(struct v4l2_subdev *sd) { struct cx25840_state *state = to_state(sd); struct cx25840_ir_state *ir_state = to_ir_state(sd); if (ir_state == NULL) return -ENODEV; cx25840_ir_rx_shutdown(sd); cx25840_ir_tx_shutdown(sd); kfifo_free(&ir_state->rx_kfifo); state->ir_state = NULL; return 0; }
41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 // SPDX-License-Identifier: GPL-2.0+ /* * linux/net/sunrpc/gss_rpc_upcall.c * * Copyright (C) 2012 Simo Sorce <simo@redhat.com> */ #include <linux/types.h> #include <linux/un.h> #include <linux/sunrpc/svcauth.h> #include "gss_rpc_upcall.h" #define GSSPROXY_SOCK_PATHNAME "/var/run/gssproxy.sock" #define GSSPROXY_PROGRAM (400112u) #define GSSPROXY_VERS_1 (1u) /* * Encoding/Decoding functions */ enum { GSSX_NULL = 0, /* Unused */ GSSX_INDICATE_MECHS = 1, GSSX_GET_CALL_CONTEXT = 2, GSSX_IMPORT_AND_CANON_NAME = 3, GSSX_EXPORT_CRED = 4, GSSX_IMPORT_CRED = 5, GSSX_ACQUIRE_CRED = 6, GSSX_STORE_CRED = 7, GSSX_INIT_SEC_CONTEXT = 8, GSSX_ACCEPT_SEC_CONTEXT = 9, GSSX_RELEASE_HANDLE = 10, GSSX_GET_MIC = 11, GSSX_VERIFY = 12, GSSX_WRAP = 13, GSSX_UNWRAP = 14, GSSX_WRAP_SIZE_LIMIT = 15, }; #define PROC(proc, name) \ [GSSX_##proc] = { \ .p_proc = GSSX_##proc, \ .p_encode = gssx_enc_##name, \ .p_decode = gssx_dec_##name, \ .p_arglen = GSSX_ARG_##name##_sz, \ .p_replen = GSSX_RES_##name##_sz, \ .p_statidx = GSSX_##proc, \ .p_name = #proc, \ } static const struct rpc_procinfo gssp_procedures[] = { PROC(INDICATE_MECHS, indicate_mechs), PROC(GET_CALL_CONTEXT, get_call_context), PROC(IMPORT_AND_CANON_NAME, import_and_canon_name), PROC(EXPORT_CRED, export_cred), PROC(IMPORT_CRED, import_cred), PROC(ACQUIRE_CRED, acquire_cred), PROC(STORE_CRED, store_cred), PROC(INIT_SEC_CONTEXT, init_sec_context), PROC(ACCEPT_SEC_CONTEXT, accept_sec_context), PROC(RELEASE_HANDLE, release_handle), PROC(GET_MIC, get_mic), PROC(VERIFY, verify), PROC(WRAP, wrap), PROC(UNWRAP, unwrap), PROC(WRAP_SIZE_LIMIT, wrap_size_limit), }; /* * Common transport functions */ static const struct rpc_program gssp_program; static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) { static const struct sockaddr_un gssp_localaddr = { .sun_family = AF_LOCAL, .sun_path = GSSPROXY_SOCK_PATHNAME, }; struct rpc_create_args args = { .net = net, .protocol = XPRT_TRANSPORT_LOCAL, .address = (struct sockaddr *)&gssp_localaddr, .addrsize = sizeof(gssp_localaddr), .servername = "localhost", .program = &gssp_program, .version = GSSPROXY_VERS_1, .authflavor = RPC_AUTH_NULL, /* * Note we want connection to be done in the caller's * filesystem namespace. We therefore turn off the idle * timeout, which would result in reconnections being * done without the correct namespace: */ .flags = RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_CONNECTED | RPC_CLNT_CREATE_NO_IDLE_TIMEOUT }; struct rpc_clnt *clnt; int result = 0; clnt = rpc_create(&args); if (IS_ERR(clnt)) { dprintk("RPC: failed to create AF_LOCAL gssproxy " "client (errno %ld).\n", PTR_ERR(clnt)); result = PTR_ERR(clnt); *_clnt = NULL; goto out; } dprintk("RPC: created new gssp local client (gssp_local_clnt: " "%p)\n", clnt); *_clnt = clnt; out: return result; } void init_gssp_clnt(struct sunrpc_net *sn) { mutex_init(&sn->gssp_lock); sn->gssp_clnt = NULL; } int set_gssp_clnt(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_clnt *clnt; int ret; mutex_lock(&sn->gssp_lock); ret = gssp_rpc_create(net, &clnt); if (!ret) { if (sn->gssp_clnt) rpc_shutdown_client(sn->gssp_clnt); sn->gssp_clnt = clnt; } mutex_unlock(&sn->gssp_lock); return ret; } void clear_gssp_clnt(struct sunrpc_net *sn) { mutex_lock(&sn->gssp_lock); if (sn->gssp_clnt) { rpc_shutdown_client(sn->gssp_clnt); sn->gssp_clnt = NULL; } mutex_unlock(&sn->gssp_lock); } static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn) { struct rpc_clnt *clnt; mutex_lock(&sn->gssp_lock); clnt = sn->gssp_clnt; if (clnt) refcount_inc(&clnt->cl_count); mutex_unlock(&sn->gssp_lock); return clnt; } static int gssp_call(struct net *net, struct rpc_message *msg) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_clnt *clnt; int status; clnt = get_gssp_clnt(sn); if (!clnt) return -EIO; status = rpc_call_sync(clnt, msg, 0); if (status < 0) { dprintk("gssp: rpc_call returned error %d\n", -status); switch (status) { case -EPROTONOSUPPORT: status = -EINVAL; break; case -ECONNREFUSED: case -ETIMEDOUT: case -ENOTCONN: status = -EAGAIN; break; case -ERESTARTSYS: if (signalled ()) status = -EINTR; break; default: break; } } rpc_release_client(clnt); return status; } static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) { unsigned int i; for (i = 0; i < arg->npages && arg->pages[i]; i++) __free_page(arg->pages[i]); kfree(arg->pages); } static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) { unsigned int i; arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE); arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL); if (!arg->pages) return -ENOMEM; for (i = 0; i < arg->npages; i++) { arg->pages[i] = alloc_page(GFP_KERNEL); if (!arg->pages[i]) { gssp_free_receive_pages(arg); return -ENOMEM; } } return 0; } static char *gssp_stringify(struct xdr_netobj *netobj) { return kmemdup_nul(netobj->data, netobj->len, GFP_KERNEL); } static void gssp_hostbased_service(char **principal) { char *c; if (!*principal) return; /* terminate and remove realm part */ c = strchr(*principal, '@'); if (c) { *c = '\0'; /* change service-hostname delimiter */ c = strchr(*principal, '/'); if (c) *c = '@'; } if (!c) { /* not a service principal */ kfree(*principal); *principal = NULL; } } /* * Public functions */ /* numbers somewhat arbitrary but large enough for current needs */ #define GSSX_MAX_OUT_HANDLE 128 #define GSSX_MAX_SRC_PRINC 256 #define GSSX_KMEMBUF (GSSX_max_output_handle_sz + \ GSSX_max_oid_sz + \ GSSX_max_princ_sz + \ sizeof(struct svc_cred)) int gssp_accept_sec_context_upcall(struct net *net, struct gssp_upcall_data *data) { struct gssx_ctx ctxh = { .state = data->in_handle }; struct gssx_arg_accept_sec_context arg = { .input_token = data->in_token, }; struct gssx_ctx rctxh = { /* * pass in the max length we expect for each of these * buffers but let the xdr code kmalloc them: */ .exported_context_token.len = GSSX_max_output_handle_sz, .mech.len = GSS_OID_MAX_LEN, .targ_name.display_name.len = GSSX_max_princ_sz, .src_name.display_name.len = GSSX_max_princ_sz }; struct gssx_res_accept_sec_context res = { .context_handle = &rctxh, .output_token = &data->out_token }; struct rpc_message msg = { .rpc_proc = &gssp_procedures[GSSX_ACCEPT_SEC_CONTEXT], .rpc_argp = &arg, .rpc_resp = &res, .rpc_cred = NULL, /* FIXME ? */ }; struct xdr_netobj client_name = { 0 , NULL }; struct xdr_netobj target_name = { 0, NULL }; int ret; if (data->in_handle.len != 0) arg.context_handle = &ctxh; res.output_token->len = GSSX_max_output_token_sz; ret = gssp_alloc_receive_pages(&arg); if (ret) return ret; ret = gssp_call(net, &msg); gssp_free_receive_pages(&arg); /* we need to fetch all data even in case of error so * that we can free special strctures is they have been allocated */ data->major_status = res.status.major_status; data->minor_status = res.status.minor_status; if (res.context_handle) { data->out_handle = rctxh.exported_context_token; data->mech_oid.len = rctxh.mech.len; if (rctxh.mech.data) { memcpy(data->mech_oid.data, rctxh.mech.data, data->mech_oid.len); kfree(rctxh.mech.data); } client_name = rctxh.src_name.display_name; target_name = rctxh.targ_name.display_name; } if (res.options.count == 1) { gssx_buffer *value = &res.options.data[0].value; /* Currently we only decode CREDS_VALUE, if we add * anything else we'll have to loop and match on the * option name */ if (value->len == 1) { /* steal group info from struct svc_cred */ data->creds = *(struct svc_cred *)value->data; data->found_creds = 1; } /* whether we use it or not, free data */ kfree(value->data); } if (res.options.count != 0) { kfree(res.options.data); } /* convert to GSS_NT_HOSTBASED_SERVICE form and set into creds */ if (data->found_creds) { if (client_name.data) { data->creds.cr_raw_principal = gssp_stringify(&client_name); data->creds.cr_principal = gssp_stringify(&client_name); gssp_hostbased_service(&data->creds.cr_principal); } if (target_name.data) { data->creds.cr_targ_princ = gssp_stringify(&target_name); gssp_hostbased_service(&data->creds.cr_targ_princ); } } kfree(client_name.data); kfree(target_name.data); return ret; } void gssp_free_upcall_data(struct gssp_upcall_data *data) { kfree(data->in_handle.data); kfree(data->out_handle.data); kfree(data->out_token.data); free_svc_cred(&data->creds); } /* * Initialization stuff */ static unsigned int gssp_version1_counts[ARRAY_SIZE(gssp_procedures)]; static const struct rpc_version gssp_version1 = { .number = GSSPROXY_VERS_1, .nrprocs = ARRAY_SIZE(gssp_procedures), .procs = gssp_procedures, .counts = gssp_version1_counts, }; static const struct rpc_version *gssp_version[] = { NULL, &gssp_version1, }; static struct rpc_stat gssp_stats; static const struct rpc_program gssp_program = { .name = "gssproxy", .number = GSSPROXY_PROGRAM, .nrvers = ARRAY_SIZE(gssp_version), .version = gssp_version, .stats = &gssp_stats, };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 /* SPDX-License-Identifier: GPL-2.0-only */ /* * linux/include/linux/clk.h * * Copyright (C) 2004 ARM Limited. * Written by Deep Blue Solutions Limited. * Copyright (C) 2011-2012 Linaro Ltd <mturquette@linaro.org> */ #ifndef __LINUX_CLK_H #define __LINUX_CLK_H #include <linux/err.h> #include <linux/kernel.h> #include <linux/notifier.h> struct device; struct clk; struct device_node; struct of_phandle_args; /** * DOC: clk notifier callback types * * PRE_RATE_CHANGE - called immediately before the clk rate is changed, * to indicate that the rate change will proceed. Drivers must * immediately terminate any operations that will be affected by the * rate change. Callbacks may either return NOTIFY_DONE, NOTIFY_OK, * NOTIFY_STOP or NOTIFY_BAD. * * ABORT_RATE_CHANGE: called if the rate change failed for some reason * after PRE_RATE_CHANGE. In this case, all registered notifiers on * the clk will be called with ABORT_RATE_CHANGE. Callbacks must * always return NOTIFY_DONE or NOTIFY_OK. * * POST_RATE_CHANGE - called after the clk rate change has successfully * completed. Callbacks must always return NOTIFY_DONE or NOTIFY_OK. * */ #define PRE_RATE_CHANGE BIT(0) #define POST_RATE_CHANGE BIT(1) #define ABORT_RATE_CHANGE BIT(2) /** * struct clk_notifier - associate a clk with a notifier * @clk: struct clk * to associate the notifier with * @notifier_head: a blocking_notifier_head for this clk * @node: linked list pointers * * A list of struct clk_notifier is maintained by the notifier code. * An entry is created whenever code registers the first notifier on a * particular @clk. Future notifiers on that @clk are added to the * @notifier_head. */ struct clk_notifier { struct clk *clk; struct srcu_notifier_head notifier_head; struct list_head node; }; /** * struct clk_notifier_data - rate data to pass to the notifier callback * @clk: struct clk * being changed * @old_rate: previous rate of this clk * @new_rate: new rate of this clk * * For a pre-notifier, old_rate is the clk's rate before this rate * change, and new_rate is what the rate will be in the future. For a * post-notifier, old_rate and new_rate are both set to the clk's * current rate (this was done to optimize the implementation). */ struct clk_notifier_data { struct clk *clk; unsigned long old_rate; unsigned long new_rate; }; /** * struct clk_bulk_data - Data used for bulk clk operations. * * @id: clock consumer ID * @clk: struct clk * to store the associated clock * * The CLK APIs provide a series of clk_bulk_() API calls as * a convenience to consumers which require multiple clks. This * structure is used to manage data for these calls. */ struct clk_bulk_data { const char *id; struct clk *clk; }; #ifdef CONFIG_COMMON_CLK /** * clk_notifier_register - register a clock rate-change notifier callback * @clk: clock whose rate we are interested in * @nb: notifier block with callback function pointer * * ProTip: debugging across notifier chains can be frustrating. Make sure that * your notifier callback function prints a nice big warning in case of * failure. */ int clk_notifier_register(struct clk *clk, struct notifier_block *nb); /** * clk_notifier_unregister - unregister a clock rate-change notifier callback * @clk: clock whose rate we are no longer interested in * @nb: notifier block which will be unregistered */ int clk_notifier_unregister(struct clk *clk, struct notifier_block *nb); /** * devm_clk_notifier_register - register a managed rate-change notifier callback * @dev: device for clock "consumer" * @clk: clock whose rate we are interested in * @nb: notifier block with callback function pointer * * Returns 0 on success, -EERROR otherwise */ int devm_clk_notifier_register(struct device *dev, struct clk *clk, struct notifier_block *nb); /** * clk_get_accuracy - obtain the clock accuracy in ppb (parts per billion) * for a clock source. * @clk: clock source * * This gets the clock source accuracy expressed in ppb. * A perfect clock returns 0. */ long clk_get_accuracy(struct clk *clk); /** * clk_set_phase - adjust the phase shift of a clock signal * @clk: clock signal source * @degrees: number of degrees the signal is shifted * * Shifts the phase of a clock signal by the specified degrees. Returns 0 on * success, -EERROR otherwise. */ int clk_set_phase(struct clk *clk, int degrees); /** * clk_get_phase - return the phase shift of a clock signal * @clk: clock signal source * * Returns the phase shift of a clock node in degrees, otherwise returns * -EERROR. */ int clk_get_phase(struct clk *clk); /** * clk_set_duty_cycle - adjust the duty cycle ratio of a clock signal * @clk: clock signal source * @num: numerator of the duty cycle ratio to be applied * @den: denominator of the duty cycle ratio to be applied * * Adjust the duty cycle of a clock signal by the specified ratio. Returns 0 on * success, -EERROR otherwise. */ int clk_set_duty_cycle(struct clk *clk, unsigned int num, unsigned int den); /** * clk_get_scaled_duty_cycle - return the duty cycle ratio of a clock signal * @clk: clock signal source * @scale: scaling factor to be applied to represent the ratio as an integer * * Returns the duty cycle ratio multiplied by the scale provided, otherwise * returns -EERROR. */ int clk_get_scaled_duty_cycle(struct clk *clk, unsigned int scale); /** * clk_is_match - check if two clk's point to the same hardware clock * @p: clk compared against q * @q: clk compared against p * * Returns true if the two struct clk pointers both point to the same hardware * clock node. Put differently, returns true if @p and @q * share the same &struct clk_core object. * * Returns false otherwise. Note that two NULL clks are treated as matching. */ bool clk_is_match(const struct clk *p, const struct clk *q); /** * clk_rate_exclusive_get - get exclusivity over the rate control of a * producer * @clk: clock source * * This function allows drivers to get exclusive control over the rate of a * provider. It prevents any other consumer to execute, even indirectly, * opereation which could alter the rate of the provider or cause glitches * * If exlusivity is claimed more than once on clock, even by the same driver, * the rate effectively gets locked as exclusivity can't be preempted. * * Must not be called from within atomic context. * * Returns success (0) or negative errno. */ int clk_rate_exclusive_get(struct clk *clk); /** * devm_clk_rate_exclusive_get - devm variant of clk_rate_exclusive_get * @dev: device the exclusivity is bound to * @clk: clock source * * Calls clk_rate_exclusive_get() on @clk and registers a devm cleanup handler * on @dev to call clk_rate_exclusive_put(). * * Must not be called from within atomic context. */ int devm_clk_rate_exclusive_get(struct device *dev, struct clk *clk); /** * clk_rate_exclusive_put - release exclusivity over the rate control of a * producer * @clk: clock source * * This function allows drivers to release the exclusivity it previously got * from clk_rate_exclusive_get() * * The caller must balance the number of clk_rate_exclusive_get() and * clk_rate_exclusive_put() calls. * * Must not be called from within atomic context. */ void clk_rate_exclusive_put(struct clk *clk); #else static inline int clk_notifier_register(struct clk *clk, struct notifier_block *nb) { return -ENOTSUPP; } static inline int clk_notifier_unregister(struct clk *clk, struct notifier_block *nb) { return -ENOTSUPP; } static inline int devm_clk_notifier_register(struct device *dev, struct clk *clk, struct notifier_block *nb) { return -ENOTSUPP; } static inline long clk_get_accuracy(struct clk *clk) { return -ENOTSUPP; } static inline long clk_set_phase(struct clk *clk, int phase) { return -ENOTSUPP; } static inline long clk_get_phase(struct clk *clk) { return -ENOTSUPP; } static inline int clk_set_duty_cycle(struct clk *clk, unsigned int num, unsigned int den) { return -ENOTSUPP; } static inline unsigned int clk_get_scaled_duty_cycle(struct clk *clk, unsigned int scale) { return 0; } static inline bool clk_is_match(const struct clk *p, const struct clk *q) { return p == q; } static inline int clk_rate_exclusive_get(struct clk *clk) { return 0; } static inline int devm_clk_rate_exclusive_get(struct device *dev, struct clk *clk) { return 0; } static inline void clk_rate_exclusive_put(struct clk *clk) {} #endif #ifdef CONFIG_HAVE_CLK_PREPARE /** * clk_prepare - prepare a clock source * @clk: clock source * * This prepares the clock source for use. * * Must not be called from within atomic context. */ int clk_prepare(struct clk *clk); int __must_check clk_bulk_prepare(int num_clks, const struct clk_bulk_data *clks); /** * clk_is_enabled_when_prepared - indicate if preparing a clock also enables it. * @clk: clock source * * Returns true if clk_prepare() implicitly enables the clock, effectively * making clk_enable()/clk_disable() no-ops, false otherwise. * * This is of interest mainly to the power management code where actually * disabling the clock also requires unpreparing it to have any material * effect. * * Regardless of the value returned here, the caller must always invoke * clk_enable() or clk_prepare_enable() and counterparts for usage counts * to be right. */ bool clk_is_enabled_when_prepared(struct clk *clk); #else static inline int clk_prepare(struct clk *clk) { might_sleep(); return 0; } static inline int __must_check clk_bulk_prepare(int num_clks, const struct clk_bulk_data *clks) { might_sleep(); return 0; } static inline bool clk_is_enabled_when_prepared(struct clk *clk) { return false; } #endif /** * clk_unprepare - undo preparation of a clock source * @clk: clock source * * This undoes a previously prepared clock. The caller must balance * the number of prepare and unprepare calls. * * Must not be called from within atomic context. */ #ifdef CONFIG_HAVE_CLK_PREPARE void clk_unprepare(struct clk *clk); void clk_bulk_unprepare(int num_clks, const struct clk_bulk_data *clks); #else static inline void clk_unprepare(struct clk *clk) { might_sleep(); } static inline void clk_bulk_unprepare(int num_clks, const struct clk_bulk_data *clks) { might_sleep(); } #endif #ifdef CONFIG_HAVE_CLK /** * clk_get - lookup and obtain a reference to a clock producer. * @dev: device for clock "consumer" * @id: clock consumer ID * * Returns a struct clk corresponding to the clock producer, or * valid IS_ERR() condition containing errno. The implementation * uses @dev and @id to determine the clock consumer, and thereby * the clock producer. (IOW, @id may be identical strings, but * clk_get may return different clock producers depending on @dev.) * * Drivers must assume that the clock source is not enabled. * * clk_get should not be called from within interrupt context. */ struct clk *clk_get(struct device *dev, const char *id); /** * clk_bulk_get - lookup and obtain a number of references to clock producer. * @dev: device for clock "consumer" * @num_clks: the number of clk_bulk_data * @clks: the clk_bulk_data table of consumer * * This helper function allows drivers to get several clk consumers in one * operation. If any of the clk cannot be acquired then any clks * that were obtained will be freed before returning to the caller. * * Returns 0 if all clocks specified in clk_bulk_data table are obtained * successfully, or valid IS_ERR() condition containing errno. * The implementation uses @dev and @clk_bulk_data.id to determine the * clock consumer, and thereby the clock producer. * The clock returned is stored in each @clk_bulk_data.clk field. * * Drivers must assume that the clock source is not enabled. * * clk_bulk_get should not be called from within interrupt context. */ int __must_check clk_bulk_get(struct device *dev, int num_clks, struct clk_bulk_data *clks); /** * clk_bulk_get_all - lookup and obtain all available references to clock * producer. * @dev: device for clock "consumer" * @clks: pointer to the clk_bulk_data table of consumer * * This helper function allows drivers to get all clk consumers in one * operation. If any of the clk cannot be acquired then any clks * that were obtained will be freed before returning to the caller. * * Returns a positive value for the number of clocks obtained while the * clock references are stored in the clk_bulk_data table in @clks field. * Returns 0 if there're none and a negative value if something failed. * * Drivers must assume that the clock source is not enabled. * * clk_bulk_get should not be called from within interrupt context. */ int __must_check clk_bulk_get_all(struct device *dev, struct clk_bulk_data **clks); /** * clk_bulk_get_optional - lookup and obtain a number of references to clock producer * @dev: device for clock "consumer" * @num_clks: the number of clk_bulk_data * @clks: the clk_bulk_data table of consumer * * Behaves the same as clk_bulk_get() except where there is no clock producer. * In this case, instead of returning -ENOENT, the function returns 0 and * NULL for a clk for which a clock producer could not be determined. */ int __must_check clk_bulk_get_optional(struct device *dev, int num_clks, struct clk_bulk_data *clks); /** * devm_clk_bulk_get - managed get multiple clk consumers * @dev: device for clock "consumer" * @num_clks: the number of clk_bulk_data * @clks: the clk_bulk_data table of consumer * * Return 0 on success, an errno on failure. * * This helper function allows drivers to get several clk * consumers in one operation with management, the clks will * automatically be freed when the device is unbound. */ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, struct clk_bulk_data *clks); /** * devm_clk_bulk_get_optional - managed get multiple optional consumer clocks * @dev: device for clock "consumer" * @num_clks: the number of clk_bulk_data * @clks: pointer to the clk_bulk_data table of consumer * * Behaves the same as devm_clk_bulk_get() except where there is no clock * producer. In this case, instead of returning -ENOENT, the function returns * NULL for given clk. It is assumed all clocks in clk_bulk_data are optional. * * Returns 0 if all clocks specified in clk_bulk_data table are obtained * successfully or for any clk there was no clk provider available, otherwise * returns valid IS_ERR() condition containing errno. * The implementation uses @dev and @clk_bulk_data.id to determine the * clock consumer, and thereby the clock producer. * The clock returned is stored in each @clk_bulk_data.clk field. * * Drivers must assume that the clock source is not enabled. * * clk_bulk_get should not be called from within interrupt context. */ int __must_check devm_clk_bulk_get_optional(struct device *dev, int num_clks, struct clk_bulk_data *clks); /** * devm_clk_bulk_get_all - managed get multiple clk consumers * @dev: device for clock "consumer" * @clks: pointer to the clk_bulk_data table of consumer * * Returns a positive value for the number of clocks obtained while the * clock references are stored in the clk_bulk_data table in @clks field. * Returns 0 if there're none and a negative value if something failed. * * This helper function allows drivers to get several clk * consumers in one operation with management, the clks will * automatically be freed when the device is unbound. */ int __must_check devm_clk_bulk_get_all(struct device *dev, struct clk_bulk_data **clks); /** * devm_clk_bulk_get_all_enabled - Get and enable all clocks of the consumer (managed) * @dev: device for clock "consumer" * @clks: pointer to the clk_bulk_data table of consumer * * Returns a positive value for the number of clocks obtained while the * clock references are stored in the clk_bulk_data table in @clks field. * Returns 0 if there're none and a negative value if something failed. * * This helper function allows drivers to get all clocks of the * consumer and enables them in one operation with management. * The clks will automatically be disabled and freed when the device * is unbound. */ int __must_check devm_clk_bulk_get_all_enabled(struct device *dev, struct clk_bulk_data **clks); /** * devm_clk_get - lookup and obtain a managed reference to a clock producer. * @dev: device for clock "consumer" * @id: clock consumer ID * * Context: May sleep. * * Return: a struct clk corresponding to the clock producer, or * valid IS_ERR() condition containing errno. The implementation * uses @dev and @id to determine the clock consumer, and thereby * the clock producer. (IOW, @id may be identical strings, but * clk_get may return different clock producers depending on @dev.) * * Drivers must assume that the clock source is neither prepared nor * enabled. * * The clock will automatically be freed when the device is unbound * from the bus. */ struct clk *devm_clk_get(struct device *dev, const char *id); /** * devm_clk_get_prepared - devm_clk_get() + clk_prepare() * @dev: device for clock "consumer" * @id: clock consumer ID * * Context: May sleep. * * Return: a struct clk corresponding to the clock producer, or * valid IS_ERR() condition containing errno. The implementation * uses @dev and @id to determine the clock consumer, and thereby * the clock producer. (IOW, @id may be identical strings, but * clk_get may return different clock producers depending on @dev.) * * The returned clk (if valid) is prepared. Drivers must however assume * that the clock is not enabled. * * The clock will automatically be unprepared and freed when the device * is unbound from the bus. */ struct clk *devm_clk_get_prepared(struct device *dev, const char *id); /** * devm_clk_get_enabled - devm_clk_get() + clk_prepare_enable() * @dev: device for clock "consumer" * @id: clock consumer ID * * Context: May sleep. * * Return: a struct clk corresponding to the clock producer, or * valid IS_ERR() condition containing errno. The implementation * uses @dev and @id to determine the clock consumer, and thereby * the clock producer. (IOW, @id may be identical strings, but * clk_get may return different clock producers depending on @dev.) * * The returned clk (if valid) is prepared and enabled. * * The clock will automatically be disabled, unprepared and freed * when the device is unbound from the bus. */ struct clk *devm_clk_get_enabled(struct device *dev, const char *id); /** * devm_clk_get_optional - lookup and obtain a managed reference to an optional * clock producer. * @dev: device for clock "consumer" * @id: clock consumer ID * * Context: May sleep. * * Return: a struct clk corresponding to the clock producer, or * valid IS_ERR() condition containing errno. The implementation * uses @dev and @id to determine the clock consumer, and thereby * the clock producer. If no such clk is found, it returns NULL * which serves as a dummy clk. That's the only difference compared * to devm_clk_get(). * * Drivers must assume that the clock source is neither prepared nor * enabled. * * The clock will automatically be freed when the device is unbound * from the bus. */ struct clk *devm_clk_get_optional(struct device *dev, const char *id); /** * devm_clk_get_optional_prepared - devm_clk_get_optional() + clk_prepare() * @dev: device for clock "consumer" * @id: clock consumer ID * * Context: May sleep. * * Return: a struct clk corresponding to the clock producer, or * valid IS_ERR() condition containing errno. The implementation * uses @dev and @id to determine the clock consumer, and thereby * the clock producer. If no such clk is found, it returns NULL * which serves as a dummy clk. That's the only difference compared * to devm_clk_get_prepared(). * * The returned clk (if valid) is prepared. Drivers must however * assume that the clock is not enabled. * * The clock will automatically be unprepared and freed when the * device is unbound from the bus. */ struct clk *devm_clk_get_optional_prepared(struct device *dev, const char *id); /** * devm_clk_get_optional_enabled - devm_clk_get_optional() + * clk_prepare_enable() * @dev: device for clock "consumer" * @id: clock consumer ID * * Context: May sleep. * * Return: a struct clk corresponding to the clock producer, or * valid IS_ERR() condition containing errno. The implementation * uses @dev and @id to determine the clock consumer, and thereby * the clock producer. If no such clk is found, it returns NULL * which serves as a dummy clk. That's the only difference compared * to devm_clk_get_enabled(). * * The returned clk (if valid) is prepared and enabled. * * The clock will automatically be disabled, unprepared and freed * when the device is unbound from the bus. */ struct clk *devm_clk_get_optional_enabled(struct device *dev, const char *id); /** * devm_clk_get_optional_enabled_with_rate - devm_clk_get_optional() + * clk_set_rate() + * clk_prepare_enable() * @dev: device for clock "consumer" * @id: clock consumer ID * @rate: new clock rate * * Context: May sleep. * * Return: a struct clk corresponding to the clock producer, or * valid IS_ERR() condition containing errno. The implementation * uses @dev and @id to determine the clock consumer, and thereby * the clock producer. If no such clk is found, it returns NULL * which serves as a dummy clk. That's the only difference compared * to devm_clk_get_enabled(). * * The returned clk (if valid) is prepared and enabled and rate was set. * * The clock will automatically be disabled, unprepared and freed * when the device is unbound from the bus. */ struct clk *devm_clk_get_optional_enabled_with_rate(struct device *dev, const char *id, unsigned long rate); /** * devm_get_clk_from_child - lookup and obtain a managed reference to a * clock producer from child node. * @dev: device for clock "consumer" * @np: pointer to clock consumer node * @con_id: clock consumer ID * * This function parses the clocks, and uses them to look up the * struct clk from the registered list of clock providers by using * @np and @con_id * * The clock will automatically be freed when the device is unbound * from the bus. */ struct clk *devm_get_clk_from_child(struct device *dev, struct device_node *np, const char *con_id); /** * clk_enable - inform the system when the clock source should be running. * @clk: clock source * * If the clock can not be enabled/disabled, this should return success. * * May be called from atomic contexts. * * Returns success (0) or negative errno. */ int clk_enable(struct clk *clk); /** * clk_bulk_enable - inform the system when the set of clks should be running. * @num_clks: the number of clk_bulk_data * @clks: the clk_bulk_data table of consumer * * May be called from atomic contexts. * * Returns success (0) or negative errno. */ int __must_check clk_bulk_enable(int num_clks, const struct clk_bulk_data *clks); /** * clk_disable - inform the system when the clock source is no longer required. * @clk: clock source * * Inform the system that a clock source is no longer required by * a driver and may be shut down. * * May be called from atomic contexts. * * Implementation detail: if the clock source is shared between * multiple drivers, clk_enable() calls must be balanced by the * same number of clk_disable() calls for the clock source to be * disabled. */ void clk_disable(struct clk *clk); /** * clk_bulk_disable - inform the system when the set of clks is no * longer required. * @num_clks: the number of clk_bulk_data * @clks: the clk_bulk_data table of consumer * * Inform the system that a set of clks is no longer required by * a driver and may be shut down. * * May be called from atomic contexts. * * Implementation detail: if the set of clks is shared between * multiple drivers, clk_bulk_enable() calls must be balanced by the * same number of clk_bulk_disable() calls for the clock source to be * disabled. */ void clk_bulk_disable(int num_clks, const struct clk_bulk_data *clks); /** * clk_get_rate - obtain the current clock rate (in Hz) for a clock source. * This is only valid once the clock source has been enabled. * @clk: clock source */ unsigned long clk_get_rate(struct clk *clk); /** * clk_put - "free" the clock source * @clk: clock source * * Note: drivers must ensure that all clk_enable calls made on this * clock source are balanced by clk_disable calls prior to calling * this function. * * clk_put should not be called from within interrupt context. */ void clk_put(struct clk *clk); /** * clk_bulk_put - "free" the clock source * @num_clks: the number of clk_bulk_data * @clks: the clk_bulk_data table of consumer * * Note: drivers must ensure that all clk_bulk_enable calls made on this * clock source are balanced by clk_bulk_disable calls prior to calling * this function. * * clk_bulk_put should not be called from within interrupt context. */ void clk_bulk_put(int num_clks, struct clk_bulk_data *clks); /** * clk_bulk_put_all - "free" all the clock source * @num_clks: the number of clk_bulk_data * @clks: the clk_bulk_data table of consumer * * Note: drivers must ensure that all clk_bulk_enable calls made on this * clock source are balanced by clk_bulk_disable calls prior to calling * this function. * * clk_bulk_put_all should not be called from within interrupt context. */ void clk_bulk_put_all(int num_clks, struct clk_bulk_data *clks); /** * devm_clk_put - "free" a managed clock source * @dev: device used to acquire the clock * @clk: clock source acquired with devm_clk_get() * * Note: drivers must ensure that all clk_enable calls made on this * clock source are balanced by clk_disable calls prior to calling * this function. * * clk_put should not be called from within interrupt context. */ void devm_clk_put(struct device *dev, struct clk *clk); /* * The remaining APIs are optional for machine class support. */ /** * clk_round_rate - adjust a rate to the exact rate a clock can provide * @clk: clock source * @rate: desired clock rate in Hz * * This answers the question "if I were to pass @rate to clk_set_rate(), * what clock rate would I end up with?" without changing the hardware * in any way. In other words: * * rate = clk_round_rate(clk, r); * * and: * * clk_set_rate(clk, r); * rate = clk_get_rate(clk); * * are equivalent except the former does not modify the clock hardware * in any way. * * Returns rounded clock rate in Hz, or negative errno. */ long clk_round_rate(struct clk *clk, unsigned long rate); /** * clk_set_rate - set the clock rate for a clock source * @clk: clock source * @rate: desired clock rate in Hz * * Updating the rate starts at the top-most affected clock and then * walks the tree down to the bottom-most clock that needs updating. * * Returns success (0) or negative errno. */ int clk_set_rate(struct clk *clk, unsigned long rate); /** * clk_set_rate_exclusive- set the clock rate and claim exclusivity over * clock source * @clk: clock source * @rate: desired clock rate in Hz * * This helper function allows drivers to atomically set the rate of a producer * and claim exclusivity over the rate control of the producer. * * It is essentially a combination of clk_set_rate() and * clk_rate_exclusite_get(). Caller must balance this call with a call to * clk_rate_exclusive_put() * * Returns success (0) or negative errno. */ int clk_set_rate_exclusive(struct clk *clk, unsigned long rate); /** * clk_has_parent - check if a clock is a possible parent for another * @clk: clock source * @parent: parent clock source * * This function can be used in drivers that need to check that a clock can be * the parent of another without actually changing the parent. * * Returns true if @parent is a possible parent for @clk, false otherwise. */ bool clk_has_parent(const struct clk *clk, const struct clk *parent); /** * clk_set_rate_range - set a rate range for a clock source * @clk: clock source * @min: desired minimum clock rate in Hz, inclusive * @max: desired maximum clock rate in Hz, inclusive * * Returns success (0) or negative errno. */ int clk_set_rate_range(struct clk *clk, unsigned long min, unsigned long max); /** * clk_set_min_rate - set a minimum clock rate for a clock source * @clk: clock source * @rate: desired minimum clock rate in Hz, inclusive * * Returns success (0) or negative errno. */ int clk_set_min_rate(struct clk *clk, unsigned long rate); /** * clk_set_max_rate - set a maximum clock rate for a clock source * @clk: clock source * @rate: desired maximum clock rate in Hz, inclusive * * Returns success (0) or negative errno. */ int clk_set_max_rate(struct clk *clk, unsigned long rate); /** * clk_set_parent - set the parent clock source for this clock * @clk: clock source * @parent: parent clock source * * Returns success (0) or negative errno. */ int clk_set_parent(struct clk *clk, struct clk *parent); /** * clk_get_parent - get the parent clock source for this clock * @clk: clock source * * Returns struct clk corresponding to parent clock source, or * valid IS_ERR() condition containing errno. */ struct clk *clk_get_parent(struct clk *clk); /** * clk_get_sys - get a clock based upon the device name * @dev_id: device name * @con_id: connection ID * * Returns a struct clk corresponding to the clock producer, or * valid IS_ERR() condition containing errno. The implementation * uses @dev_id and @con_id to determine the clock consumer, and * thereby the clock producer. In contrast to clk_get() this function * takes the device name instead of the device itself for identification. * * Drivers must assume that the clock source is not enabled. * * clk_get_sys should not be called from within interrupt context. */ struct clk *clk_get_sys(const char *dev_id, const char *con_id); /** * clk_save_context - save clock context for poweroff * * Saves the context of the clock register for powerstates in which the * contents of the registers will be lost. Occurs deep within the suspend * code so locking is not necessary. */ int clk_save_context(void); /** * clk_restore_context - restore clock context after poweroff * * This occurs with all clocks enabled. Occurs deep within the resume code * so locking is not necessary. */ void clk_restore_context(void); #else /* !CONFIG_HAVE_CLK */ static inline struct clk *clk_get(struct device *dev, const char *id) { return NULL; } static inline int __must_check clk_bulk_get(struct device *dev, int num_clks, struct clk_bulk_data *clks) { return 0; } static inline int __must_check clk_bulk_get_optional(struct device *dev, int num_clks, struct clk_bulk_data *clks) { return 0; } static inline int __must_check clk_bulk_get_all(struct device *dev, struct clk_bulk_data **clks) { return 0; } static inline struct clk *devm_clk_get(struct device *dev, const char *id) { return NULL; } static inline struct clk *devm_clk_get_prepared(struct device *dev, const char *id) { return NULL; } static inline struct clk *devm_clk_get_enabled(struct device *dev, const char *id) { return NULL; } static inline struct clk *devm_clk_get_optional(struct device *dev, const char *id) { return NULL; } static inline struct clk *devm_clk_get_optional_prepared(struct device *dev, const char *id) { return NULL; } static inline struct clk *devm_clk_get_optional_enabled(struct device *dev, const char *id) { return NULL; } static inline struct clk * devm_clk_get_optional_enabled_with_rate(struct device *dev, const char *id, unsigned long rate) { return NULL; } static inline int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, struct clk_bulk_data *clks) { return 0; } static inline int __must_check devm_clk_bulk_get_optional(struct device *dev, int num_clks, struct clk_bulk_data *clks) { return 0; } static inline int __must_check devm_clk_bulk_get_all(struct device *dev, struct clk_bulk_data **clks) { return 0; } static inline int __must_check devm_clk_bulk_get_all_enabled(struct device *dev, struct clk_bulk_data **clks) { return 0; } static inline struct clk *devm_get_clk_from_child(struct device *dev, struct device_node *np, const char *con_id) { return NULL; } static inline void clk_put(struct clk *clk) {} static inline void clk_bulk_put(int num_clks, struct clk_bulk_data *clks) {} static inline void clk_bulk_put_all(int num_clks, struct clk_bulk_data *clks) {} static inline void devm_clk_put(struct device *dev, struct clk *clk) {} static inline int clk_enable(struct clk *clk) { return 0; } static inline int __must_check clk_bulk_enable(int num_clks, const struct clk_bulk_data *clks) { return 0; } static inline void clk_disable(struct clk *clk) {} static inline void clk_bulk_disable(int num_clks, const struct clk_bulk_data *clks) {} static inline unsigned long clk_get_rate(struct clk *clk) { return 0; } static inline int clk_set_rate(struct clk *clk, unsigned long rate) { return 0; } static inline int clk_set_rate_exclusive(struct clk *clk, unsigned long rate) { return 0; } static inline long clk_round_rate(struct clk *clk, unsigned long rate) { return 0; } static inline bool clk_has_parent(struct clk *clk, struct clk *parent) { return true; } static inline int clk_set_rate_range(struct clk *clk, unsigned long min, unsigned long max) { return 0; } static inline int clk_set_min_rate(struct clk *clk, unsigned long rate) { return 0; } static inline int clk_set_max_rate(struct clk *clk, unsigned long rate) { return 0; } static inline int clk_set_parent(struct clk *clk, struct clk *parent) { return 0; } static inline struct clk *clk_get_parent(struct clk *clk) { return NULL; } static inline struct clk *clk_get_sys(const char *dev_id, const char *con_id) { return NULL; } static inline int clk_save_context(void) { return 0; } static inline void clk_restore_context(void) {} #endif /* Deprecated. Use devm_clk_bulk_get_all_enabled() */ static inline int __must_check devm_clk_bulk_get_all_enable(struct device *dev, struct clk_bulk_data **clks) { int ret = devm_clk_bulk_get_all_enabled(dev, clks); return ret > 0 ? 0 : ret; } /* clk_prepare_enable helps cases using clk_enable in non-atomic context. */ static inline int clk_prepare_enable(struct clk *clk) { int ret; ret = clk_prepare(clk); if (ret) return ret; ret = clk_enable(clk); if (ret) clk_unprepare(clk); return ret; } /* clk_disable_unprepare helps cases using clk_disable in non-atomic context. */ static inline void clk_disable_unprepare(struct clk *clk) { clk_disable(clk); clk_unprepare(clk); } static inline int __must_check clk_bulk_prepare_enable(int num_clks, const struct clk_bulk_data *clks) { int ret; ret = clk_bulk_prepare(num_clks, clks); if (ret) return ret; ret = clk_bulk_enable(num_clks, clks); if (ret) clk_bulk_unprepare(num_clks, clks); return ret; } static inline void clk_bulk_disable_unprepare(int num_clks, const struct clk_bulk_data *clks) { clk_bulk_disable(num_clks, clks); clk_bulk_unprepare(num_clks, clks); } /** * clk_drop_range - Reset any range set on that clock * @clk: clock source * * Returns success (0) or negative errno. */ static inline int clk_drop_range(struct clk *clk) { return clk_set_rate_range(clk, 0, ULONG_MAX); } /** * clk_get_optional - lookup and obtain a reference to an optional clock * producer. * @dev: device for clock "consumer" * @id: clock consumer ID * * Behaves the same as clk_get() except where there is no clock producer. In * this case, instead of returning -ENOENT, the function returns NULL. */ static inline struct clk *clk_get_optional(struct device *dev, const char *id) { struct clk *clk = clk_get(dev, id); if (clk == ERR_PTR(-ENOENT)) return NULL; return clk; } #if defined(CONFIG_OF) && defined(CONFIG_COMMON_CLK) struct clk *of_clk_get(struct device_node *np, int index); struct clk *of_clk_get_by_name(struct device_node *np, const char *name); struct clk *of_clk_get_from_provider(struct of_phandle_args *clkspec); #else static inline struct clk *of_clk_get(struct device_node *np, int index) { return ERR_PTR(-ENOENT); } static inline struct clk *of_clk_get_by_name(struct device_node *np, const char *name) { return ERR_PTR(-ENOENT); } static inline struct clk *of_clk_get_from_provider(struct of_phandle_args *clkspec) { return ERR_PTR(-ENOENT); } #endif #endif
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H #define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H static inline unsigned blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) { return bl ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / sizeof(struct journal_seq_blacklist_entry)) : 0; } bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); int bch2_blacklist_table_initialize(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; bool bch2_blacklist_entries_gc(struct bch_fs *); #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
4 1 3 5 1 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 /* * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #include <linux/uaccess.h> #include <drm/drm_atomic.h> #include <drm/drm_color_mgmt.h> #include <drm/drm_crtc.h> #include <drm/drm_device.h> #include <drm/drm_drv.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" /** * DOC: overview * * Color management or color space adjustments is supported through a set of 5 * properties on the &drm_crtc object. They are set up by calling * drm_crtc_enable_color_mgmt(). * * "DEGAMMA_LUT”: * Blob property to set the degamma lookup table (LUT) mapping pixel data * from the framebuffer before it is given to the transformation matrix. * The data is interpreted as an array of &struct drm_color_lut elements. * Hardware might choose not to use the full precision of the LUT elements * nor use all the elements of the LUT (for example the hardware might * choose to interpolate between LUT[0] and LUT[4]). * * Setting this to NULL (blob property value set to 0) means a * linear/pass-thru gamma table should be used. This is generally the * driver boot-up state too. Drivers can access this blob through * &drm_crtc_state.degamma_lut. * * “DEGAMMA_LUT_SIZE”: * Unsinged range property to give the size of the lookup table to be set * on the DEGAMMA_LUT property (the size depends on the underlying * hardware). If drivers support multiple LUT sizes then they should * publish the largest size, and sub-sample smaller sized LUTs (e.g. for * split-gamma modes) appropriately. * * “CTM”: * Blob property to set the current transformation matrix (CTM) apply to * pixel data after the lookup through the degamma LUT and before the * lookup through the gamma LUT. The data is interpreted as a struct * &drm_color_ctm. * * Setting this to NULL (blob property value set to 0) means a * unit/pass-thru matrix should be used. This is generally the driver * boot-up state too. Drivers can access the blob for the color conversion * matrix through &drm_crtc_state.ctm. * * “GAMMA_LUT”: * Blob property to set the gamma lookup table (LUT) mapping pixel data * after the transformation matrix to data sent to the connector. The * data is interpreted as an array of &struct drm_color_lut elements. * Hardware might choose not to use the full precision of the LUT elements * nor use all the elements of the LUT (for example the hardware might * choose to interpolate between LUT[0] and LUT[4]). * * Setting this to NULL (blob property value set to 0) means a * linear/pass-thru gamma table should be used. This is generally the * driver boot-up state too. Drivers can access this blob through * &drm_crtc_state.gamma_lut. * * Note that for mostly historical reasons stemming from Xorg heritage, * this is also used to store the color map (also sometimes color lut, CLUT * or color palette) for indexed formats like DRM_FORMAT_C8. * * “GAMMA_LUT_SIZE”: * Unsigned range property to give the size of the lookup table to be set * on the GAMMA_LUT property (the size depends on the underlying hardware). * If drivers support multiple LUT sizes then they should publish the * largest size, and sub-sample smaller sized LUTs (e.g. for split-gamma * modes) appropriately. * * There is also support for a legacy gamma table, which is set up by calling * drm_mode_crtc_set_gamma_size(). The DRM core will then alias the legacy gamma * ramp with "GAMMA_LUT" or, if that is unavailable, "DEGAMMA_LUT". * * Support for different non RGB color encodings is controlled through * &drm_plane specific COLOR_ENCODING and COLOR_RANGE properties. They * are set up by calling drm_plane_create_color_properties(). * * "COLOR_ENCODING": * Optional plane enum property to support different non RGB * color encodings. The driver can provide a subset of standard * enum values supported by the DRM plane. * * "COLOR_RANGE": * Optional plane enum property to support different non RGB * color parameter ranges. The driver can provide a subset of * standard enum values supported by the DRM plane. */ /** * drm_color_ctm_s31_32_to_qm_n * * @user_input: input value * @m: number of integer bits, only support m <= 32, include the sign-bit * @n: number of fractional bits, only support n <= 32 * * Convert and clamp S31.32 sign-magnitude to Qm.n (signed 2's complement). * The sign-bit BIT(m+n-1) and above are 0 for positive value and 1 for negative * the range of value is [-2^(m-1), 2^(m-1) - 2^-n] * * For example * A Q3.12 format number: * - required bit: 3 + 12 = 15bits * - range: [-2^2, 2^2 - 2^−15] * * NOTE: the m can be zero if all bit_precision are used to present fractional * bits like Q0.32 */ u64 drm_color_ctm_s31_32_to_qm_n(u64 user_input, u32 m, u32 n) { u64 mag = (user_input & ~BIT_ULL(63)) >> (32 - n); bool negative = !!(user_input & BIT_ULL(63)); s64 val; WARN_ON(m > 32 || n > 32); val = clamp_val(mag, 0, negative ? BIT_ULL(n + m - 1) : BIT_ULL(n + m - 1) - 1); return negative ? -val : val; } EXPORT_SYMBOL(drm_color_ctm_s31_32_to_qm_n); /** * drm_crtc_enable_color_mgmt - enable color management properties * @crtc: DRM CRTC * @degamma_lut_size: the size of the degamma lut (before CSC) * @has_ctm: whether to attach ctm_property for CSC matrix * @gamma_lut_size: the size of the gamma lut (after CSC) * * This function lets the driver enable the color correction * properties on a CRTC. This includes 3 degamma, csc and gamma * properties that userspace can set and 2 size properties to inform * the userspace of the lut sizes. Each of the properties are * optional. The gamma and degamma properties are only attached if * their size is not 0 and ctm_property is only attached if has_ctm is * true. */ void drm_crtc_enable_color_mgmt(struct drm_crtc *crtc, uint degamma_lut_size, bool has_ctm, uint gamma_lut_size) { struct drm_device *dev = crtc->dev; struct drm_mode_config *config = &dev->mode_config; if (degamma_lut_size) { drm_object_attach_property(&crtc->base, config->degamma_lut_property, 0); drm_object_attach_property(&crtc->base, config->degamma_lut_size_property, degamma_lut_size); } if (has_ctm) drm_object_attach_property(&crtc->base, config->ctm_property, 0); if (gamma_lut_size) { drm_object_attach_property(&crtc->base, config->gamma_lut_property, 0); drm_object_attach_property(&crtc->base, config->gamma_lut_size_property, gamma_lut_size); } } EXPORT_SYMBOL(drm_crtc_enable_color_mgmt); /** * drm_mode_crtc_set_gamma_size - set the gamma table size * @crtc: CRTC to set the gamma table size for * @gamma_size: size of the gamma table * * Drivers which support gamma tables should set this to the supported gamma * table size when initializing the CRTC. Currently the drm core only supports a * fixed gamma table size. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_crtc_set_gamma_size(struct drm_crtc *crtc, int gamma_size) { uint16_t *r_base, *g_base, *b_base; int i; crtc->gamma_size = gamma_size; crtc->gamma_store = kcalloc(gamma_size, sizeof(uint16_t) * 3, GFP_KERNEL); if (!crtc->gamma_store) { crtc->gamma_size = 0; return -ENOMEM; } r_base = crtc->gamma_store; g_base = r_base + gamma_size; b_base = g_base + gamma_size; for (i = 0; i < gamma_size; i++) { r_base[i] = i << 8; g_base[i] = i << 8; b_base[i] = i << 8; } return 0; } EXPORT_SYMBOL(drm_mode_crtc_set_gamma_size); /** * drm_crtc_supports_legacy_gamma - does the crtc support legacy gamma correction table * @crtc: CRTC object * * Returns true/false if the given crtc supports setting the legacy gamma * correction table. */ static bool drm_crtc_supports_legacy_gamma(struct drm_crtc *crtc) { u32 gamma_id = crtc->dev->mode_config.gamma_lut_property->base.id; u32 degamma_id = crtc->dev->mode_config.degamma_lut_property->base.id; if (!crtc->gamma_size) return false; if (crtc->funcs->gamma_set) return true; return !!(drm_mode_obj_find_prop_id(&crtc->base, gamma_id) || drm_mode_obj_find_prop_id(&crtc->base, degamma_id)); } /** * drm_crtc_legacy_gamma_set - set the legacy gamma correction table * @crtc: CRTC object * @red: red correction table * @green: green correction table * @blue: blue correction table * @size: size of the tables * @ctx: lock acquire context * * Implements support for legacy gamma correction table for drivers * that have set drm_crtc_funcs.gamma_set or that support color management * through the DEGAMMA_LUT/GAMMA_LUT properties. See * drm_crtc_enable_color_mgmt() and the containing chapter for * how the atomic color management and gamma tables work. * * This function sets the gamma using drm_crtc_funcs.gamma_set if set, or * alternatively using crtc color management properties. */ static int drm_crtc_legacy_gamma_set(struct drm_crtc *crtc, u16 *red, u16 *green, u16 *blue, u32 size, struct drm_modeset_acquire_ctx *ctx) { struct drm_device *dev = crtc->dev; struct drm_atomic_state *state; struct drm_crtc_state *crtc_state; struct drm_property_blob *blob; struct drm_color_lut *blob_data; u32 gamma_id = dev->mode_config.gamma_lut_property->base.id; u32 degamma_id = dev->mode_config.degamma_lut_property->base.id; bool use_gamma_lut; int i, ret = 0; bool replaced; if (crtc->funcs->gamma_set) return crtc->funcs->gamma_set(crtc, red, green, blue, size, ctx); if (drm_mode_obj_find_prop_id(&crtc->base, gamma_id)) use_gamma_lut = true; else if (drm_mode_obj_find_prop_id(&crtc->base, degamma_id)) use_gamma_lut = false; else return -ENODEV; state = drm_atomic_state_alloc(crtc->dev); if (!state) return -ENOMEM; blob = drm_property_create_blob(dev, sizeof(struct drm_color_lut) * size, NULL); if (IS_ERR(blob)) { ret = PTR_ERR(blob); blob = NULL; goto fail; } /* Prepare GAMMA_LUT with the legacy values. */ blob_data = blob->data; for (i = 0; i < size; i++) { blob_data[i].red = red[i]; blob_data[i].green = green[i]; blob_data[i].blue = blue[i]; } state->acquire_ctx = ctx; crtc_state = drm_atomic_get_crtc_state(state, crtc); if (IS_ERR(crtc_state)) { ret = PTR_ERR(crtc_state); goto fail; } /* Set GAMMA_LUT and reset DEGAMMA_LUT and CTM */ replaced = drm_property_replace_blob(&crtc_state->degamma_lut, use_gamma_lut ? NULL : blob); replaced |= drm_property_replace_blob(&crtc_state->ctm, NULL); replaced |= drm_property_replace_blob(&crtc_state->gamma_lut, use_gamma_lut ? blob : NULL); crtc_state->color_mgmt_changed |= replaced; ret = drm_atomic_commit(state); fail: drm_atomic_state_put(state); drm_property_blob_put(blob); return ret; } /** * drm_mode_gamma_set_ioctl - set the gamma table * @dev: DRM device * @data: ioctl data * @file_priv: DRM file info * * Set the gamma table of a CRTC to the one passed in by the user. Userspace can * inquire the required gamma table size through drm_mode_gamma_get_ioctl. * * Called by the user via ioctl. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_gamma_set_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_crtc_lut *crtc_lut = data; struct drm_crtc *crtc; void *r_base, *g_base, *b_base; int size; struct drm_modeset_acquire_ctx ctx; int ret = 0; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; crtc = drm_crtc_find(dev, file_priv, crtc_lut->crtc_id); if (!crtc) return -ENOENT; if (!drm_crtc_supports_legacy_gamma(crtc)) return -ENOSYS; /* memcpy into gamma store */ if (crtc_lut->gamma_size != crtc->gamma_size) return -EINVAL; DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, 0, ret); size = crtc_lut->gamma_size * (sizeof(uint16_t)); r_base = crtc->gamma_store; if (copy_from_user(r_base, (void __user *)(unsigned long)crtc_lut->red, size)) { ret = -EFAULT; goto out; } g_base = r_base + size; if (copy_from_user(g_base, (void __user *)(unsigned long)crtc_lut->green, size)) { ret = -EFAULT; goto out; } b_base = g_base + size; if (copy_from_user(b_base, (void __user *)(unsigned long)crtc_lut->blue, size)) { ret = -EFAULT; goto out; } ret = drm_crtc_legacy_gamma_set(crtc, r_base, g_base, b_base, crtc->gamma_size, &ctx); out: DRM_MODESET_LOCK_ALL_END(dev, ctx, ret); return ret; } /** * drm_mode_gamma_get_ioctl - get the gamma table * @dev: DRM device * @data: ioctl data * @file_priv: DRM file info * * Copy the current gamma table into the storage provided. This also provides * the gamma table size the driver expects, which can be used to size the * allocated storage. * * Called by the user via ioctl. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_gamma_get_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_crtc_lut *crtc_lut = data; struct drm_crtc *crtc; void *r_base, *g_base, *b_base; int size; int ret = 0; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; crtc = drm_crtc_find(dev, file_priv, crtc_lut->crtc_id); if (!crtc) return -ENOENT; /* memcpy into gamma store */ if (crtc_lut->gamma_size != crtc->gamma_size) return -EINVAL; drm_modeset_lock(&crtc->mutex, NULL); size = crtc_lut->gamma_size * (sizeof(uint16_t)); r_base = crtc->gamma_store; if (copy_to_user((void __user *)(unsigned long)crtc_lut->red, r_base, size)) { ret = -EFAULT; goto out; } g_base = r_base + size; if (copy_to_user((void __user *)(unsigned long)crtc_lut->green, g_base, size)) { ret = -EFAULT; goto out; } b_base = g_base + size; if (copy_to_user((void __user *)(unsigned long)crtc_lut->blue, b_base, size)) { ret = -EFAULT; goto out; } out: drm_modeset_unlock(&crtc->mutex); return ret; } static const char * const color_encoding_name[] = { [DRM_COLOR_YCBCR_BT601] = "ITU-R BT.601 YCbCr", [DRM_COLOR_YCBCR_BT709] = "ITU-R BT.709 YCbCr", [DRM_COLOR_YCBCR_BT2020] = "ITU-R BT.2020 YCbCr", }; static const char * const color_range_name[] = { [DRM_COLOR_YCBCR_FULL_RANGE] = "YCbCr full range", [DRM_COLOR_YCBCR_LIMITED_RANGE] = "YCbCr limited range", }; /** * drm_get_color_encoding_name - return a string for color encoding * @encoding: color encoding to compute name of * * In contrast to the other drm_get_*_name functions this one here returns a * const pointer and hence is threadsafe. */ const char *drm_get_color_encoding_name(enum drm_color_encoding encoding) { if (WARN_ON(encoding >= ARRAY_SIZE(color_encoding_name))) return "unknown"; return color_encoding_name[encoding]; } /** * drm_get_color_range_name - return a string for color range * @range: color range to compute name of * * In contrast to the other drm_get_*_name functions this one here returns a * const pointer and hence is threadsafe. */ const char *drm_get_color_range_name(enum drm_color_range range) { if (WARN_ON(range >= ARRAY_SIZE(color_range_name))) return "unknown"; return color_range_name[range]; } /** * drm_plane_create_color_properties - color encoding related plane properties * @plane: plane object * @supported_encodings: bitfield indicating supported color encodings * @supported_ranges: bitfileld indicating supported color ranges * @default_encoding: default color encoding * @default_range: default color range * * Create and attach plane specific COLOR_ENCODING and COLOR_RANGE * properties to @plane. The supported encodings and ranges should * be provided in supported_encodings and supported_ranges bitmasks. * Each bit set in the bitmask indicates that its number as enum * value is supported. */ int drm_plane_create_color_properties(struct drm_plane *plane, u32 supported_encodings, u32 supported_ranges, enum drm_color_encoding default_encoding, enum drm_color_range default_range) { struct drm_device *dev = plane->dev; struct drm_property *prop; struct drm_prop_enum_list enum_list[MAX_T(int, DRM_COLOR_ENCODING_MAX, DRM_COLOR_RANGE_MAX)]; int i, len; if (WARN_ON(supported_encodings == 0 || (supported_encodings & -BIT(DRM_COLOR_ENCODING_MAX)) != 0 || (supported_encodings & BIT(default_encoding)) == 0)) return -EINVAL; if (WARN_ON(supported_ranges == 0 || (supported_ranges & -BIT(DRM_COLOR_RANGE_MAX)) != 0 || (supported_ranges & BIT(default_range)) == 0)) return -EINVAL; len = 0; for (i = 0; i < DRM_COLOR_ENCODING_MAX; i++) { if ((supported_encodings & BIT(i)) == 0) continue; enum_list[len].type = i; enum_list[len].name = color_encoding_name[i]; len++; } prop = drm_property_create_enum(dev, 0, "COLOR_ENCODING", enum_list, len); if (!prop) return -ENOMEM; plane->color_encoding_property = prop; drm_object_attach_property(&plane->base, prop, default_encoding); if (plane->state) plane->state->color_encoding = default_encoding; len = 0; for (i = 0; i < DRM_COLOR_RANGE_MAX; i++) { if ((supported_ranges & BIT(i)) == 0) continue; enum_list[len].type = i; enum_list[len].name = color_range_name[i]; len++; } prop = drm_property_create_enum(dev, 0, "COLOR_RANGE", enum_list, len); if (!prop) return -ENOMEM; plane->color_range_property = prop; drm_object_attach_property(&plane->base, prop, default_range); if (plane->state) plane->state->color_range = default_range; return 0; } EXPORT_SYMBOL(drm_plane_create_color_properties); /** * drm_color_lut_check - check validity of lookup table * @lut: property blob containing LUT to check * @tests: bitmask of tests to run * * Helper to check whether a userspace-provided lookup table is valid and * satisfies hardware requirements. Drivers pass a bitmask indicating which of * the tests in &drm_color_lut_tests should be performed. * * Returns 0 on success, -EINVAL on failure. */ int drm_color_lut_check(const struct drm_property_blob *lut, u32 tests) { const struct drm_color_lut *entry; int i; if (!lut || !tests) return 0; entry = lut->data; for (i = 0; i < drm_color_lut_size(lut); i++) { if (tests & DRM_COLOR_LUT_EQUAL_CHANNELS) { if (entry[i].red != entry[i].blue || entry[i].red != entry[i].green) { DRM_DEBUG_KMS("All LUT entries must have equal r/g/b\n"); return -EINVAL; } } if (i > 0 && tests & DRM_COLOR_LUT_NON_DECREASING) { if (entry[i].red < entry[i - 1].red || entry[i].green < entry[i - 1].green || entry[i].blue < entry[i - 1].blue) { DRM_DEBUG_KMS("LUT entries must never decrease.\n"); return -EINVAL; } } } return 0; } EXPORT_SYMBOL(drm_color_lut_check);
18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_BUF_H__ #define __XFS_BUF_H__ #include <linux/list.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/dax.h> #include <linux/uio.h> #include <linux/list_lru.h> extern struct kmem_cache *xfs_buf_cache; /* * Base types */ struct xfs_buf; #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) #define XBF_READ (1u << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1u << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */ #define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */ #define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */ #define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */ #define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */ /* buffer type flags for write callbacks */ #define _XBF_INODES (1u << 16)/* inode buffer */ #define _XBF_DQUOTS (1u << 17)/* dquot buffer */ #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ /* flags used only internally */ #define _XBF_PAGES (1u << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1u << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ /* flags used only as arguments to access routines */ /* * Online fsck is scanning the buffer cache for live buffers. Do not warn * about length mismatches during lookups and do not return stale buffers. */ #define XBF_LIVESCAN (1u << 28) #define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ #define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ { XBF_NO_IOACCT, "NO_IOACCT" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { _XBF_INODES, "INODES" }, \ { _XBF_DQUOTS, "DQUOTS" }, \ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following interface flags should never be set */ \ { XBF_LIVESCAN, "LIVESCAN" }, \ { XBF_INCORE, "INCORE" }, \ { XBF_TRYLOCK, "TRYLOCK" }, \ { XBF_UNMAPPED, "UNMAPPED" } /* * Internal state flags. */ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ #define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ struct xfs_buf_cache { spinlock_t bc_lock; struct rhashtable bc_hash; }; int xfs_buf_cache_init(struct xfs_buf_cache *bch); void xfs_buf_cache_destroy(struct xfs_buf_cache *bch); /* * The xfs_buftarg contains 2 notions of "sector size" - * * 1) The metadata sector size, which is the minimum unit and * alignment of IO which will be performed by metadata operations. * 2) The device logical sector size * * The first is specified at mkfs time, and is stored on-disk in the * superblock's sb_sectsize. * * The latter is derived from the underlying device, and controls direct IO * alignment constraints. */ struct xfs_buftarg { dev_t bt_dev; struct file *bt_bdev_file; struct block_device *bt_bdev; struct dax_device *bt_daxdev; struct file *bt_file; u64 bt_dax_part_off; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; size_t bt_meta_sectormask; size_t bt_logical_sectorsize; size_t bt_logical_sectormask; /* LRU control structures */ struct shrinker *bt_shrinker; struct list_lru bt_lru; struct percpu_counter bt_io_count; struct ratelimit_state bt_ioerror_rl; /* Atomic write unit values */ unsigned int bt_bdev_awu_min; unsigned int bt_bdev_awu_max; /* built-in cache, if we're not using the perag one */ struct xfs_buf_cache bt_cache[]; }; #define XB_PAGES 2 struct xfs_buf_map { xfs_daddr_t bm_bn; /* block number for I/O */ int bm_len; /* size of I/O */ unsigned int bm_flags; }; /* * Online fsck is scanning the buffer cache for live buffers. Do not warn * about length mismatches during lookups and do not return stale buffers. */ #define XBM_LIVESCAN (1U << 0) #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; struct xfs_buf_ops { char *name; union { __be32 magic[2]; /* v4 and v5 on disk magic values */ __be16 magic16[2]; /* v4 and v5 on disk magic values */ }; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp); }; struct xfs_buf { /* * first cacheline holds all the fields needed for an uncontended cache * hit to be fully processed. The semaphore straddles the cacheline * boundary, but the counter and lock sits on the first cacheline, * which is the only bit that is touched if we hit the semaphore * fast-path on locking. */ struct rhash_head b_rhash_head; /* pag buffer hash node */ xfs_daddr_t b_rhash_key; /* buffer cache index */ int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ /* * concurrent access to b_lru and b_lru_flags are protected by * bt_lru_lock and not by b_sema */ struct list_head b_lru; /* lru list */ spinlock_t b_lock; /* internal state lock */ unsigned int b_state; /* internal state flags */ int b_io_error; /* internal IO error state */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ struct xfs_mount *b_mount; struct xfs_buftarg *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ struct work_struct b_ioend_work; struct completion b_iowait; /* queue for I/O waiters */ struct xfs_buf_log_item *b_log_item; struct list_head b_li_list; /* Log items list head */ struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ struct xfs_buf_map *b_maps; /* compound buffer map */ struct xfs_buf_map __b_map; /* inline compound buffer map */ int b_map_count; atomic_t b_pin_count; /* pin count */ atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset of b_addr, only for _XBF_KMEM buffers */ int b_error; /* error code on I/O */ /* * async write failure retry count. Initialised to zero on the first * failure, then when it exceeds the maximum configured without a * success the write is considered to be failed permanently and the * iodone handler will take appropriate action. * * For retry timeouts, we record the jiffy of the first failure. This * means that we can change the retry timeout for buffers already under * I/O and thus avoid getting stuck in a retry loop with a long timeout. * * last_error is used to ensure that we are getting repeated errors, not * different errors. e.g. a block device might change ENOSPC to EIO when * a failure timeout occurs, so we want to re-initialise the error * retry behaviour appropriately when that happens. */ int b_retries; unsigned long b_first_retry_time; /* in jiffies */ int b_last_error; const struct xfs_buf_ops *b_ops; struct rcu_head b_rcu; }; /* Finding and Reading Buffers */ int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops, xfs_failaddr_t fa); void xfs_buf_readahead_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, const struct xfs_buf_ops *ops); static inline int xfs_buf_incore( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_buf_get_map(target, &map, 1, XBF_INCORE | flags, bpp); } static inline int xfs_buf_get( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_buf_get_map(target, &map, 1, 0, bpp); } static inline int xfs_buf_read( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_buf_read_map(target, &map, 1, flags, bpp, ops, __builtin_return_address(0)); } static inline void xfs_buf_readahead( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_buf_readahead_map(target, &map, 1, ops); } int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ extern void xfs_buf_rele(struct xfs_buf *); /* Locking and Unlocking Buffers */ extern int xfs_buf_trylock(struct xfs_buf *); extern void xfs_buf_lock(struct xfs_buf *); extern void xfs_buf_unlock(struct xfs_buf *); #define xfs_buf_islocked(bp) \ ((bp)->b_sema.count <= 0) static inline void xfs_buf_relse(struct xfs_buf *bp) { xfs_buf_unlock(bp); xfs_buf_rele(bp); } /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); void xfs_buf_ioend_fail(struct xfs_buf *); void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); #define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) /* Buffer Utility Routines */ extern void *xfs_buf_offset(struct xfs_buf *, size_t); extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp) { return bp->b_maps[0].bm_bn; } void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref); /* * If the buffer is already on the LRU, do nothing. Otherwise set the buffer * up with a reference count of 0 so it will be tossed from the cache when * released. */ static inline void xfs_buf_oneshot(struct xfs_buf *bp) { if (!list_empty(&bp->b_lru) || atomic_read(&bp->b_lru_ref) > 1) return; atomic_set(&bp->b_lru_ref, 0); } static inline int xfs_buf_ispinned(struct xfs_buf *bp) { return atomic_read(&bp->b_pin_count); } static inline int xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset) { return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), cksum_offset); } static inline void xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) { xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), cksum_offset); } /* * Handling of buftargs. */ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, struct file *bdev_file); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); /* for xfs_buf_mem.c only: */ int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize, const char *descr); void xfs_destroy_buftarg(struct xfs_buftarg *btp); #endif /* __XFS_BUF_H__ */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/sunrpc/addr.h * * Various routines for copying and comparing sockaddrs and for * converting them to and from presentation format. */ #ifndef _LINUX_SUNRPC_ADDR_H #define _LINUX_SUNRPC_ADDR_H #include <linux/socket.h> #include <linux/in.h> #include <linux/in6.h> #include <net/ipv6.h> size_t rpc_ntop(const struct sockaddr *, char *, const size_t); size_t rpc_pton(struct net *, const char *, const size_t, struct sockaddr *, const size_t); char * rpc_sockaddr2uaddr(const struct sockaddr *, gfp_t); size_t rpc_uaddr2sockaddr(struct net *, const char *, const size_t, struct sockaddr *, const size_t); static inline unsigned short rpc_get_port(const struct sockaddr *sap) { switch (sap->sa_family) { case AF_INET: return ntohs(((struct sockaddr_in *)sap)->sin_port); case AF_INET6: return ntohs(((struct sockaddr_in6 *)sap)->sin6_port); } return 0; } static inline void rpc_set_port(struct sockaddr *sap, const unsigned short port) { switch (sap->sa_family) { case AF_INET: ((struct sockaddr_in *)sap)->sin_port = htons(port); break; case AF_INET6: ((struct sockaddr_in6 *)sap)->sin6_port = htons(port); break; } } #define IPV6_SCOPE_DELIMITER '%' #define IPV6_SCOPE_ID_LEN sizeof("%nnnnnnnnnn") static inline bool rpc_cmp_addr4(const struct sockaddr *sap1, const struct sockaddr *sap2) { const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1; const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2; return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; } static inline bool __rpc_copy_addr4(struct sockaddr *dst, const struct sockaddr *src) { const struct sockaddr_in *ssin = (struct sockaddr_in *) src; struct sockaddr_in *dsin = (struct sockaddr_in *) dst; dsin->sin_family = ssin->sin_family; dsin->sin_addr.s_addr = ssin->sin_addr.s_addr; return true; } #if IS_ENABLED(CONFIG_IPV6) static inline bool rpc_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) { const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1; const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) return false; else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) return sin1->sin6_scope_id == sin2->sin6_scope_id; return true; } static inline bool __rpc_copy_addr6(struct sockaddr *dst, const struct sockaddr *src) { const struct sockaddr_in6 *ssin6 = (const struct sockaddr_in6 *) src; struct sockaddr_in6 *dsin6 = (struct sockaddr_in6 *) dst; dsin6->sin6_family = ssin6->sin6_family; dsin6->sin6_addr = ssin6->sin6_addr; dsin6->sin6_scope_id = ssin6->sin6_scope_id; return true; } #else /* !(IS_ENABLED(CONFIG_IPV6) */ static inline bool rpc_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) { return false; } static inline bool __rpc_copy_addr6(struct sockaddr *dst, const struct sockaddr *src) { return false; } #endif /* !(IS_ENABLED(CONFIG_IPV6) */ /** * rpc_cmp_addr - compare the address portion of two sockaddrs. * @sap1: first sockaddr * @sap2: second sockaddr * * Just compares the family and address portion. Ignores port, but * compares the scope if it's a link-local address. * * Returns true if the addrs are equal, false if they aren't. */ static inline bool rpc_cmp_addr(const struct sockaddr *sap1, const struct sockaddr *sap2) { if (sap1->sa_family == sap2->sa_family) { switch (sap1->sa_family) { case AF_INET: return rpc_cmp_addr4(sap1, sap2); case AF_INET6: return rpc_cmp_addr6(sap1, sap2); } } return false; } /** * rpc_cmp_addr_port - compare the address and port number of two sockaddrs. * @sap1: first sockaddr * @sap2: second sockaddr */ static inline bool rpc_cmp_addr_port(const struct sockaddr *sap1, const struct sockaddr *sap2) { if (!rpc_cmp_addr(sap1, sap2)) return false; return rpc_get_port(sap1) == rpc_get_port(sap2); } /** * rpc_copy_addr - copy the address portion of one sockaddr to another * @dst: destination sockaddr * @src: source sockaddr * * Just copies the address portion and family. Ignores port, scope, etc. * Caller is responsible for making certain that dst is large enough to hold * the address in src. Returns true if address family is supported. Returns * false otherwise. */ static inline bool rpc_copy_addr(struct sockaddr *dst, const struct sockaddr *src) { switch (src->sa_family) { case AF_INET: return __rpc_copy_addr4(dst, src); case AF_INET6: return __rpc_copy_addr6(dst, src); } return false; } /** * rpc_get_scope_id - return scopeid for a given sockaddr * @sa: sockaddr to get scopeid from * * Returns the value of the sin6_scope_id for AF_INET6 addrs, or 0 if * not an AF_INET6 address. */ static inline u32 rpc_get_scope_id(const struct sockaddr *sa) { if (sa->sa_family != AF_INET6) return 0; return ((struct sockaddr_in6 *) sa)->sin6_scope_id; } #endif /* _LINUX_SUNRPC_ADDR_H */
111 110 111 5 1 108 5 6 106 88 20 19 87 130 9 128 3 1 32 30 2 1 58 57 10 10 39 7 6 45 44 1 19 18 1 412 316 92 47 199 154 7 81 150 5 21 9 42 2 97 81 326 110 5 104 339 517 515 1 1 30 30 41 1 57 56 68 1 28 6 33 4 38 1 19 19 1 1 59 2 328 37 5 437 8 45 444 32 471 513 121 36 31 3 1 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 // SPDX-License-Identifier: GPL-2.0-or-later /* Manage a process's keyrings * * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/security.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> #include <linux/init_task.h> #include <keys/request_key_auth-type.h> #include "internal.h" /* Session keyring create vs join semaphore */ static DEFINE_MUTEX(key_session_mutex); /* The root user's tracking struct */ struct key_user root_key_user = { .usage = REFCOUNT_INIT(3), .cons_lock = __MUTEX_INITIALIZER(root_key_user.cons_lock), .lock = __SPIN_LOCK_UNLOCKED(root_key_user.lock), .nkeys = ATOMIC_INIT(2), .nikeys = ATOMIC_INIT(2), .uid = GLOBAL_ROOT_UID, }; /* * Get or create a user register keyring. */ static struct key *get_user_register(struct user_namespace *user_ns) { struct key *reg_keyring = READ_ONCE(user_ns->user_keyring_register); if (reg_keyring) return reg_keyring; down_write(&user_ns->keyring_sem); /* Make sure there's a register keyring. It gets owned by the * user_namespace's owner. */ reg_keyring = user_ns->user_keyring_register; if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, &init_cred, KEY_POS_WRITE | KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ, 0, NULL, NULL); if (!IS_ERR(reg_keyring)) smp_store_release(&user_ns->user_keyring_register, reg_keyring); } up_write(&user_ns->keyring_sem); /* We don't return a ref since the keyring is pinned by the user_ns */ return reg_keyring; } /* * Look up the user and user session keyrings for the current process's UID, * creating them if they don't exist. */ int look_up_user_keyrings(struct key **_user_keyring, struct key **_user_session_keyring) { const struct cred *cred = current_cred(); struct user_namespace *user_ns = current_user_ns(); struct key *reg_keyring, *uid_keyring, *session_keyring; key_perm_t user_keyring_perm; key_ref_t uid_keyring_r, session_keyring_r; uid_t uid = from_kuid(user_ns, cred->user->uid); char buf[20]; int ret; user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; kenter("%u", uid); reg_keyring = get_user_register(user_ns); if (IS_ERR(reg_keyring)) return PTR_ERR(reg_keyring); down_write(&user_ns->keyring_sem); ret = 0; /* Get the user keyring. Note that there may be one in existence * already as it may have been pinned by a session, but the user_struct * pointing to it may have been destroyed by setuid. */ snprintf(buf, sizeof(buf), "_uid.%u", uid); uid_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid %p", uid_keyring_r); if (uid_keyring_r == ERR_PTR(-EAGAIN)) { uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, reg_keyring); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; } } else if (IS_ERR(uid_keyring_r)) { ret = PTR_ERR(uid_keyring_r); goto error; } else { uid_keyring = key_ref_to_ptr(uid_keyring_r); } /* Get a default session keyring (which might also exist already) */ snprintf(buf, sizeof(buf), "_uid_ses.%u", uid); session_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid_ses %p", session_keyring_r); if (session_keyring_r == ERR_PTR(-EAGAIN)) { session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; } /* We install a link from the user session keyring to * the user keyring. */ ret = key_link(session_keyring, uid_keyring); if (ret < 0) goto error_release_session; /* And only then link the user-session keyring to the * register. */ ret = key_link(reg_keyring, session_keyring); if (ret < 0) goto error_release_session; } else if (IS_ERR(session_keyring_r)) { ret = PTR_ERR(session_keyring_r); goto error_release; } else { session_keyring = key_ref_to_ptr(session_keyring_r); } up_write(&user_ns->keyring_sem); if (_user_session_keyring) *_user_session_keyring = session_keyring; else key_put(session_keyring); if (_user_keyring) *_user_keyring = uid_keyring; else key_put(uid_keyring); kleave(" = 0"); return 0; error_release_session: key_put(session_keyring); error_release: key_put(uid_keyring); error: up_write(&user_ns->keyring_sem); kleave(" = %d", ret); return ret; } /* * Get the user session keyring if it exists, but don't create it if it * doesn't. */ struct key *get_user_session_keyring_rcu(const struct cred *cred) { struct key *reg_keyring = READ_ONCE(cred->user_ns->user_keyring_register); key_ref_t session_keyring_r; char buf[20]; struct keyring_search_context ctx = { .index_key.type = &key_type_keyring, .index_key.description = buf, .cred = cred, .match_data.cmp = key_default_cmp, .match_data.raw_data = buf, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; if (!reg_keyring) return NULL; ctx.index_key.desc_len = snprintf(buf, sizeof(buf), "_uid_ses.%u", from_kuid(cred->user_ns, cred->user->uid)); session_keyring_r = keyring_search_rcu(make_key_ref(reg_keyring, true), &ctx); if (IS_ERR(session_keyring_r)) return NULL; return key_ref_to_ptr(session_keyring_r); } /* * Install a thread keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a thread keyring is now present; -errno on failure. */ int install_thread_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->thread_keyring) return 0; keyring = keyring_alloc("_tid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->thread_keyring = keyring; return 0; } /* * Install a thread keyring to the current task if it didn't have one already. * * Return: 0 if a thread keyring is now present; -errno on failure. */ static int install_thread_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_thread_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install a process keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a process keyring is now present; -errno on failure. */ int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->process_keyring) return 0; keyring = keyring_alloc("_pid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->process_keyring = keyring; return 0; } /* * Install a process keyring to the current task if it didn't have one already. * * Return: 0 if a process keyring is now present; -errno on failure. */ static int install_process_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_process_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install the given keyring as the session keyring of the given credentials * struct, replacing the existing one if any. If the given keyring is NULL, * then install a new anonymous session keyring. * @cred can not be in use by any task yet. * * Return: 0 on success; -errno on failure. */ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) { unsigned long flags; struct key *old; might_sleep(); /* create an empty session keyring */ if (!keyring) { flags = KEY_ALLOC_QUOTA_OVERRUN; if (cred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, flags, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { __key_get(keyring); } /* install the keyring */ old = cred->session_keyring; cred->session_keyring = keyring; if (old) key_put(old); return 0; } /* * Install the given keyring as the session keyring of the current task, * replacing the existing one if any. If the given keyring is NULL, then * install a new anonymous session keyring. * * Return: 0 on success; -errno on failure. */ static int install_session_keyring(struct key *keyring) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Handle the fsuid changing. */ void key_fsuid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->uid = new_cred->fsuid; up_write(&new_cred->thread_keyring->sem); } } /* * Handle the fsgid changing. */ void key_fsgid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->gid = new_cred->fsgid; up_write(&new_cred->thread_keyring->sem); } } /* * Search the process keyrings attached to the supplied cred for the first * matching key under RCU conditions (the caller must be holding the RCU read * lock). * * The search criteria are the type and the match function. The description is * given to the match function as a parameter, but doesn't otherwise influence * the search. Typically the match function will compare the description * parameter to the key's description. * * This can only search keyrings that grant Search permission to the supplied * credentials. Keyrings linked to searched keyrings will also be searched if * they grant Search permission too. Keys can only be found if they grant * Search permission to the credentials. * * Returns a pointer to the key with the key usage count incremented if * successful, -EAGAIN if we didn't find any matching key or -ENOKEY if we only * matched negative keys. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx) { struct key *user_session; key_ref_t key_ref, ret, err; const struct cred *cred = ctx->cred; /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if * none of the keyrings were searchable * * in terms of priority: success > -ENOKEY > -EAGAIN > other error */ key_ref = NULL; ret = NULL; err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ if (cred->thread_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->thread_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the process keyring second */ if (cred->process_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->process_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the session keyring */ if (cred->session_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->session_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* or search the user-session keyring */ else if ((user_session = get_user_session_keyring_rcu(cred))) { key_ref = keyring_search_rcu(make_key_ref(user_session, 1), ctx); key_put(user_session); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* no key - decide on the error we're going to go for */ key_ref = ret ? ret : err; found: return key_ref; } /* * Search the process keyrings attached to the supplied cred for the first * matching key in the manner of search_my_process_keyrings(), but also search * the keys attached to the assumed authorisation key using its credentials if * one is available. * * The caller must be holding the RCU read lock. * * Return same as search_cred_keyrings_rcu(). */ key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx) { struct request_key_auth *rka; key_ref_t key_ref, ret = ERR_PTR(-EACCES), err; key_ref = search_cred_keyrings_rcu(ctx); if (!IS_ERR(key_ref)) goto found; err = key_ref; /* if this process has an instantiation authorisation key, then we also * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ if (ctx->cred->request_key_auth && ctx->cred == current_cred() && ctx->index_key.type != &key_type_request_key_auth ) { const struct cred *cred = ctx->cred; if (key_validate(cred->request_key_auth) == 0) { rka = ctx->cred->request_key_auth->payload.data[0]; //// was search_process_keyrings() [ie. recursive] ctx->cred = rka->cred; key_ref = search_cred_keyrings_rcu(ctx); ctx->cred = cred; if (!IS_ERR(key_ref)) goto found; ret = key_ref; } } /* no key - decide on the error we're going to go for */ if (err == ERR_PTR(-ENOKEY) || ret == ERR_PTR(-ENOKEY)) key_ref = ERR_PTR(-ENOKEY); else if (err == ERR_PTR(-EACCES)) key_ref = ret; else key_ref = err; found: return key_ref; } /* * See if the key we're looking at is the target key. */ bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data) { return key == match_data->raw_data; } /* * Look up a key ID given us by userspace with a given permissions mask to get * the key it refers to. * * Flags can be passed to request that special keyrings be created if referred * to directly, to permit partially constructed keys to be found and to skip * validity and permission checks on the found key. * * Returns a pointer to the key with an incremented usage count if successful; * -EINVAL if the key ID is invalid; -ENOKEY if the key ID does not correspond * to a key or the best found key was a negative key; -EKEYREVOKED or * -EKEYEXPIRED if the best found key was revoked or expired; -EACCES if the * found key doesn't grant the requested permit or the LSM denied access to it; * or -ENOMEM if a special keyring couldn't be created. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, enum key_need_perm need_perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_RECURSE), }; struct request_key_auth *rka; struct key *key, *user_session; key_ref_t key_ref, skey_ref; int ret; try_again: ctx.cred = get_current_cred(); key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: if (!ctx.cred->thread_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_thread_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->thread_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_PROCESS_KEYRING: if (!ctx.cred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_process_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->process_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: if (!ctx.cred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = look_up_user_keyrings(NULL, &user_session); if (ret < 0) goto error; if (lflags & KEY_LOOKUP_CREATE) ret = join_session_keyring(NULL); else ret = install_session_keyring(user_session); key_put(user_session); if (ret < 0) goto error; goto reget_creds; } else if (test_bit(KEY_FLAG_UID_KEYRING, &ctx.cred->session_keyring->flags) && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); if (ret < 0) goto error; goto reget_creds; } key = ctx.cred->session_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_KEYRING: ret = look_up_user_keyrings(&key, NULL); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: ret = look_up_user_keyrings(NULL, &key); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_GROUP_KEYRING: /* group keyrings are not yet supported */ key_ref = ERR_PTR(-EINVAL); goto error; case KEY_SPEC_REQKEY_AUTH_KEY: key = ctx.cred->request_key_auth; if (!key) goto error; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_REQUESTOR_KEYRING: if (!ctx.cred->request_key_auth) goto error; down_read(&ctx.cred->request_key_auth->sem); if (test_bit(KEY_FLAG_REVOKED, &ctx.cred->request_key_auth->flags)) { key_ref = ERR_PTR(-EKEYREVOKED); key = NULL; } else { rka = ctx.cred->request_key_auth->payload.data[0]; key = rka->dest_keyring; __key_get(key); } up_read(&ctx.cred->request_key_auth->sem); if (!key) goto error; key_ref = make_key_ref(key, 1); break; default: key_ref = ERR_PTR(-EINVAL); if (id < 1) goto error; key = key_lookup(id); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error; } key_ref = make_key_ref(key, 0); /* check to see if we possess the key */ ctx.index_key = key->index_key; ctx.match_data.raw_data = key; kdebug("check possessed"); rcu_read_lock(); skey_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); kdebug("possessed=%p", skey_ref); if (!IS_ERR(skey_ref)) { key_put(key); key_ref = skey_ref; } break; } /* unlink does not use the nominated key in any way, so can skip all * the permission checks as it is only concerned with the keyring */ if (need_perm != KEY_NEED_UNLINK) { if (!(lflags & KEY_LOOKUP_PARTIAL)) { ret = wait_for_key_construction(key, true); switch (ret) { case -ERESTARTSYS: goto invalid_key; default: if (need_perm != KEY_AUTHTOKEN_OVERRIDE && need_perm != KEY_DEFER_PERM_CHECK) goto invalid_key; break; case 0: break; } } else if (need_perm != KEY_DEFER_PERM_CHECK) { ret = key_validate(key); if (ret < 0) goto invalid_key; } ret = -EIO; if (!(lflags & KEY_LOOKUP_PARTIAL) && key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; } /* check the permissions */ ret = key_task_permission(key_ref, ctx.cred, need_perm); if (ret < 0) goto invalid_key; key->last_used_at = ktime_get_real_seconds(); error: put_cred(ctx.cred); return key_ref; invalid_key: key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error; /* if we attempted to install a keyring, then it may have caused new * creds to be installed */ reget_creds: put_cred(ctx.cred); goto try_again; } EXPORT_SYMBOL(lookup_user_key); /* * Join the named keyring as the session keyring if possible else attempt to * create a new one of that name and join that. * * If the name is NULL, an empty anonymous keyring will be installed as the * session keyring. * * Named session keyrings are joined with a semaphore held to prevent the * keyrings from going away whilst the attempt is made to going them and also * to prevent a race in creating compatible session keyrings. */ long join_session_keyring(const char *name) { const struct cred *old; struct cred *new; struct key *keyring; long ret, serial; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); /* if no name is provided, install an anonymous keyring */ if (!name) { ret = install_session_keyring_to_cred(new, NULL); if (ret < 0) goto error; serial = new->session_keyring->serial; ret = commit_creds(new); if (ret == 0) ret = serial; goto okay; } /* allow the user to join or create a named keyring */ mutex_lock(&key_session_mutex); /* look for an existing keyring of this name */ keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ keyring = keyring_alloc( name, old->uid, old->gid, old, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } else if (keyring == new->session_keyring) { ret = 0; goto error3; } /* we've got a keyring - now to install it */ ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) goto error3; commit_creds(new); mutex_unlock(&key_session_mutex); ret = keyring->serial; key_put(keyring); okay: return ret; error3: key_put(keyring); error2: mutex_unlock(&key_session_mutex); error: abort_creds(new); return ret; } /* * Replace a process's session keyring on behalf of one of its children when * the target process is about to resume userspace execution. */ void key_change_session_keyring(struct callback_head *twork) { const struct cred *old = current_cred(); struct cred *new = container_of(twork, struct cred, rcu); if (unlikely(current->flags & PF_EXITING)) { put_cred(new); return; } /* If get_ucounts fails more bits are needed in the refcount */ if (unlikely(!get_ucounts(old->ucounts))) { WARN_ONCE(1, "In %s get_ucounts failed\n", __func__); put_cred(new); return; } new-> uid = old-> uid; new-> euid = old-> euid; new-> suid = old-> suid; new->fsuid = old->fsuid; new-> gid = old-> gid; new-> egid = old-> egid; new-> sgid = old-> sgid; new->fsgid = old->fsgid; new->user = get_uid(old->user); new->ucounts = old->ucounts; new->user_ns = get_user_ns(old->user_ns); new->group_info = get_group_info(old->group_info); new->securebits = old->securebits; new->cap_inheritable = old->cap_inheritable; new->cap_permitted = old->cap_permitted; new->cap_effective = old->cap_effective; new->cap_ambient = old->cap_ambient; new->cap_bset = old->cap_bset; new->jit_keyring = old->jit_keyring; new->thread_keyring = key_get(old->thread_keyring); new->process_keyring = key_get(old->process_keyring); security_transfer_creds(new, old); commit_creds(new); } /* * Make sure that root's user and user-session keyrings exist. */ static int __init init_root_keyring(void) { return look_up_user_keyrings(NULL, NULL); } late_initcall(init_root_keyring);
123 1 1 72 5 1 26 22 19 4 3 19 1 2 1 20 5 4 4 4 2 2 2 2 57 56 7 50 51 5 32 3 23 6 36 27 2 2 10 21 5 50 50 50 26 21 3 2 2 2 44 48 25 2 23 31 31 2 3 26 24 1 2 23 29 29 3 5 1 4 4 1 9 6 2 1 2 1 1 3 3 1 2 27 4 23 20 6 8 5 3 2 4 1 7 7 4 3 3 5 5 5 5 2 5 4 4 2 1 2 4 1 3 1 2 3 2 2 2 4 12 12 12 4 8 3 5 1 1 3 5 5 5 5 4 1 1 2 5 1 6 3 1 2 4 2 1 1 7 7 6 6 6 3 2 1 1 5 3 1 2 3 1 2 6 11 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 // SPDX-License-Identifier: GPL-2.0-only /* * A virtual codec example device. * * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved. * * This is a virtual codec device driver for testing the codec framework. * It simulates a device that uses memory buffers for both source and * destination and encodes or decodes the data. */ #include <linux/module.h> #include <linux/delay.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/platform_device.h> #include <media/v4l2-mem2mem.h> #include <media/v4l2-device.h> #include <media/v4l2-ioctl.h> #include <media/v4l2-ctrls.h> #include <media/v4l2-event.h> #include <media/videobuf2-vmalloc.h> #include "codec-v4l2-fwht.h" MODULE_DESCRIPTION("Virtual codec device"); MODULE_AUTHOR("Hans Verkuil <hansverk@cisco.com>"); MODULE_LICENSE("GPL v2"); static bool multiplanar; module_param(multiplanar, bool, 0444); MODULE_PARM_DESC(multiplanar, " use multi-planar API instead of single-planar API"); static unsigned int debug; module_param(debug, uint, 0644); MODULE_PARM_DESC(debug, " activates debug info"); #define VICODEC_NAME "vicodec" #define MAX_WIDTH 4096U #define MIN_WIDTH 640U #define MAX_HEIGHT 2160U #define MIN_HEIGHT 360U /* Recommended number of buffers for the stateful codecs */ #define VICODEC_REC_BUFS 2 #define dprintk(dev, fmt, arg...) \ v4l2_dbg(1, debug, &dev->v4l2_dev, "%s: " fmt, __func__, ## arg) struct pixfmt_info { u32 id; unsigned int bytesperline_mult; unsigned int sizeimage_mult; unsigned int sizeimage_div; unsigned int luma_step; unsigned int chroma_step; /* Chroma plane subsampling */ unsigned int width_div; unsigned int height_div; }; static const struct v4l2_fwht_pixfmt_info pixfmt_fwht = { V4L2_PIX_FMT_FWHT, 0, 3, 1, 1, 1, 1, 1, 0, 1 }; static const struct v4l2_fwht_pixfmt_info pixfmt_stateless_fwht = { V4L2_PIX_FMT_FWHT_STATELESS, 0, 3, 1, 1, 1, 1, 1, 0, 1 }; static void vicodec_dev_release(struct device *dev) { } static struct platform_device vicodec_pdev = { .name = VICODEC_NAME, .dev.release = vicodec_dev_release, }; /* Per-queue, driver-specific private data */ struct vicodec_q_data { unsigned int coded_width; unsigned int coded_height; unsigned int visible_width; unsigned int visible_height; unsigned int sizeimage; unsigned int vb2_sizeimage; unsigned int sequence; const struct v4l2_fwht_pixfmt_info *info; }; enum { V4L2_M2M_SRC = 0, V4L2_M2M_DST = 1, }; struct vicodec_dev_instance { struct video_device vfd; struct mutex mutex; spinlock_t lock; struct v4l2_m2m_dev *m2m_dev; }; struct vicodec_dev { struct v4l2_device v4l2_dev; struct vicodec_dev_instance stateful_enc; struct vicodec_dev_instance stateful_dec; struct vicodec_dev_instance stateless_dec; #ifdef CONFIG_MEDIA_CONTROLLER struct media_device mdev; #endif }; struct vicodec_ctx { struct v4l2_fh fh; struct vicodec_dev *dev; bool is_enc; bool is_stateless; spinlock_t *lock; struct v4l2_ctrl_handler hdl; /* Source and destination queue data */ struct vicodec_q_data q_data[2]; struct v4l2_fwht_state state; u32 cur_buf_offset; u32 comp_max_size; u32 comp_size; u32 header_size; u32 comp_magic_cnt; bool comp_has_frame; bool comp_has_next_frame; bool first_source_change_sent; bool source_changed; }; static const struct v4l2_event vicodec_eos_event = { .type = V4L2_EVENT_EOS }; static inline struct vicodec_ctx *file2ctx(struct file *file) { return container_of(file->private_data, struct vicodec_ctx, fh); } static struct vicodec_q_data *get_q_data(struct vicodec_ctx *ctx, enum v4l2_buf_type type) { switch (type) { case V4L2_BUF_TYPE_VIDEO_OUTPUT: case V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE: return &ctx->q_data[V4L2_M2M_SRC]; case V4L2_BUF_TYPE_VIDEO_CAPTURE: case V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE: return &ctx->q_data[V4L2_M2M_DST]; default: break; } return NULL; } static void copy_cap_to_ref(const u8 *cap, const struct v4l2_fwht_pixfmt_info *info, struct v4l2_fwht_state *state) { int plane_idx; u8 *p_ref = state->ref_frame.buf; unsigned int cap_stride = state->stride; unsigned int ref_stride = state->ref_stride; for (plane_idx = 0; plane_idx < info->planes_num; plane_idx++) { int i; unsigned int h_div = (plane_idx == 1 || plane_idx == 2) ? info->height_div : 1; const u8 *row_cap = cap; u8 *row_ref = p_ref; if (info->planes_num == 3 && plane_idx == 1) { cap_stride /= 2; ref_stride /= 2; } if (plane_idx == 1 && (info->id == V4L2_PIX_FMT_NV24 || info->id == V4L2_PIX_FMT_NV42)) { cap_stride *= 2; ref_stride *= 2; } for (i = 0; i < state->visible_height / h_div; i++) { memcpy(row_ref, row_cap, ref_stride); row_ref += ref_stride; row_cap += cap_stride; } cap += cap_stride * (state->coded_height / h_div); p_ref += ref_stride * (state->coded_height / h_div); } } static bool validate_by_version(unsigned int flags, unsigned int version) { if (!version || version > V4L2_FWHT_VERSION) return false; if (version >= 2) { unsigned int components_num = 1 + ((flags & V4L2_FWHT_FL_COMPONENTS_NUM_MSK) >> V4L2_FWHT_FL_COMPONENTS_NUM_OFFSET); unsigned int pixenc = flags & V4L2_FWHT_FL_PIXENC_MSK; if (components_num == 0 || components_num > 4 || !pixenc) return false; } return true; } static bool validate_stateless_params_flags(const struct v4l2_ctrl_fwht_params *params, const struct v4l2_fwht_pixfmt_info *cur_info) { unsigned int width_div = (params->flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH) ? 1 : 2; unsigned int height_div = (params->flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT) ? 1 : 2; unsigned int components_num = 3; unsigned int pixenc = 0; if (params->version < 3) return false; components_num = 1 + ((params->flags & V4L2_FWHT_FL_COMPONENTS_NUM_MSK) >> V4L2_FWHT_FL_COMPONENTS_NUM_OFFSET); pixenc = (params->flags & V4L2_FWHT_FL_PIXENC_MSK); if (v4l2_fwht_validate_fmt(cur_info, width_div, height_div, components_num, pixenc)) return true; return false; } static void update_state_from_header(struct vicodec_ctx *ctx) { const struct fwht_cframe_hdr *p_hdr = &ctx->state.header; ctx->state.visible_width = ntohl(p_hdr->width); ctx->state.visible_height = ntohl(p_hdr->height); ctx->state.colorspace = ntohl(p_hdr->colorspace); ctx->state.xfer_func = ntohl(p_hdr->xfer_func); ctx->state.ycbcr_enc = ntohl(p_hdr->ycbcr_enc); ctx->state.quantization = ntohl(p_hdr->quantization); } static int device_process(struct vicodec_ctx *ctx, struct vb2_v4l2_buffer *src_vb, struct vb2_v4l2_buffer *dst_vb) { struct vicodec_dev *dev = ctx->dev; struct v4l2_fwht_state *state = &ctx->state; u8 *p_src, *p_dst; int ret = 0; if (ctx->is_enc || ctx->is_stateless) p_src = vb2_plane_vaddr(&src_vb->vb2_buf, 0); else p_src = state->compressed_frame; if (ctx->is_stateless) { struct media_request *src_req = src_vb->vb2_buf.req_obj.req; ret = v4l2_ctrl_request_setup(src_req, &ctx->hdl); if (ret) return ret; update_state_from_header(ctx); ctx->state.header.size = htonl(vb2_get_plane_payload(&src_vb->vb2_buf, 0)); /* * set the reference buffer from the reference timestamp * only if this is a P-frame */ if (!(ntohl(ctx->state.header.flags) & V4L2_FWHT_FL_I_FRAME)) { struct vb2_buffer *ref_vb2_buf; struct vb2_queue *vq_cap = v4l2_m2m_get_vq(ctx->fh.m2m_ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); ref_vb2_buf = vb2_find_buffer(vq_cap, ctx->state.ref_frame_ts); if (!ref_vb2_buf) return -EINVAL; if (ref_vb2_buf->state == VB2_BUF_STATE_ERROR) ret = -EINVAL; ctx->state.ref_frame.buf = vb2_plane_vaddr(ref_vb2_buf, 0); } else { ctx->state.ref_frame.buf = NULL; } } p_dst = vb2_plane_vaddr(&dst_vb->vb2_buf, 0); if (!p_src || !p_dst) { v4l2_err(&dev->v4l2_dev, "Acquiring kernel pointers to buffers failed\n"); return -EFAULT; } if (ctx->is_enc) { struct vicodec_q_data *q_src; int comp_sz_or_errcode; q_src = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_OUTPUT); state->info = q_src->info; comp_sz_or_errcode = v4l2_fwht_encode(state, p_src, p_dst); if (comp_sz_or_errcode < 0) return comp_sz_or_errcode; vb2_set_plane_payload(&dst_vb->vb2_buf, 0, comp_sz_or_errcode); } else { struct vicodec_q_data *q_dst; unsigned int comp_frame_size = ntohl(ctx->state.header.size); q_dst = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); if (comp_frame_size > ctx->comp_max_size) return -EINVAL; state->info = q_dst->info; ret = v4l2_fwht_decode(state, p_src, p_dst); if (ret < 0) return ret; if (!ctx->is_stateless) copy_cap_to_ref(p_dst, ctx->state.info, &ctx->state); vb2_set_plane_payload(&dst_vb->vb2_buf, 0, q_dst->sizeimage); if (ntohl(ctx->state.header.flags) & V4L2_FWHT_FL_I_FRAME) dst_vb->flags |= V4L2_BUF_FLAG_KEYFRAME; else dst_vb->flags |= V4L2_BUF_FLAG_PFRAME; } return ret; } /* * mem2mem callbacks */ static enum vb2_buffer_state get_next_header(struct vicodec_ctx *ctx, u8 **pp, u32 sz) { static const u8 magic[] = { 0x4f, 0x4f, 0x4f, 0x4f, 0xff, 0xff, 0xff, 0xff }; u8 *p = *pp; u32 state; u8 *header = (u8 *)&ctx->state.header; state = VB2_BUF_STATE_DONE; if (!ctx->header_size) { state = VB2_BUF_STATE_ERROR; for (; p < *pp + sz; p++) { u32 copy; p = memchr(p, magic[ctx->comp_magic_cnt], *pp + sz - p); if (!p) { ctx->comp_magic_cnt = 0; p = *pp + sz; break; } copy = sizeof(magic) - ctx->comp_magic_cnt; if (*pp + sz - p < copy) copy = *pp + sz - p; memcpy(header + ctx->comp_magic_cnt, p, copy); ctx->comp_magic_cnt += copy; if (!memcmp(header, magic, ctx->comp_magic_cnt)) { p += copy; state = VB2_BUF_STATE_DONE; break; } ctx->comp_magic_cnt = 0; } if (ctx->comp_magic_cnt < sizeof(magic)) { *pp = p; return state; } ctx->header_size = sizeof(magic); } if (ctx->header_size < sizeof(struct fwht_cframe_hdr)) { u32 copy = sizeof(struct fwht_cframe_hdr) - ctx->header_size; if (*pp + sz - p < copy) copy = *pp + sz - p; memcpy(header + ctx->header_size, p, copy); p += copy; ctx->header_size += copy; } *pp = p; return state; } /* device_run() - prepares and starts the device */ static void device_run(void *priv) { struct vicodec_ctx *ctx = priv; struct vicodec_dev *dev = ctx->dev; struct vb2_v4l2_buffer *src_buf, *dst_buf; struct vicodec_q_data *q_src, *q_dst; u32 state; struct media_request *src_req; src_buf = v4l2_m2m_next_src_buf(ctx->fh.m2m_ctx); dst_buf = v4l2_m2m_dst_buf_remove(ctx->fh.m2m_ctx); src_req = src_buf->vb2_buf.req_obj.req; q_src = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_OUTPUT); q_dst = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); state = VB2_BUF_STATE_DONE; if (device_process(ctx, src_buf, dst_buf)) state = VB2_BUF_STATE_ERROR; else dst_buf->sequence = q_dst->sequence++; dst_buf->flags &= ~V4L2_BUF_FLAG_LAST; v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, false); spin_lock(ctx->lock); if (!ctx->comp_has_next_frame && v4l2_m2m_is_last_draining_src_buf(ctx->fh.m2m_ctx, src_buf)) { dst_buf->flags |= V4L2_BUF_FLAG_LAST; v4l2_event_queue_fh(&ctx->fh, &vicodec_eos_event); v4l2_m2m_mark_stopped(ctx->fh.m2m_ctx); } if (ctx->is_enc || ctx->is_stateless) { src_buf->sequence = q_src->sequence++; src_buf = v4l2_m2m_src_buf_remove(ctx->fh.m2m_ctx); v4l2_m2m_buf_done(src_buf, state); } else if (vb2_get_plane_payload(&src_buf->vb2_buf, 0) == ctx->cur_buf_offset) { src_buf->sequence = q_src->sequence++; src_buf = v4l2_m2m_src_buf_remove(ctx->fh.m2m_ctx); v4l2_m2m_buf_done(src_buf, state); ctx->cur_buf_offset = 0; ctx->comp_has_next_frame = false; } v4l2_m2m_buf_done(dst_buf, state); ctx->comp_size = 0; ctx->header_size = 0; ctx->comp_magic_cnt = 0; ctx->comp_has_frame = false; spin_unlock(ctx->lock); if (ctx->is_stateless && src_req) v4l2_ctrl_request_complete(src_req, &ctx->hdl); if (ctx->is_enc) v4l2_m2m_job_finish(dev->stateful_enc.m2m_dev, ctx->fh.m2m_ctx); else if (ctx->is_stateless) v4l2_m2m_job_finish(dev->stateless_dec.m2m_dev, ctx->fh.m2m_ctx); else v4l2_m2m_job_finish(dev->stateful_dec.m2m_dev, ctx->fh.m2m_ctx); } static void job_remove_src_buf(struct vicodec_ctx *ctx, u32 state) { struct vb2_v4l2_buffer *src_buf; struct vicodec_q_data *q_src; q_src = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_OUTPUT); spin_lock(ctx->lock); src_buf = v4l2_m2m_src_buf_remove(ctx->fh.m2m_ctx); src_buf->sequence = q_src->sequence++; v4l2_m2m_buf_done(src_buf, state); ctx->cur_buf_offset = 0; spin_unlock(ctx->lock); } static const struct v4l2_fwht_pixfmt_info * info_from_header(const struct fwht_cframe_hdr *p_hdr) { unsigned int flags = ntohl(p_hdr->flags); unsigned int width_div = (flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH) ? 1 : 2; unsigned int height_div = (flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT) ? 1 : 2; unsigned int components_num = 3; unsigned int pixenc = 0; unsigned int version = ntohl(p_hdr->version); if (version >= 2) { components_num = 1 + ((flags & V4L2_FWHT_FL_COMPONENTS_NUM_MSK) >> V4L2_FWHT_FL_COMPONENTS_NUM_OFFSET); pixenc = (flags & V4L2_FWHT_FL_PIXENC_MSK); } return v4l2_fwht_find_nth_fmt(width_div, height_div, components_num, pixenc, 0); } static bool is_header_valid(const struct fwht_cframe_hdr *p_hdr) { const struct v4l2_fwht_pixfmt_info *info; unsigned int w = ntohl(p_hdr->width); unsigned int h = ntohl(p_hdr->height); unsigned int version = ntohl(p_hdr->version); unsigned int flags = ntohl(p_hdr->flags); if (w < MIN_WIDTH || w > MAX_WIDTH || h < MIN_HEIGHT || h > MAX_HEIGHT) return false; if (!validate_by_version(flags, version)) return false; info = info_from_header(p_hdr); if (!info) return false; return true; } static void update_capture_data_from_header(struct vicodec_ctx *ctx) { struct vicodec_q_data *q_dst = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); const struct fwht_cframe_hdr *p_hdr = &ctx->state.header; const struct v4l2_fwht_pixfmt_info *info = info_from_header(p_hdr); unsigned int flags = ntohl(p_hdr->flags); unsigned int hdr_width_div = (flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH) ? 1 : 2; unsigned int hdr_height_div = (flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT) ? 1 : 2; /* * This function should not be used by a stateless codec since * it changes values in q_data that are not request specific */ WARN_ON(ctx->is_stateless); q_dst->info = info; q_dst->visible_width = ntohl(p_hdr->width); q_dst->visible_height = ntohl(p_hdr->height); q_dst->coded_width = vic_round_dim(q_dst->visible_width, hdr_width_div); q_dst->coded_height = vic_round_dim(q_dst->visible_height, hdr_height_div); q_dst->sizeimage = q_dst->coded_width * q_dst->coded_height * q_dst->info->sizeimage_mult / q_dst->info->sizeimage_div; ctx->state.colorspace = ntohl(p_hdr->colorspace); ctx->state.xfer_func = ntohl(p_hdr->xfer_func); ctx->state.ycbcr_enc = ntohl(p_hdr->ycbcr_enc); ctx->state.quantization = ntohl(p_hdr->quantization); } static void set_last_buffer(struct vb2_v4l2_buffer *dst_buf, const struct vb2_v4l2_buffer *src_buf, struct vicodec_ctx *ctx) { struct vicodec_q_data *q_dst = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); vb2_set_plane_payload(&dst_buf->vb2_buf, 0, 0); dst_buf->sequence = q_dst->sequence++; v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, !ctx->is_enc); dst_buf->flags |= V4L2_BUF_FLAG_LAST; v4l2_m2m_buf_done(dst_buf, VB2_BUF_STATE_DONE); } static int job_ready(void *priv) { static const u8 magic[] = { 0x4f, 0x4f, 0x4f, 0x4f, 0xff, 0xff, 0xff, 0xff }; struct vicodec_ctx *ctx = priv; struct vb2_v4l2_buffer *src_buf; u8 *p_src; u8 *p; u32 sz; u32 state; struct vicodec_q_data *q_dst = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); unsigned int flags; unsigned int hdr_width_div; unsigned int hdr_height_div; unsigned int max_to_copy; unsigned int comp_frame_size; if (ctx->source_changed) return 0; if (ctx->is_stateless || ctx->is_enc || ctx->comp_has_frame) return 1; restart: ctx->comp_has_next_frame = false; src_buf = v4l2_m2m_next_src_buf(ctx->fh.m2m_ctx); if (!src_buf) return 0; p_src = vb2_plane_vaddr(&src_buf->vb2_buf, 0); sz = vb2_get_plane_payload(&src_buf->vb2_buf, 0); p = p_src + ctx->cur_buf_offset; state = VB2_BUF_STATE_DONE; if (ctx->header_size < sizeof(struct fwht_cframe_hdr)) { state = get_next_header(ctx, &p, p_src + sz - p); if (ctx->header_size < sizeof(struct fwht_cframe_hdr)) { if (v4l2_m2m_is_last_draining_src_buf(ctx->fh.m2m_ctx, src_buf)) return 1; job_remove_src_buf(ctx, state); goto restart; } } comp_frame_size = ntohl(ctx->state.header.size); /* * The current scanned frame might be the first frame of a new * resolution so its size might be larger than ctx->comp_max_size. * In that case it is copied up to the current buffer capacity and * the copy will continue after allocating new large enough buffer * when restreaming */ max_to_copy = min(comp_frame_size, ctx->comp_max_size); if (ctx->comp_size < max_to_copy) { u32 copy = max_to_copy - ctx->comp_size; if (copy > p_src + sz - p) copy = p_src + sz - p; memcpy(ctx->state.compressed_frame + ctx->comp_size, p, copy); p += copy; ctx->comp_size += copy; if (ctx->comp_size < max_to_copy) { if (v4l2_m2m_is_last_draining_src_buf(ctx->fh.m2m_ctx, src_buf)) return 1; job_remove_src_buf(ctx, state); goto restart; } } ctx->cur_buf_offset = p - p_src; if (ctx->comp_size == comp_frame_size) ctx->comp_has_frame = true; ctx->comp_has_next_frame = false; if (ctx->comp_has_frame && sz - ctx->cur_buf_offset >= sizeof(struct fwht_cframe_hdr)) { struct fwht_cframe_hdr *p_hdr = (struct fwht_cframe_hdr *)p; u32 frame_size = ntohl(p_hdr->size); u32 remaining = sz - ctx->cur_buf_offset - sizeof(*p_hdr); if (!memcmp(p, magic, sizeof(magic))) ctx->comp_has_next_frame = remaining >= frame_size; } /* * if the header is invalid the device_run will just drop the frame * with an error */ if (!is_header_valid(&ctx->state.header) && ctx->comp_has_frame) return 1; flags = ntohl(ctx->state.header.flags); hdr_width_div = (flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH) ? 1 : 2; hdr_height_div = (flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT) ? 1 : 2; if (ntohl(ctx->state.header.width) != q_dst->visible_width || ntohl(ctx->state.header.height) != q_dst->visible_height || !q_dst->info || hdr_width_div != q_dst->info->width_div || hdr_height_div != q_dst->info->height_div) { static const struct v4l2_event rs_event = { .type = V4L2_EVENT_SOURCE_CHANGE, .u.src_change.changes = V4L2_EVENT_SRC_CH_RESOLUTION, }; struct vb2_v4l2_buffer *dst_buf = v4l2_m2m_dst_buf_remove(ctx->fh.m2m_ctx); update_capture_data_from_header(ctx); v4l2_event_queue_fh(&ctx->fh, &rs_event); set_last_buffer(dst_buf, src_buf, ctx); ctx->source_changed = true; return 0; } return 1; } /* * video ioctls */ static const struct v4l2_fwht_pixfmt_info *find_fmt(u32 fmt) { const struct v4l2_fwht_pixfmt_info *info = v4l2_fwht_find_pixfmt(fmt); if (!info) info = v4l2_fwht_get_pixfmt(0); return info; } static int vidioc_querycap(struct file *file, void *priv, struct v4l2_capability *cap) { strscpy(cap->driver, VICODEC_NAME, sizeof(cap->driver)); strscpy(cap->card, VICODEC_NAME, sizeof(cap->card)); snprintf(cap->bus_info, sizeof(cap->bus_info), "platform:%s", VICODEC_NAME); return 0; } static int enum_fmt(struct v4l2_fmtdesc *f, struct vicodec_ctx *ctx, bool is_out) { bool is_uncomp = (ctx->is_enc && is_out) || (!ctx->is_enc && !is_out); if (V4L2_TYPE_IS_MULTIPLANAR(f->type) && !multiplanar) return -EINVAL; if (!V4L2_TYPE_IS_MULTIPLANAR(f->type) && multiplanar) return -EINVAL; if (is_uncomp) { const struct v4l2_fwht_pixfmt_info *info = get_q_data(ctx, f->type)->info; if (ctx->is_enc || !vb2_is_streaming(&ctx->fh.m2m_ctx->cap_q_ctx.q)) info = v4l2_fwht_get_pixfmt(f->index); else info = v4l2_fwht_find_nth_fmt(info->width_div, info->height_div, info->components_num, info->pixenc, f->index); if (!info) return -EINVAL; f->pixelformat = info->id; } else { if (f->index) return -EINVAL; f->pixelformat = ctx->is_stateless ? V4L2_PIX_FMT_FWHT_STATELESS : V4L2_PIX_FMT_FWHT; if (!ctx->is_enc && !ctx->is_stateless) f->flags = V4L2_FMT_FLAG_DYN_RESOLUTION | V4L2_FMT_FLAG_CONTINUOUS_BYTESTREAM; } return 0; } static int vidioc_enum_fmt_vid_cap(struct file *file, void *priv, struct v4l2_fmtdesc *f) { struct vicodec_ctx *ctx = file2ctx(file); return enum_fmt(f, ctx, false); } static int vidioc_enum_fmt_vid_out(struct file *file, void *priv, struct v4l2_fmtdesc *f) { struct vicodec_ctx *ctx = file2ctx(file); return enum_fmt(f, ctx, true); } static int vidioc_g_fmt(struct vicodec_ctx *ctx, struct v4l2_format *f) { struct vb2_queue *vq; struct vicodec_q_data *q_data; struct v4l2_pix_format_mplane *pix_mp; struct v4l2_pix_format *pix; const struct v4l2_fwht_pixfmt_info *info; vq = v4l2_m2m_get_vq(ctx->fh.m2m_ctx, f->type); if (!vq) return -EINVAL; q_data = get_q_data(ctx, f->type); info = q_data->info; switch (f->type) { case V4L2_BUF_TYPE_VIDEO_CAPTURE: case V4L2_BUF_TYPE_VIDEO_OUTPUT: if (multiplanar) return -EINVAL; pix = &f->fmt.pix; pix->width = q_data->coded_width; pix->height = q_data->coded_height; pix->field = V4L2_FIELD_NONE; pix->pixelformat = info->id; pix->bytesperline = q_data->coded_width * info->bytesperline_mult; pix->sizeimage = q_data->sizeimage; pix->colorspace = ctx->state.colorspace; pix->xfer_func = ctx->state.xfer_func; pix->ycbcr_enc = ctx->state.ycbcr_enc; pix->quantization = ctx->state.quantization; break; case V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE: case V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE: if (!multiplanar) return -EINVAL; pix_mp = &f->fmt.pix_mp; pix_mp->width = q_data->coded_width; pix_mp->height = q_data->coded_height; pix_mp->field = V4L2_FIELD_NONE; pix_mp->pixelformat = info->id; pix_mp->num_planes = 1; pix_mp->plane_fmt[0].bytesperline = q_data->coded_width * info->bytesperline_mult; pix_mp->plane_fmt[0].sizeimage = q_data->sizeimage; pix_mp->colorspace = ctx->state.colorspace; pix_mp->xfer_func = ctx->state.xfer_func; pix_mp->ycbcr_enc = ctx->state.ycbcr_enc; pix_mp->quantization = ctx->state.quantization; break; default: return -EINVAL; } return 0; } static int vidioc_g_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { return vidioc_g_fmt(file2ctx(file), f); } static int vidioc_g_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { return vidioc_g_fmt(file2ctx(file), f); } static int vidioc_try_fmt(struct vicodec_ctx *ctx, struct v4l2_format *f) { struct v4l2_pix_format_mplane *pix_mp; struct v4l2_pix_format *pix; struct v4l2_plane_pix_format *plane; const struct v4l2_fwht_pixfmt_info *info = ctx->is_stateless ? &pixfmt_stateless_fwht : &pixfmt_fwht; switch (f->type) { case V4L2_BUF_TYPE_VIDEO_CAPTURE: case V4L2_BUF_TYPE_VIDEO_OUTPUT: pix = &f->fmt.pix; if (pix->pixelformat != V4L2_PIX_FMT_FWHT && pix->pixelformat != V4L2_PIX_FMT_FWHT_STATELESS) info = find_fmt(pix->pixelformat); pix->width = clamp(pix->width, MIN_WIDTH, MAX_WIDTH); pix->width = vic_round_dim(pix->width, info->width_div); pix->height = clamp(pix->height, MIN_HEIGHT, MAX_HEIGHT); pix->height = vic_round_dim(pix->height, info->height_div); pix->field = V4L2_FIELD_NONE; pix->bytesperline = pix->width * info->bytesperline_mult; pix->sizeimage = pix->width * pix->height * info->sizeimage_mult / info->sizeimage_div; if (pix->pixelformat == V4L2_PIX_FMT_FWHT) pix->sizeimage += sizeof(struct fwht_cframe_hdr); break; case V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE: case V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE: pix_mp = &f->fmt.pix_mp; plane = pix_mp->plane_fmt; if (pix_mp->pixelformat != V4L2_PIX_FMT_FWHT && pix_mp->pixelformat != V4L2_PIX_FMT_FWHT_STATELESS) info = find_fmt(pix_mp->pixelformat); pix_mp->num_planes = 1; pix_mp->width = clamp(pix_mp->width, MIN_WIDTH, MAX_WIDTH); pix_mp->width = vic_round_dim(pix_mp->width, info->width_div); pix_mp->height = clamp(pix_mp->height, MIN_HEIGHT, MAX_HEIGHT); pix_mp->height = vic_round_dim(pix_mp->height, info->height_div); pix_mp->field = V4L2_FIELD_NONE; plane->bytesperline = pix_mp->width * info->bytesperline_mult; plane->sizeimage = pix_mp->width * pix_mp->height * info->sizeimage_mult / info->sizeimage_div; if (pix_mp->pixelformat == V4L2_PIX_FMT_FWHT) plane->sizeimage += sizeof(struct fwht_cframe_hdr); break; default: return -EINVAL; } return 0; } static int vidioc_try_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vicodec_ctx *ctx = file2ctx(file); struct v4l2_pix_format_mplane *pix_mp; struct v4l2_pix_format *pix; switch (f->type) { case V4L2_BUF_TYPE_VIDEO_CAPTURE: if (multiplanar) return -EINVAL; pix = &f->fmt.pix; pix->pixelformat = ctx->is_enc ? V4L2_PIX_FMT_FWHT : find_fmt(f->fmt.pix.pixelformat)->id; pix->colorspace = ctx->state.colorspace; pix->xfer_func = ctx->state.xfer_func; pix->ycbcr_enc = ctx->state.ycbcr_enc; pix->quantization = ctx->state.quantization; break; case V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE: if (!multiplanar) return -EINVAL; pix_mp = &f->fmt.pix_mp; pix_mp->pixelformat = ctx->is_enc ? V4L2_PIX_FMT_FWHT : find_fmt(pix_mp->pixelformat)->id; pix_mp->colorspace = ctx->state.colorspace; pix_mp->xfer_func = ctx->state.xfer_func; pix_mp->ycbcr_enc = ctx->state.ycbcr_enc; pix_mp->quantization = ctx->state.quantization; break; default: return -EINVAL; } return vidioc_try_fmt(ctx, f); } static int vidioc_try_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vicodec_ctx *ctx = file2ctx(file); struct v4l2_pix_format_mplane *pix_mp; struct v4l2_pix_format *pix; switch (f->type) { case V4L2_BUF_TYPE_VIDEO_OUTPUT: if (multiplanar) return -EINVAL; pix = &f->fmt.pix; if (ctx->is_enc) pix->pixelformat = find_fmt(pix->pixelformat)->id; else if (ctx->is_stateless) pix->pixelformat = V4L2_PIX_FMT_FWHT_STATELESS; else pix->pixelformat = V4L2_PIX_FMT_FWHT; if (!pix->colorspace) pix->colorspace = V4L2_COLORSPACE_REC709; break; case V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE: if (!multiplanar) return -EINVAL; pix_mp = &f->fmt.pix_mp; if (ctx->is_enc) pix_mp->pixelformat = find_fmt(pix_mp->pixelformat)->id; else if (ctx->is_stateless) pix_mp->pixelformat = V4L2_PIX_FMT_FWHT_STATELESS; else pix_mp->pixelformat = V4L2_PIX_FMT_FWHT; if (!pix_mp->colorspace) pix_mp->colorspace = V4L2_COLORSPACE_REC709; break; default: return -EINVAL; } return vidioc_try_fmt(ctx, f); } static int vidioc_s_fmt(struct vicodec_ctx *ctx, struct v4l2_format *f) { struct vicodec_q_data *q_data; struct vb2_queue *vq; bool fmt_changed = true; struct v4l2_pix_format_mplane *pix_mp; struct v4l2_pix_format *pix; vq = v4l2_m2m_get_vq(ctx->fh.m2m_ctx, f->type); if (!vq) return -EINVAL; q_data = get_q_data(ctx, f->type); if (!q_data) return -EINVAL; switch (f->type) { case V4L2_BUF_TYPE_VIDEO_CAPTURE: case V4L2_BUF_TYPE_VIDEO_OUTPUT: pix = &f->fmt.pix; if (ctx->is_enc && V4L2_TYPE_IS_OUTPUT(f->type)) fmt_changed = !q_data->info || q_data->info->id != pix->pixelformat || q_data->coded_width != pix->width || q_data->coded_height != pix->height; if (vb2_is_busy(vq) && fmt_changed) return -EBUSY; if (pix->pixelformat == V4L2_PIX_FMT_FWHT) q_data->info = &pixfmt_fwht; else if (pix->pixelformat == V4L2_PIX_FMT_FWHT_STATELESS) q_data->info = &pixfmt_stateless_fwht; else q_data->info = find_fmt(pix->pixelformat); q_data->coded_width = pix->width; q_data->coded_height = pix->height; q_data->sizeimage = pix->sizeimage; break; case V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE: case V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE: pix_mp = &f->fmt.pix_mp; if (ctx->is_enc && V4L2_TYPE_IS_OUTPUT(f->type)) fmt_changed = !q_data->info || q_data->info->id != pix_mp->pixelformat || q_data->coded_width != pix_mp->width || q_data->coded_height != pix_mp->height; if (vb2_is_busy(vq) && fmt_changed) return -EBUSY; if (pix_mp->pixelformat == V4L2_PIX_FMT_FWHT) q_data->info = &pixfmt_fwht; else if (pix_mp->pixelformat == V4L2_PIX_FMT_FWHT_STATELESS) q_data->info = &pixfmt_stateless_fwht; else q_data->info = find_fmt(pix_mp->pixelformat); q_data->coded_width = pix_mp->width; q_data->coded_height = pix_mp->height; q_data->sizeimage = pix_mp->plane_fmt[0].sizeimage; break; default: return -EINVAL; } dprintk(ctx->dev, "Setting format for type %d, coded wxh: %dx%d, fourcc: 0x%08x\n", f->type, q_data->coded_width, q_data->coded_height, q_data->info->id); return 0; } static int vidioc_s_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { int ret; ret = vidioc_try_fmt_vid_cap(file, priv, f); if (ret) return ret; return vidioc_s_fmt(file2ctx(file), f); } static int vidioc_s_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vicodec_ctx *ctx = file2ctx(file); struct vicodec_q_data *q_data; struct vicodec_q_data *q_data_cap; struct v4l2_pix_format *pix; struct v4l2_pix_format_mplane *pix_mp; u32 coded_w = 0, coded_h = 0; unsigned int size = 0; int ret; q_data = get_q_data(ctx, f->type); q_data_cap = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); ret = vidioc_try_fmt_vid_out(file, priv, f); if (ret) return ret; if (ctx->is_enc) { struct vb2_queue *vq = v4l2_m2m_get_vq(ctx->fh.m2m_ctx, f->type); struct vb2_queue *vq_cap = v4l2_m2m_get_vq(ctx->fh.m2m_ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); const struct v4l2_fwht_pixfmt_info *info = ctx->is_stateless ? &pixfmt_stateless_fwht : &pixfmt_fwht; if (f->type == V4L2_BUF_TYPE_VIDEO_OUTPUT) { coded_w = f->fmt.pix.width; coded_h = f->fmt.pix.height; } else { coded_w = f->fmt.pix_mp.width; coded_h = f->fmt.pix_mp.height; } if (vb2_is_busy(vq) && (coded_w != q_data->coded_width || coded_h != q_data->coded_height)) return -EBUSY; size = coded_w * coded_h * info->sizeimage_mult / info->sizeimage_div; if (!ctx->is_stateless) size += sizeof(struct fwht_cframe_hdr); if (vb2_is_busy(vq_cap) && size > q_data_cap->sizeimage) return -EBUSY; } ret = vidioc_s_fmt(file2ctx(file), f); if (!ret) { if (ctx->is_enc) { q_data->visible_width = coded_w; q_data->visible_height = coded_h; q_data_cap->coded_width = coded_w; q_data_cap->coded_height = coded_h; q_data_cap->sizeimage = size; } switch (f->type) { case V4L2_BUF_TYPE_VIDEO_OUTPUT: pix = &f->fmt.pix; ctx->state.colorspace = pix->colorspace; ctx->state.xfer_func = pix->xfer_func; ctx->state.ycbcr_enc = pix->ycbcr_enc; ctx->state.quantization = pix->quantization; break; case V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE: pix_mp = &f->fmt.pix_mp; ctx->state.colorspace = pix_mp->colorspace; ctx->state.xfer_func = pix_mp->xfer_func; ctx->state.ycbcr_enc = pix_mp->ycbcr_enc; ctx->state.quantization = pix_mp->quantization; break; default: break; } } return ret; } static int vidioc_g_selection(struct file *file, void *priv, struct v4l2_selection *s) { struct vicodec_ctx *ctx = file2ctx(file); struct vicodec_q_data *q_data; q_data = get_q_data(ctx, s->type); if (!q_data) return -EINVAL; /* * encoder supports only cropping on the OUTPUT buffer * decoder supports only composing on the CAPTURE buffer */ if (ctx->is_enc && s->type == V4L2_BUF_TYPE_VIDEO_OUTPUT) { switch (s->target) { case V4L2_SEL_TGT_CROP: s->r.left = 0; s->r.top = 0; s->r.width = q_data->visible_width; s->r.height = q_data->visible_height; return 0; case V4L2_SEL_TGT_CROP_DEFAULT: case V4L2_SEL_TGT_CROP_BOUNDS: s->r.left = 0; s->r.top = 0; s->r.width = q_data->coded_width; s->r.height = q_data->coded_height; return 0; } } else if (!ctx->is_enc && s->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) { switch (s->target) { case V4L2_SEL_TGT_COMPOSE: s->r.left = 0; s->r.top = 0; s->r.width = q_data->visible_width; s->r.height = q_data->visible_height; return 0; case V4L2_SEL_TGT_COMPOSE_DEFAULT: case V4L2_SEL_TGT_COMPOSE_BOUNDS: s->r.left = 0; s->r.top = 0; s->r.width = q_data->coded_width; s->r.height = q_data->coded_height; return 0; } } return -EINVAL; } static int vidioc_s_selection(struct file *file, void *priv, struct v4l2_selection *s) { struct vicodec_ctx *ctx = file2ctx(file); struct vicodec_q_data *q_data; if (s->type != V4L2_BUF_TYPE_VIDEO_OUTPUT) return -EINVAL; q_data = get_q_data(ctx, s->type); if (!q_data) return -EINVAL; if (!ctx->is_enc || s->target != V4L2_SEL_TGT_CROP) return -EINVAL; s->r.left = 0; s->r.top = 0; q_data->visible_width = clamp(s->r.width, MIN_WIDTH, q_data->coded_width); s->r.width = q_data->visible_width; q_data->visible_height = clamp(s->r.height, MIN_HEIGHT, q_data->coded_height); s->r.height = q_data->visible_height; return 0; } static int vicodec_encoder_cmd(struct file *file, void *fh, struct v4l2_encoder_cmd *ec) { struct vicodec_ctx *ctx = file2ctx(file); int ret; ret = v4l2_m2m_ioctl_try_encoder_cmd(file, fh, ec); if (ret < 0) return ret; if (!vb2_is_streaming(&ctx->fh.m2m_ctx->out_q_ctx.q)) return 0; ret = v4l2_m2m_ioctl_encoder_cmd(file, fh, ec); if (ret < 0) return ret; if (ec->cmd == V4L2_ENC_CMD_STOP && v4l2_m2m_has_stopped(ctx->fh.m2m_ctx)) v4l2_event_queue_fh(&ctx->fh, &vicodec_eos_event); if (ec->cmd == V4L2_ENC_CMD_START && v4l2_m2m_has_stopped(ctx->fh.m2m_ctx)) vb2_clear_last_buffer_dequeued(&ctx->fh.m2m_ctx->cap_q_ctx.q); return 0; } static int vicodec_decoder_cmd(struct file *file, void *fh, struct v4l2_decoder_cmd *dc) { struct vicodec_ctx *ctx = file2ctx(file); int ret; /* * This ioctl should not be used with a stateless codec that doesn't * support holding buffers and the associated flush command. */ WARN_ON(ctx->is_stateless); ret = v4l2_m2m_ioctl_try_decoder_cmd(file, fh, dc); if (ret < 0) return ret; if (!vb2_is_streaming(&ctx->fh.m2m_ctx->out_q_ctx.q)) return 0; ret = v4l2_m2m_ioctl_decoder_cmd(file, fh, dc); if (ret < 0) return ret; if (dc->cmd == V4L2_DEC_CMD_STOP && v4l2_m2m_has_stopped(ctx->fh.m2m_ctx)) v4l2_event_queue_fh(&ctx->fh, &vicodec_eos_event); if (dc->cmd == V4L2_DEC_CMD_START && v4l2_m2m_has_stopped(ctx->fh.m2m_ctx)) vb2_clear_last_buffer_dequeued(&ctx->fh.m2m_ctx->cap_q_ctx.q); return 0; } static int vicodec_enum_framesizes(struct file *file, void *fh, struct v4l2_frmsizeenum *fsize) { switch (fsize->pixel_format) { case V4L2_PIX_FMT_FWHT_STATELESS: break; case V4L2_PIX_FMT_FWHT: break; default: if (find_fmt(fsize->pixel_format)->id == fsize->pixel_format) break; return -EINVAL; } if (fsize->index) return -EINVAL; fsize->type = V4L2_FRMSIZE_TYPE_STEPWISE; fsize->stepwise.min_width = MIN_WIDTH; fsize->stepwise.max_width = MAX_WIDTH; fsize->stepwise.step_width = 8; fsize->stepwise.min_height = MIN_HEIGHT; fsize->stepwise.max_height = MAX_HEIGHT; fsize->stepwise.step_height = 8; return 0; } static int vicodec_subscribe_event(struct v4l2_fh *fh, const struct v4l2_event_subscription *sub) { struct vicodec_ctx *ctx = container_of(fh, struct vicodec_ctx, fh); switch (sub->type) { case V4L2_EVENT_SOURCE_CHANGE: if (ctx->is_enc) return -EINVAL; fallthrough; case V4L2_EVENT_EOS: if (ctx->is_stateless) return -EINVAL; return v4l2_event_subscribe(fh, sub, 0, NULL); default: return v4l2_ctrl_subscribe_event(fh, sub); } } static const struct v4l2_ioctl_ops vicodec_ioctl_ops = { .vidioc_querycap = vidioc_querycap, .vidioc_enum_fmt_vid_cap = vidioc_enum_fmt_vid_cap, .vidioc_g_fmt_vid_cap = vidioc_g_fmt_vid_cap, .vidioc_try_fmt_vid_cap = vidioc_try_fmt_vid_cap, .vidioc_s_fmt_vid_cap = vidioc_s_fmt_vid_cap, .vidioc_g_fmt_vid_cap_mplane = vidioc_g_fmt_vid_cap, .vidioc_try_fmt_vid_cap_mplane = vidioc_try_fmt_vid_cap, .vidioc_s_fmt_vid_cap_mplane = vidioc_s_fmt_vid_cap, .vidioc_enum_fmt_vid_out = vidioc_enum_fmt_vid_out, .vidioc_g_fmt_vid_out = vidioc_g_fmt_vid_out, .vidioc_try_fmt_vid_out = vidioc_try_fmt_vid_out, .vidioc_s_fmt_vid_out = vidioc_s_fmt_vid_out, .vidioc_g_fmt_vid_out_mplane = vidioc_g_fmt_vid_out, .vidioc_try_fmt_vid_out_mplane = vidioc_try_fmt_vid_out, .vidioc_s_fmt_vid_out_mplane = vidioc_s_fmt_vid_out, .vidioc_reqbufs = v4l2_m2m_ioctl_reqbufs, .vidioc_querybuf = v4l2_m2m_ioctl_querybuf, .vidioc_qbuf = v4l2_m2m_ioctl_qbuf, .vidioc_dqbuf = v4l2_m2m_ioctl_dqbuf, .vidioc_prepare_buf = v4l2_m2m_ioctl_prepare_buf, .vidioc_create_bufs = v4l2_m2m_ioctl_create_bufs, .vidioc_expbuf = v4l2_m2m_ioctl_expbuf, .vidioc_remove_bufs = v4l2_m2m_ioctl_remove_bufs, .vidioc_streamon = v4l2_m2m_ioctl_streamon, .vidioc_streamoff = v4l2_m2m_ioctl_streamoff, .vidioc_g_selection = vidioc_g_selection, .vidioc_s_selection = vidioc_s_selection, .vidioc_try_encoder_cmd = v4l2_m2m_ioctl_try_encoder_cmd, .vidioc_encoder_cmd = vicodec_encoder_cmd, .vidioc_try_decoder_cmd = v4l2_m2m_ioctl_try_decoder_cmd, .vidioc_decoder_cmd = vicodec_decoder_cmd, .vidioc_enum_framesizes = vicodec_enum_framesizes, .vidioc_subscribe_event = vicodec_subscribe_event, .vidioc_unsubscribe_event = v4l2_event_unsubscribe, }; /* * Queue operations */ static int vicodec_queue_setup(struct vb2_queue *vq, unsigned int *nbuffers, unsigned int *nplanes, unsigned int sizes[], struct device *alloc_devs[]) { struct vicodec_ctx *ctx = vb2_get_drv_priv(vq); struct vicodec_q_data *q_data = get_q_data(ctx, vq->type); unsigned int size = q_data->sizeimage; if (*nplanes) return sizes[0] < size ? -EINVAL : 0; *nplanes = 1; sizes[0] = size; q_data->vb2_sizeimage = size; return 0; } static int vicodec_buf_out_validate(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); vbuf->field = V4L2_FIELD_NONE; return 0; } static int vicodec_buf_prepare(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vicodec_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue); struct vicodec_q_data *q_data; dprintk(ctx->dev, "type: %d\n", vb->vb2_queue->type); q_data = get_q_data(ctx, vb->vb2_queue->type); if (V4L2_TYPE_IS_OUTPUT(vb->vb2_queue->type)) { if (vbuf->field == V4L2_FIELD_ANY) vbuf->field = V4L2_FIELD_NONE; if (vbuf->field != V4L2_FIELD_NONE) { dprintk(ctx->dev, "%s field isn't supported\n", __func__); return -EINVAL; } } if (vb2_plane_size(vb, 0) < q_data->vb2_sizeimage) { dprintk(ctx->dev, "%s data will not fit into plane (%lu < %lu)\n", __func__, vb2_plane_size(vb, 0), (long)q_data->vb2_sizeimage); return -EINVAL; } return 0; } static void vicodec_buf_queue(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vicodec_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue); unsigned int sz = vb2_get_plane_payload(&vbuf->vb2_buf, 0); u8 *p_src = vb2_plane_vaddr(&vbuf->vb2_buf, 0); u8 *p = p_src; struct vb2_queue *vq_out = v4l2_m2m_get_vq(ctx->fh.m2m_ctx, V4L2_BUF_TYPE_VIDEO_OUTPUT); struct vb2_queue *vq_cap = v4l2_m2m_get_vq(ctx->fh.m2m_ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); bool header_valid = false; static const struct v4l2_event rs_event = { .type = V4L2_EVENT_SOURCE_CHANGE, .u.src_change.changes = V4L2_EVENT_SRC_CH_RESOLUTION, }; if (V4L2_TYPE_IS_CAPTURE(vb->vb2_queue->type) && vb2_is_streaming(vb->vb2_queue) && v4l2_m2m_dst_buf_is_last(ctx->fh.m2m_ctx)) { unsigned int i; for (i = 0; i < vb->num_planes; i++) vb2_set_plane_payload(vb, i, 0); vbuf->field = V4L2_FIELD_NONE; vbuf->sequence = get_q_data(ctx, vb->vb2_queue->type)->sequence++; v4l2_m2m_last_buffer_done(ctx->fh.m2m_ctx, vbuf); v4l2_event_queue_fh(&ctx->fh, &vicodec_eos_event); return; } /* buf_queue handles only the first source change event */ if (ctx->first_source_change_sent) { v4l2_m2m_buf_queue(ctx->fh.m2m_ctx, vbuf); return; } /* * if both queues are streaming, the source change event is * handled in job_ready */ if (vb2_is_streaming(vq_cap) && vb2_is_streaming(vq_out)) { v4l2_m2m_buf_queue(ctx->fh.m2m_ctx, vbuf); return; } /* * source change event is relevant only for the stateful decoder * in the compressed stream */ if (ctx->is_stateless || ctx->is_enc || V4L2_TYPE_IS_CAPTURE(vb->vb2_queue->type)) { v4l2_m2m_buf_queue(ctx->fh.m2m_ctx, vbuf); return; } do { enum vb2_buffer_state state = get_next_header(ctx, &p, p_src + sz - p); if (ctx->header_size < sizeof(struct fwht_cframe_hdr)) { v4l2_m2m_buf_done(vbuf, state); return; } header_valid = is_header_valid(&ctx->state.header); /* * p points right after the end of the header in the * buffer. If the header is invalid we set p to point * to the next byte after the start of the header */ if (!header_valid) { p = p - sizeof(struct fwht_cframe_hdr) + 1; if (p < p_src) p = p_src; ctx->header_size = 0; ctx->comp_magic_cnt = 0; } } while (!header_valid); ctx->cur_buf_offset = p - p_src; update_capture_data_from_header(ctx); ctx->first_source_change_sent = true; v4l2_event_queue_fh(&ctx->fh, &rs_event); v4l2_m2m_buf_queue(ctx->fh.m2m_ctx, vbuf); } static void vicodec_return_bufs(struct vb2_queue *q, u32 state) { struct vicodec_ctx *ctx = vb2_get_drv_priv(q); struct vb2_v4l2_buffer *vbuf; for (;;) { if (V4L2_TYPE_IS_OUTPUT(q->type)) vbuf = v4l2_m2m_src_buf_remove(ctx->fh.m2m_ctx); else vbuf = v4l2_m2m_dst_buf_remove(ctx->fh.m2m_ctx); if (vbuf == NULL) return; v4l2_ctrl_request_complete(vbuf->vb2_buf.req_obj.req, &ctx->hdl); spin_lock(ctx->lock); v4l2_m2m_buf_done(vbuf, state); spin_unlock(ctx->lock); } } static unsigned int total_frame_size(struct vicodec_q_data *q_data) { unsigned int size; unsigned int chroma_div; if (!q_data->info) { WARN_ON(1); return 0; } size = q_data->coded_width * q_data->coded_height; chroma_div = q_data->info->width_div * q_data->info->height_div; if (q_data->info->components_num == 4) return 2 * size + 2 * (size / chroma_div); else if (q_data->info->components_num == 3) return size + 2 * (size / chroma_div); return size; } static int vicodec_start_streaming(struct vb2_queue *q, unsigned int count) { struct vicodec_ctx *ctx = vb2_get_drv_priv(q); struct vicodec_q_data *q_data = get_q_data(ctx, q->type); struct v4l2_fwht_state *state = &ctx->state; const struct v4l2_fwht_pixfmt_info *info = q_data->info; unsigned int size = q_data->coded_width * q_data->coded_height; unsigned int chroma_div; unsigned int total_planes_size; u8 *new_comp_frame = NULL; chroma_div = info->width_div * info->height_div; q_data->sequence = 0; v4l2_m2m_update_start_streaming_state(ctx->fh.m2m_ctx, q); state->gop_cnt = 0; if ((V4L2_TYPE_IS_OUTPUT(q->type) && !ctx->is_enc) || (V4L2_TYPE_IS_CAPTURE(q->type) && ctx->is_enc)) return 0; if (info->id == V4L2_PIX_FMT_FWHT || info->id == V4L2_PIX_FMT_FWHT_STATELESS) { vicodec_return_bufs(q, VB2_BUF_STATE_QUEUED); return -EINVAL; } total_planes_size = total_frame_size(q_data); ctx->comp_max_size = total_planes_size; state->visible_width = q_data->visible_width; state->visible_height = q_data->visible_height; state->coded_width = q_data->coded_width; state->coded_height = q_data->coded_height; state->stride = q_data->coded_width * info->bytesperline_mult; if (ctx->is_stateless) { state->ref_stride = state->stride; return 0; } state->ref_stride = q_data->coded_width * info->luma_alpha_step; state->ref_frame.buf = kvmalloc(total_planes_size, GFP_KERNEL); state->ref_frame.luma = state->ref_frame.buf; new_comp_frame = kvmalloc(ctx->comp_max_size, GFP_KERNEL); if (!state->ref_frame.luma || !new_comp_frame) { kvfree(state->ref_frame.luma); kvfree(new_comp_frame); vicodec_return_bufs(q, VB2_BUF_STATE_QUEUED); return -ENOMEM; } /* * if state->compressed_frame was already allocated then * it contain data of the first frame of the new resolution */ if (state->compressed_frame) { if (ctx->comp_size > ctx->comp_max_size) ctx->comp_size = ctx->comp_max_size; memcpy(new_comp_frame, state->compressed_frame, ctx->comp_size); } kvfree(state->compressed_frame); state->compressed_frame = new_comp_frame; if (info->components_num < 3) { state->ref_frame.cb = NULL; state->ref_frame.cr = NULL; state->ref_frame.alpha = NULL; return 0; } state->ref_frame.cb = state->ref_frame.luma + size; state->ref_frame.cr = state->ref_frame.cb + size / chroma_div; if (info->components_num == 4) state->ref_frame.alpha = state->ref_frame.cr + size / chroma_div; else state->ref_frame.alpha = NULL; return 0; } static void vicodec_stop_streaming(struct vb2_queue *q) { struct vicodec_ctx *ctx = vb2_get_drv_priv(q); vicodec_return_bufs(q, VB2_BUF_STATE_ERROR); v4l2_m2m_update_stop_streaming_state(ctx->fh.m2m_ctx, q); if (V4L2_TYPE_IS_OUTPUT(q->type) && v4l2_m2m_has_stopped(ctx->fh.m2m_ctx)) v4l2_event_queue_fh(&ctx->fh, &vicodec_eos_event); if (!ctx->is_enc && V4L2_TYPE_IS_OUTPUT(q->type)) ctx->first_source_change_sent = false; if ((!V4L2_TYPE_IS_OUTPUT(q->type) && !ctx->is_enc) || (V4L2_TYPE_IS_OUTPUT(q->type) && ctx->is_enc)) { if (!ctx->is_stateless) kvfree(ctx->state.ref_frame.buf); ctx->state.ref_frame.buf = NULL; ctx->state.ref_frame.luma = NULL; ctx->comp_max_size = 0; ctx->source_changed = false; } if (V4L2_TYPE_IS_OUTPUT(q->type) && !ctx->is_enc) { ctx->cur_buf_offset = 0; ctx->comp_size = 0; ctx->header_size = 0; ctx->comp_magic_cnt = 0; ctx->comp_has_frame = false; ctx->comp_has_next_frame = false; } } static void vicodec_buf_request_complete(struct vb2_buffer *vb) { struct vicodec_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue); v4l2_ctrl_request_complete(vb->req_obj.req, &ctx->hdl); } static const struct vb2_ops vicodec_qops = { .queue_setup = vicodec_queue_setup, .buf_out_validate = vicodec_buf_out_validate, .buf_prepare = vicodec_buf_prepare, .buf_queue = vicodec_buf_queue, .buf_request_complete = vicodec_buf_request_complete, .start_streaming = vicodec_start_streaming, .stop_streaming = vicodec_stop_streaming, }; static int queue_init(void *priv, struct vb2_queue *src_vq, struct vb2_queue *dst_vq) { struct vicodec_ctx *ctx = priv; int ret; src_vq->type = (multiplanar ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE : V4L2_BUF_TYPE_VIDEO_OUTPUT); src_vq->io_modes = VB2_MMAP | VB2_USERPTR | VB2_DMABUF; src_vq->drv_priv = ctx; src_vq->buf_struct_size = sizeof(struct v4l2_m2m_buffer); src_vq->ops = &vicodec_qops; src_vq->mem_ops = &vb2_vmalloc_memops; src_vq->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_COPY; if (ctx->is_enc) { src_vq->lock = &ctx->dev->stateful_enc.mutex; src_vq->min_reqbufs_allocation = VICODEC_REC_BUFS; } else if (ctx->is_stateless) { src_vq->lock = &ctx->dev->stateless_dec.mutex; } else { src_vq->lock = &ctx->dev->stateful_dec.mutex; } src_vq->supports_requests = ctx->is_stateless; src_vq->requires_requests = ctx->is_stateless; ret = vb2_queue_init(src_vq); if (ret) return ret; dst_vq->type = (multiplanar ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE : V4L2_BUF_TYPE_VIDEO_CAPTURE); dst_vq->io_modes = VB2_MMAP | VB2_USERPTR | VB2_DMABUF; dst_vq->max_num_buffers = 64; dst_vq->drv_priv = ctx; dst_vq->buf_struct_size = sizeof(struct v4l2_m2m_buffer); dst_vq->ops = &vicodec_qops; dst_vq->mem_ops = &vb2_vmalloc_memops; dst_vq->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_COPY; dst_vq->lock = src_vq->lock; if (!ctx->is_stateless && !ctx->is_enc) dst_vq->min_reqbufs_allocation = VICODEC_REC_BUFS; return vb2_queue_init(dst_vq); } static int vicodec_try_ctrl(struct v4l2_ctrl *ctrl) { struct vicodec_ctx *ctx = container_of(ctrl->handler, struct vicodec_ctx, hdl); const struct v4l2_ctrl_fwht_params *params; struct vicodec_q_data *q_dst = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); switch (ctrl->id) { case V4L2_CID_STATELESS_FWHT_PARAMS: if (!q_dst->info) return -EINVAL; params = ctrl->p_new.p_fwht_params; if (params->width > q_dst->coded_width || params->width < MIN_WIDTH || params->height > q_dst->coded_height || params->height < MIN_HEIGHT) return -EINVAL; if (!validate_by_version(params->flags, params->version)) return -EINVAL; if (!validate_stateless_params_flags(params, q_dst->info)) return -EINVAL; return 0; default: return 0; } return 0; } static void update_header_from_stateless_params(struct vicodec_ctx *ctx, const struct v4l2_ctrl_fwht_params *params) { struct fwht_cframe_hdr *p_hdr = &ctx->state.header; p_hdr->magic1 = FWHT_MAGIC1; p_hdr->magic2 = FWHT_MAGIC2; p_hdr->version = htonl(params->version); p_hdr->width = htonl(params->width); p_hdr->height = htonl(params->height); p_hdr->flags = htonl(params->flags); p_hdr->colorspace = htonl(params->colorspace); p_hdr->xfer_func = htonl(params->xfer_func); p_hdr->ycbcr_enc = htonl(params->ycbcr_enc); p_hdr->quantization = htonl(params->quantization); } static int vicodec_s_ctrl(struct v4l2_ctrl *ctrl) { struct vicodec_ctx *ctx = container_of(ctrl->handler, struct vicodec_ctx, hdl); const struct v4l2_ctrl_fwht_params *params; switch (ctrl->id) { case V4L2_CID_MPEG_VIDEO_GOP_SIZE: ctx->state.gop_size = ctrl->val; return 0; case V4L2_CID_FWHT_I_FRAME_QP: ctx->state.i_frame_qp = ctrl->val; return 0; case V4L2_CID_FWHT_P_FRAME_QP: ctx->state.p_frame_qp = ctrl->val; return 0; case V4L2_CID_STATELESS_FWHT_PARAMS: params = ctrl->p_new.p_fwht_params; update_header_from_stateless_params(ctx, params); ctx->state.ref_frame_ts = params->backward_ref_ts; return 0; } return -EINVAL; } static const struct v4l2_ctrl_ops vicodec_ctrl_ops = { .s_ctrl = vicodec_s_ctrl, .try_ctrl = vicodec_try_ctrl, }; static const struct v4l2_ctrl_config vicodec_ctrl_stateless_state = { .ops = &vicodec_ctrl_ops, .id = V4L2_CID_STATELESS_FWHT_PARAMS, .elem_size = sizeof(struct v4l2_ctrl_fwht_params), }; /* * File operations */ static int vicodec_open(struct file *file) { const struct v4l2_fwht_pixfmt_info *info = v4l2_fwht_get_pixfmt(0); struct video_device *vfd = video_devdata(file); struct vicodec_dev *dev = video_drvdata(file); struct vicodec_ctx *ctx = NULL; struct v4l2_ctrl_handler *hdl; unsigned int raw_size; unsigned int comp_size; int rc = 0; if (mutex_lock_interruptible(vfd->lock)) return -ERESTARTSYS; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { rc = -ENOMEM; goto open_unlock; } if (vfd == &dev->stateful_enc.vfd) ctx->is_enc = true; else if (vfd == &dev->stateless_dec.vfd) ctx->is_stateless = true; v4l2_fh_init(&ctx->fh, video_devdata(file)); file->private_data = &ctx->fh; ctx->dev = dev; hdl = &ctx->hdl; v4l2_ctrl_handler_init(hdl, 5); v4l2_ctrl_new_std(hdl, &vicodec_ctrl_ops, V4L2_CID_MPEG_VIDEO_GOP_SIZE, 1, 16, 1, 10); v4l2_ctrl_new_std(hdl, &vicodec_ctrl_ops, V4L2_CID_FWHT_I_FRAME_QP, 1, 31, 1, 20); v4l2_ctrl_new_std(hdl, &vicodec_ctrl_ops, V4L2_CID_FWHT_P_FRAME_QP, 1, 31, 1, 20); if (ctx->is_stateless) v4l2_ctrl_new_custom(hdl, &vicodec_ctrl_stateless_state, NULL); else v4l2_ctrl_new_std(hdl, &vicodec_ctrl_ops, ctx->is_enc ? V4L2_CID_MIN_BUFFERS_FOR_OUTPUT : V4L2_CID_MIN_BUFFERS_FOR_CAPTURE, VICODEC_REC_BUFS, VICODEC_REC_BUFS, 1, VICODEC_REC_BUFS); if (hdl->error) { rc = hdl->error; v4l2_ctrl_handler_free(hdl); kfree(ctx); goto open_unlock; } ctx->fh.ctrl_handler = hdl; v4l2_ctrl_handler_setup(hdl); if (ctx->is_enc) ctx->q_data[V4L2_M2M_SRC].info = info; else if (ctx->is_stateless) ctx->q_data[V4L2_M2M_SRC].info = &pixfmt_stateless_fwht; else ctx->q_data[V4L2_M2M_SRC].info = &pixfmt_fwht; ctx->q_data[V4L2_M2M_SRC].coded_width = 1280; ctx->q_data[V4L2_M2M_SRC].coded_height = 720; ctx->q_data[V4L2_M2M_SRC].visible_width = 1280; ctx->q_data[V4L2_M2M_SRC].visible_height = 720; raw_size = 1280 * 720 * info->sizeimage_mult / info->sizeimage_div; comp_size = 1280 * 720 * pixfmt_fwht.sizeimage_mult / pixfmt_fwht.sizeimage_div; if (ctx->is_enc) ctx->q_data[V4L2_M2M_SRC].sizeimage = raw_size; else if (ctx->is_stateless) ctx->q_data[V4L2_M2M_SRC].sizeimage = comp_size; else ctx->q_data[V4L2_M2M_SRC].sizeimage = comp_size + sizeof(struct fwht_cframe_hdr); ctx->q_data[V4L2_M2M_DST] = ctx->q_data[V4L2_M2M_SRC]; if (ctx->is_enc) { ctx->q_data[V4L2_M2M_DST].info = &pixfmt_fwht; ctx->q_data[V4L2_M2M_DST].sizeimage = comp_size + sizeof(struct fwht_cframe_hdr); } else { ctx->q_data[V4L2_M2M_DST].info = info; ctx->q_data[V4L2_M2M_DST].sizeimage = raw_size; } ctx->state.colorspace = V4L2_COLORSPACE_REC709; if (ctx->is_enc) { ctx->fh.m2m_ctx = v4l2_m2m_ctx_init(dev->stateful_enc.m2m_dev, ctx, &queue_init); ctx->lock = &dev->stateful_enc.lock; } else if (ctx->is_stateless) { ctx->fh.m2m_ctx = v4l2_m2m_ctx_init(dev->stateless_dec.m2m_dev, ctx, &queue_init); ctx->lock = &dev->stateless_dec.lock; } else { ctx->fh.m2m_ctx = v4l2_m2m_ctx_init(dev->stateful_dec.m2m_dev, ctx, &queue_init); ctx->lock = &dev->stateful_dec.lock; } if (IS_ERR(ctx->fh.m2m_ctx)) { rc = PTR_ERR(ctx->fh.m2m_ctx); v4l2_ctrl_handler_free(hdl); v4l2_fh_exit(&ctx->fh); kfree(ctx); goto open_unlock; } v4l2_fh_add(&ctx->fh); open_unlock: mutex_unlock(vfd->lock); return rc; } static int vicodec_release(struct file *file) { struct video_device *vfd = video_devdata(file); struct vicodec_ctx *ctx = file2ctx(file); mutex_lock(vfd->lock); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); mutex_unlock(vfd->lock); v4l2_fh_del(&ctx->fh); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->hdl); kvfree(ctx->state.compressed_frame); kfree(ctx); return 0; } static int vicodec_request_validate(struct media_request *req) { struct media_request_object *obj; struct v4l2_ctrl_handler *parent_hdl, *hdl; struct vicodec_ctx *ctx = NULL; struct v4l2_ctrl *ctrl; unsigned int count; list_for_each_entry(obj, &req->objects, list) { struct vb2_buffer *vb; if (vb2_request_object_is_buffer(obj)) { vb = container_of(obj, struct vb2_buffer, req_obj); ctx = vb2_get_drv_priv(vb->vb2_queue); break; } } if (!ctx) { pr_err("No buffer was provided with the request\n"); return -ENOENT; } count = vb2_request_buffer_cnt(req); if (!count) { v4l2_info(&ctx->dev->v4l2_dev, "No buffer was provided with the request\n"); return -ENOENT; } else if (count > 1) { v4l2_info(&ctx->dev->v4l2_dev, "More than one buffer was provided with the request\n"); return -EINVAL; } parent_hdl = &ctx->hdl; hdl = v4l2_ctrl_request_hdl_find(req, parent_hdl); if (!hdl) { v4l2_info(&ctx->dev->v4l2_dev, "Missing codec control\n"); return -ENOENT; } ctrl = v4l2_ctrl_request_hdl_ctrl_find(hdl, vicodec_ctrl_stateless_state.id); v4l2_ctrl_request_hdl_put(hdl); if (!ctrl) { v4l2_info(&ctx->dev->v4l2_dev, "Missing required codec control\n"); return -ENOENT; } return vb2_request_validate(req); } static const struct v4l2_file_operations vicodec_fops = { .owner = THIS_MODULE, .open = vicodec_open, .release = vicodec_release, .poll = v4l2_m2m_fop_poll, .unlocked_ioctl = video_ioctl2, .mmap = v4l2_m2m_fop_mmap, }; static const struct video_device vicodec_videodev = { .name = VICODEC_NAME, .vfl_dir = VFL_DIR_M2M, .fops = &vicodec_fops, .ioctl_ops = &vicodec_ioctl_ops, .minor = -1, .release = video_device_release_empty, }; static const struct media_device_ops vicodec_m2m_media_ops = { .req_validate = vicodec_request_validate, .req_queue = v4l2_m2m_request_queue, }; static const struct v4l2_m2m_ops m2m_ops = { .device_run = device_run, .job_ready = job_ready, }; static int register_instance(struct vicodec_dev *dev, struct vicodec_dev_instance *dev_instance, const char *name, bool is_enc, bool is_stateless) { struct video_device *vfd; int ret; spin_lock_init(&dev_instance->lock); mutex_init(&dev_instance->mutex); dev_instance->m2m_dev = v4l2_m2m_init(&m2m_ops); if (IS_ERR(dev_instance->m2m_dev)) { v4l2_err(&dev->v4l2_dev, "Failed to init vicodec enc device\n"); return PTR_ERR(dev_instance->m2m_dev); } dev_instance->vfd = vicodec_videodev; vfd = &dev_instance->vfd; vfd->lock = &dev_instance->mutex; vfd->v4l2_dev = &dev->v4l2_dev; strscpy(vfd->name, name, sizeof(vfd->name)); vfd->device_caps = V4L2_CAP_STREAMING | (multiplanar ? V4L2_CAP_VIDEO_M2M_MPLANE : V4L2_CAP_VIDEO_M2M); if (is_enc || is_stateless) { v4l2_disable_ioctl(vfd, VIDIOC_DECODER_CMD); v4l2_disable_ioctl(vfd, VIDIOC_TRY_DECODER_CMD); } if (!is_enc) { v4l2_disable_ioctl(vfd, VIDIOC_ENCODER_CMD); v4l2_disable_ioctl(vfd, VIDIOC_TRY_ENCODER_CMD); } video_set_drvdata(vfd, dev); ret = video_register_device(vfd, VFL_TYPE_VIDEO, 0); if (ret) { v4l2_err(&dev->v4l2_dev, "Failed to register video device '%s'\n", name); v4l2_m2m_release(dev_instance->m2m_dev); return ret; } v4l2_info(&dev->v4l2_dev, "Device '%s' registered as /dev/video%d\n", name, vfd->num); return 0; } static void vicodec_v4l2_dev_release(struct v4l2_device *v4l2_dev) { struct vicodec_dev *dev = container_of(v4l2_dev, struct vicodec_dev, v4l2_dev); v4l2_device_unregister(&dev->v4l2_dev); v4l2_m2m_release(dev->stateful_enc.m2m_dev); v4l2_m2m_release(dev->stateful_dec.m2m_dev); v4l2_m2m_release(dev->stateless_dec.m2m_dev); #ifdef CONFIG_MEDIA_CONTROLLER media_device_cleanup(&dev->mdev); #endif kfree(dev); } static int vicodec_probe(struct platform_device *pdev) { struct vicodec_dev *dev; int ret; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; ret = v4l2_device_register(&pdev->dev, &dev->v4l2_dev); if (ret) goto free_dev; dev->v4l2_dev.release = vicodec_v4l2_dev_release; #ifdef CONFIG_MEDIA_CONTROLLER dev->mdev.dev = &pdev->dev; strscpy(dev->mdev.model, "vicodec", sizeof(dev->mdev.model)); strscpy(dev->mdev.bus_info, "platform:vicodec", sizeof(dev->mdev.bus_info)); media_device_init(&dev->mdev); dev->mdev.ops = &vicodec_m2m_media_ops; dev->v4l2_dev.mdev = &dev->mdev; #endif platform_set_drvdata(pdev, dev); ret = register_instance(dev, &dev->stateful_enc, "stateful-encoder", true, false); if (ret) goto unreg_dev; ret = register_instance(dev, &dev->stateful_dec, "stateful-decoder", false, false); if (ret) goto unreg_sf_enc; ret = register_instance(dev, &dev->stateless_dec, "stateless-decoder", false, true); if (ret) goto unreg_sf_dec; #ifdef CONFIG_MEDIA_CONTROLLER ret = v4l2_m2m_register_media_controller(dev->stateful_enc.m2m_dev, &dev->stateful_enc.vfd, MEDIA_ENT_F_PROC_VIDEO_ENCODER); if (ret) { v4l2_err(&dev->v4l2_dev, "Failed to init mem2mem media controller for enc\n"); goto unreg_m2m; } ret = v4l2_m2m_register_media_controller(dev->stateful_dec.m2m_dev, &dev->stateful_dec.vfd, MEDIA_ENT_F_PROC_VIDEO_DECODER); if (ret) { v4l2_err(&dev->v4l2_dev, "Failed to init mem2mem media controller for dec\n"); goto unreg_m2m_sf_enc_mc; } ret = v4l2_m2m_register_media_controller(dev->stateless_dec.m2m_dev, &dev->stateless_dec.vfd, MEDIA_ENT_F_PROC_VIDEO_DECODER); if (ret) { v4l2_err(&dev->v4l2_dev, "Failed to init mem2mem media controller for stateless dec\n"); goto unreg_m2m_sf_dec_mc; } ret = media_device_register(&dev->mdev); if (ret) { v4l2_err(&dev->v4l2_dev, "Failed to register mem2mem media device\n"); goto unreg_m2m_sl_dec_mc; } #endif return 0; #ifdef CONFIG_MEDIA_CONTROLLER unreg_m2m_sl_dec_mc: v4l2_m2m_unregister_media_controller(dev->stateless_dec.m2m_dev); unreg_m2m_sf_dec_mc: v4l2_m2m_unregister_media_controller(dev->stateful_dec.m2m_dev); unreg_m2m_sf_enc_mc: v4l2_m2m_unregister_media_controller(dev->stateful_enc.m2m_dev); unreg_m2m: video_unregister_device(&dev->stateless_dec.vfd); v4l2_m2m_release(dev->stateless_dec.m2m_dev); #endif unreg_sf_dec: video_unregister_device(&dev->stateful_dec.vfd); v4l2_m2m_release(dev->stateful_dec.m2m_dev); unreg_sf_enc: video_unregister_device(&dev->stateful_enc.vfd); v4l2_m2m_release(dev->stateful_enc.m2m_dev); unreg_dev: v4l2_device_unregister(&dev->v4l2_dev); free_dev: kfree(dev); return ret; } static void vicodec_remove(struct platform_device *pdev) { struct vicodec_dev *dev = platform_get_drvdata(pdev); v4l2_info(&dev->v4l2_dev, "Removing " VICODEC_NAME); #ifdef CONFIG_MEDIA_CONTROLLER media_device_unregister(&dev->mdev); v4l2_m2m_unregister_media_controller(dev->stateful_enc.m2m_dev); v4l2_m2m_unregister_media_controller(dev->stateful_dec.m2m_dev); v4l2_m2m_unregister_media_controller(dev->stateless_dec.m2m_dev); #endif video_unregister_device(&dev->stateful_enc.vfd); video_unregister_device(&dev->stateful_dec.vfd); video_unregister_device(&dev->stateless_dec.vfd); v4l2_device_put(&dev->v4l2_dev); } static struct platform_driver vicodec_pdrv = { .probe = vicodec_probe, .remove = vicodec_remove, .driver = { .name = VICODEC_NAME, }, }; static void __exit vicodec_exit(void) { platform_driver_unregister(&vicodec_pdrv); platform_device_unregister(&vicodec_pdev); } static int __init vicodec_init(void) { int ret; ret = platform_device_register(&vicodec_pdev); if (ret) return ret; ret = platform_driver_register(&vicodec_pdrv); if (ret) platform_device_unregister(&vicodec_pdev); return ret; } module_init(vicodec_init); module_exit(vicodec_exit);
29 29 29 1 1 13 15 1 15 11 1 2 3 20 6 6 6 4 5 3 13 13 13 13 1 11 11 251 250 239 10 18 15 18 26 18 20 20 19 2 18 18 18 18 14 14 14 2 13 13 1 14 376 376 3 3 1 165 2 265 166 101 1 1 262 7 2 5 5 5 5 7 5 5 7 7 5 5 7 5 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 // SPDX-License-Identifier: GPL-2.0-or-later /* * Anycast support for IPv6 * Linux INET6 implementation * * Authors: * David L Stevens (dlstevens@us.ibm.com) * * based heavily on net/ipv6/mcast.c */ #include <linux/capability.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/random.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/route.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/if_inet6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/checksum.h> #define IN6_ADDR_HSIZE_SHIFT 8 #define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT) /* anycast address hash table */ static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE]; static DEFINE_SPINLOCK(acaddr_hash_lock); static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); static u32 inet6_acaddr_hash(const struct net *net, const struct in6_addr *addr) { u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net)); return hash_32(val, IN6_ADDR_HSIZE_SHIFT); } /* * socket join an anycast group */ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct net_device *dev = NULL; struct inet6_dev *idev; struct ipv6_ac_socklist *pac; struct net *net = sock_net(sk); int ishost = !net->ipv6.devconf_all->forwarding; int err = 0; ASSERT_RTNL(); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (ipv6_addr_is_multicast(addr)) return -EINVAL; if (ifindex) dev = __dev_get_by_index(net, ifindex); if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) return -EINVAL; pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); if (!pac) return -ENOMEM; pac->acl_next = NULL; pac->acl_addr = *addr; if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); } else if (ishost) { err = -EADDRNOTAVAIL; goto error; } else { /* router, no matching interface: just pick one */ dev = __dev_get_by_flags(net, IFF_UP, IFF_UP | IFF_LOOPBACK); } } if (!dev) { err = -ENODEV; goto error; } idev = __in6_dev_get(dev); if (!idev) { if (ifindex) err = -ENODEV; else err = -EADDRNOTAVAIL; goto error; } /* reset ishost, now that we have a specific device */ ishost = !idev->cnf.forwarding; pac->acl_ifindex = dev->ifindex; /* XXX * For hosts, allow link-local or matching prefix anycasts. * This obviates the need for propagating anycast routes while * still allowing some non-router anycast participation. */ if (!ipv6_chk_prefix(addr, dev)) { if (ishost) err = -EADDRNOTAVAIL; if (err) goto error; } err = __ipv6_dev_ac_inc(idev, addr); if (!err) { pac->acl_next = np->ipv6_ac_list; np->ipv6_ac_list = pac; pac = NULL; } error: if (pac) sock_kfree_s(sk, pac, sizeof(*pac)); return err; } /* * socket leave an anycast group */ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct net_device *dev; struct ipv6_ac_socklist *pac, *prev_pac; struct net *net = sock_net(sk); ASSERT_RTNL(); prev_pac = NULL; for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { if ((ifindex == 0 || pac->acl_ifindex == ifindex) && ipv6_addr_equal(&pac->acl_addr, addr)) break; prev_pac = pac; } if (!pac) return -ENOENT; if (prev_pac) prev_pac->acl_next = pac->acl_next; else np->ipv6_ac_list = pac->acl_next; dev = __dev_get_by_index(net, pac->acl_ifindex); if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); sock_kfree_s(sk, pac, sizeof(*pac)); return 0; } void __ipv6_sock_ac_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct net_device *dev = NULL; struct ipv6_ac_socklist *pac; struct net *net = sock_net(sk); int prev_index; ASSERT_RTNL(); pac = np->ipv6_ac_list; np->ipv6_ac_list = NULL; prev_index = 0; while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if (pac->acl_ifindex != prev_index) { dev = __dev_get_by_index(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } } void ipv6_sock_ac_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); if (!np->ipv6_ac_list) return; rtnl_lock(); __ipv6_sock_ac_close(sk); rtnl_unlock(); } static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca) { unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr); spin_lock(&acaddr_hash_lock); hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]); spin_unlock(&acaddr_hash_lock); } static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca) { spin_lock(&acaddr_hash_lock); hlist_del_init_rcu(&aca->aca_addr_lst); spin_unlock(&acaddr_hash_lock); } static void aca_get(struct ifacaddr6 *aca) { refcount_inc(&aca->aca_refcnt); } static void aca_free_rcu(struct rcu_head *h) { struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu); fib6_info_release(aca->aca_rt); kfree(aca); } static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) call_rcu_hurry(&ac->rcu, aca_free_rcu); } static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, const struct in6_addr *addr) { struct ifacaddr6 *aca; aca = kzalloc(sizeof(*aca), GFP_ATOMIC); if (!aca) return NULL; aca->aca_addr = *addr; fib6_info_hold(f6i); aca->aca_rt = f6i; INIT_HLIST_NODE(&aca->aca_addr_lst); aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; refcount_set(&aca->aca_refcnt, 1); return aca; } /* * device anycast group inc (add if not found) */ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; struct fib6_info *f6i; struct net *net; int err; ASSERT_RTNL(); write_lock_bh(&idev->lock); if (idev->dead) { err = -ENODEV; goto out; } for (aca = rtnl_dereference(idev->ac_list); aca; aca = rtnl_dereference(aca->aca_next)) { if (ipv6_addr_equal(&aca->aca_addr, addr)) { aca->aca_users++; err = 0; goto out; } } net = dev_net(idev->dev); f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC, NULL); if (IS_ERR(f6i)) { err = PTR_ERR(f6i); goto out; } aca = aca_alloc(f6i, addr); if (!aca) { fib6_info_release(f6i); err = -ENOMEM; goto out; } /* Hold this for addrconf_join_solict() below before we unlock, * it is already exposed via idev->ac_list. */ aca_get(aca); aca->aca_next = idev->ac_list; rcu_assign_pointer(idev->ac_list, aca); write_unlock_bh(&idev->lock); ipv6_add_acaddr_hash(net, aca); ip6_ins_rt(net, f6i); addrconf_join_solict(idev->dev, &aca->aca_addr); aca_put(aca); return 0; out: write_unlock_bh(&idev->lock); return err; } /* * device anycast group decrement */ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca, *prev_aca; ASSERT_RTNL(); write_lock_bh(&idev->lock); prev_aca = NULL; for (aca = rtnl_dereference(idev->ac_list); aca; aca = rtnl_dereference(aca->aca_next)) { if (ipv6_addr_equal(&aca->aca_addr, addr)) break; prev_aca = aca; } if (!aca) { write_unlock_bh(&idev->lock); return -ENOENT; } if (--aca->aca_users > 0) { write_unlock_bh(&idev->lock); return 0; } if (prev_aca) rcu_assign_pointer(prev_aca->aca_next, aca->aca_next); else rcu_assign_pointer(idev->ac_list, aca->aca_next); write_unlock_bh(&idev->lock); ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); return 0; } /* called with rtnl_lock() */ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev = __in6_dev_get(dev); if (!idev) return -ENODEV; return __ipv6_dev_ac_dec(idev, addr); } void ipv6_ac_destroy_dev(struct inet6_dev *idev) { struct ifacaddr6 *aca; write_lock_bh(&idev->lock); while ((aca = rtnl_dereference(idev->ac_list)) != NULL) { rcu_assign_pointer(idev->ac_list, aca->aca_next); write_unlock_bh(&idev->lock); ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); write_lock_bh(&idev->lock); } write_unlock_bh(&idev->lock); } /* * check if the interface has this anycast address * called with rcu_read_lock() */ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev; struct ifacaddr6 *aca; idev = __in6_dev_get(dev); if (idev) { for (aca = rcu_dereference(idev->ac_list); aca; aca = rcu_dereference(aca->aca_next)) if (ipv6_addr_equal(&aca->aca_addr, addr)) break; return aca != NULL; } return false; } /* * check if given interface (or any, if dev==0) has this anycast address */ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr) { struct net_device *nh_dev; struct ifacaddr6 *aca; bool found = false; rcu_read_lock(); if (dev) found = ipv6_chk_acast_dev(dev, addr); else { unsigned int hash = inet6_acaddr_hash(net, addr); hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash], aca_addr_lst) { nh_dev = fib6_info_nh_dev(aca->aca_rt); if (!nh_dev || !net_eq(dev_net(nh_dev), net)) continue; if (ipv6_addr_equal(&aca->aca_addr, addr)) { found = true; break; } } } rcu_read_unlock(); return found; } /* check if this anycast address is link-local on given interface or * is global */ bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, const struct in6_addr *addr) { return ipv6_chk_acast_addr(net, (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL ? dev : NULL), addr); } #ifdef CONFIG_PROC_FS struct ac6_iter_state { struct seq_net_private p; struct net_device *dev; }; #define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private) static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) { struct ac6_iter_state *state = ac6_seq_private(seq); struct net *net = seq_file_net(seq); struct ifacaddr6 *im = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; idev = __in6_dev_get(state->dev); if (!idev) continue; im = rcu_dereference(idev->ac_list); if (im) break; } return im; } static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im) { struct ac6_iter_state *state = ac6_seq_private(seq); struct inet6_dev *idev; im = rcu_dereference(im->aca_next); while (!im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) break; idev = __in6_dev_get(state->dev); if (!idev) continue; im = rcu_dereference(idev->ac_list); } return im; } static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos) { struct ifacaddr6 *im = ac6_get_first(seq); if (im) while (pos && (im = ac6_get_next(seq, im)) != NULL) --pos; return pos ? NULL : im; } static void *ac6_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return ac6_get_idx(seq, *pos); } static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ifacaddr6 *im = ac6_get_next(seq, v); ++*pos; return im; } static void ac6_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ac6_seq_show(struct seq_file *seq, void *v) { struct ifacaddr6 *im = (struct ifacaddr6 *)v; struct ac6_iter_state *state = ac6_seq_private(seq); seq_printf(seq, "%-4d %-15s %pi6 %5d\n", state->dev->ifindex, state->dev->name, &im->aca_addr, im->aca_users); return 0; } static const struct seq_operations ac6_seq_ops = { .start = ac6_seq_start, .next = ac6_seq_next, .stop = ac6_seq_stop, .show = ac6_seq_show, }; int __net_init ac6_proc_init(struct net *net) { if (!proc_create_net("anycast6", 0444, net->proc_net, &ac6_seq_ops, sizeof(struct ac6_iter_state))) return -ENOMEM; return 0; } void ac6_proc_exit(struct net *net) { remove_proc_entry("anycast6", net->proc_net); } #endif /* Init / cleanup code */ int __init ipv6_anycast_init(void) { int i; for (i = 0; i < IN6_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&inet6_acaddr_lst[i]); return 0; } void ipv6_anycast_cleanup(void) { int i; spin_lock(&acaddr_hash_lock); for (i = 0; i < IN6_ADDR_HSIZE; i++) WARN_ON(!hlist_empty(&inet6_acaddr_lst[i])); spin_unlock(&acaddr_hash_lock); }
18 20 2 18 18 17 20 1 1 18 18 18 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 // SPDX-License-Identifier: GPL-2.0-only /* * net/dccp/qpolicy.c * * Policy-based packet dequeueing interface for DCCP. * * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net> */ #include "dccp.h" /* * Simple Dequeueing Policy: * If tx_qlen is different from 0, enqueue up to tx_qlen elements. */ static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) { skb_queue_tail(&sk->sk_write_queue, skb); } static bool qpolicy_simple_full(struct sock *sk) { return dccp_sk(sk)->dccps_tx_qlen && sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; } static struct sk_buff *qpolicy_simple_top(struct sock *sk) { return skb_peek(&sk->sk_write_queue); } /* * Priority-based Dequeueing Policy: * If tx_qlen is different from 0 and the queue has reached its upper bound * of tx_qlen elements, replace older packets lowest-priority-first. */ static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) { struct sk_buff *skb, *best = NULL; skb_queue_walk(&sk->sk_write_queue, skb) if (best == NULL || skb->priority > best->priority) best = skb; return best; } static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) { struct sk_buff *skb, *worst = NULL; skb_queue_walk(&sk->sk_write_queue, skb) if (worst == NULL || skb->priority < worst->priority) worst = skb; return worst; } static bool qpolicy_prio_full(struct sock *sk) { if (qpolicy_simple_full(sk)) dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); return false; } /** * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface * @push: add a new @skb to the write queue * @full: indicates that no more packets will be admitted * @top: peeks at whatever the queueing policy defines as its `top' * @params: parameter passed to policy operation */ struct dccp_qpolicy_operations { void (*push) (struct sock *sk, struct sk_buff *skb); bool (*full) (struct sock *sk); struct sk_buff* (*top) (struct sock *sk); __be32 params; }; static struct dccp_qpolicy_operations qpol_table[DCCPQ_POLICY_MAX] = { [DCCPQ_POLICY_SIMPLE] = { .push = qpolicy_simple_push, .full = qpolicy_simple_full, .top = qpolicy_simple_top, .params = 0, }, [DCCPQ_POLICY_PRIO] = { .push = qpolicy_simple_push, .full = qpolicy_prio_full, .top = qpolicy_prio_best_skb, .params = DCCP_SCM_PRIORITY, }, }; /* * Externally visible interface */ void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) { qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); } bool dccp_qpolicy_full(struct sock *sk) { return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); } void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) { if (skb != NULL) { skb_unlink(skb, &sk->sk_write_queue); kfree_skb(skb); } } struct sk_buff *dccp_qpolicy_top(struct sock *sk) { return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); } struct sk_buff *dccp_qpolicy_pop(struct sock *sk) { struct sk_buff *skb = dccp_qpolicy_top(sk); if (skb != NULL) { /* Clear any skb fields that we used internally */ skb->priority = 0; skb_unlink(skb, &sk->sk_write_queue); } return skb; } bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) { /* check if exactly one bit is set */ if (!param || (param & (param - 1))) return false; return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; }
41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/lockd/svc.c * * This is the central lockd service. * * FIXME: Separate the lockd NFS server functionality from the lockd NFS * client functionality. Oh why didn't Sun create two separate * services in the first place? * * Authors: Olaf Kirch (okir@monad.swb.de) * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/moduleparam.h> #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/uio.h> #include <linux/smp.h> #include <linux/mutex.h> #include <linux/freezer.h> #include <linux/inetdevice.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/svc_xprt.h> #include <net/ip.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <linux/lockd/lockd.h> #include <linux/nfs.h> #include "netns.h" #include "procfs.h" #define NLMDBG_FACILITY NLMDBG_SVC #define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) static struct svc_program nlmsvc_program; const struct nlmsvc_binding *nlmsvc_ops; EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct svc_serv *nlmsvc_serv; static void nlmsvc_request_retry(struct timer_list *tl) { svc_wake_up(nlmsvc_serv); } DEFINE_TIMER(nlmsvc_retry, nlmsvc_request_retry); unsigned int lockd_net_id; /* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 */ static unsigned long nlm_grace_period; unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; static int nlm_udpport, nlm_tcpport; /* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */ static unsigned int nlm_max_connections = 1024; /* * Constants needed for the sysctl interface. */ static const unsigned long nlm_grace_period_min = 0; static const unsigned long nlm_grace_period_max = 240; static const unsigned long nlm_timeout_min = 3; static const unsigned long nlm_timeout_max = 20; #ifdef CONFIG_SYSCTL static const int nlm_port_min = 0, nlm_port_max = 65535; static struct ctl_table_header * nlm_sysctl_table; #endif static unsigned long get_lockd_grace_period(void) { /* Note: nlm_timeout should always be nonzero */ if (nlm_grace_period) return roundup(nlm_grace_period, nlm_timeout) * HZ; else return nlm_timeout * 5 * HZ; } static void grace_ender(struct work_struct *grace) { struct delayed_work *dwork = to_delayed_work(grace); struct lockd_net *ln = container_of(dwork, struct lockd_net, grace_period_end); locks_end_grace(&ln->lockd_manager); } static void set_grace_period(struct net *net) { unsigned long grace_period = get_lockd_grace_period(); struct lockd_net *ln = net_generic(net, lockd_net_id); locks_start_grace(net, &ln->lockd_manager); cancel_delayed_work_sync(&ln->grace_period_end); schedule_delayed_work(&ln->grace_period_end, grace_period); } /* * This is the lockd kernel thread */ static int lockd(void *vrqstp) { struct svc_rqst *rqstp = vrqstp; struct net *net = &init_net; struct lockd_net *ln = net_generic(net, lockd_net_id); svc_thread_init_status(rqstp, 0); /* try_to_freeze() is called from svc_recv() */ set_freezable(); dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); /* * The main request loop. We don't terminate until the last * NFS mount or NFS daemon has gone away. */ while (!svc_thread_should_stop(rqstp)) { /* update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nlm_max_connections; nlmsvc_retry_blocked(rqstp); svc_recv(rqstp); } if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); dprintk("lockd_down: service stopped\n"); svc_exit_thread(rqstp); return 0; } static int create_lockd_listener(struct svc_serv *serv, const char *name, struct net *net, const int family, const unsigned short port, const struct cred *cred) { struct svc_xprt *xprt; xprt = svc_find_xprt(serv, name, net, family, 0); if (xprt == NULL) return svc_xprt_create(serv, name, net, family, port, SVC_SOCK_DEFAULTS, cred); svc_xprt_put(xprt); return 0; } static int create_lockd_family(struct svc_serv *serv, struct net *net, const int family, const struct cred *cred) { int err; err = create_lockd_listener(serv, "udp", net, family, nlm_udpport, cred); if (err < 0) return err; return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport, cred); } /* * Ensure there are active UDP and TCP listeners for lockd. * * Even if we have only TCP NFS mounts and/or TCP NFSDs, some * local services (such as rpc.statd) still require UDP, and * some NFS servers do not yet support NLM over TCP. * * Returns zero if all listeners are available; otherwise a * negative errno value is returned. */ static int make_socks(struct svc_serv *serv, struct net *net, const struct cred *cred) { static int warned; int err; err = create_lockd_family(serv, net, PF_INET, cred); if (err < 0) goto out_err; err = create_lockd_family(serv, net, PF_INET6, cred); if (err < 0 && err != -EAFNOSUPPORT) goto out_err; warned = 0; return 0; out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); svc_xprt_destroy_all(serv, net); svc_rpcb_cleanup(serv, net); return err; } static int lockd_up_net(struct svc_serv *serv, struct net *net, const struct cred *cred) { struct lockd_net *ln = net_generic(net, lockd_net_id); int error; if (ln->nlmsvc_users++) return 0; error = svc_bind(serv, net); if (error) goto err_bind; error = make_socks(serv, net, cred); if (error < 0) goto err_bind; set_grace_period(net); dprintk("%s: per-net data created; net=%x\n", __func__, net->ns.inum); return 0; err_bind: ln->nlmsvc_users--; return error; } static void lockd_down_net(struct svc_serv *serv, struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); svc_xprt_destroy_all(serv, net); svc_rpcb_cleanup(serv, net); } } else { pr_err("%s: no users! net=%x\n", __func__, net->ns.inum); BUG(); } } static int lockd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sockaddr_in sin; if (event != NETDEV_DOWN) goto out; if (nlmsvc_serv) { dprintk("lockd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin); } out: return NOTIFY_DONE; } static struct notifier_block lockd_inetaddr_notifier = { .notifier_call = lockd_inetaddr_event, }; #if IS_ENABLED(CONFIG_IPV6) static int lockd_inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sockaddr_in6 sin6; if (event != NETDEV_DOWN) goto out; if (nlmsvc_serv) { dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin6); } out: return NOTIFY_DONE; } static struct notifier_block lockd_inet6addr_notifier = { .notifier_call = lockd_inet6addr_event, }; #endif static int lockd_get(void) { struct svc_serv *serv; int error; if (nlmsvc_serv) { nlmsvc_users++; return 0; } /* * Sanity check: if there's no pid, * we should be the first user ... */ if (nlmsvc_users) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); return -ENOMEM; } serv->sv_maxconn = nlm_max_connections; error = svc_set_num_threads(serv, NULL, 1); if (error < 0) { svc_destroy(&serv); return error; } nlmsvc_serv = serv; register_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&lockd_inet6addr_notifier); #endif dprintk("lockd_up: service created\n"); nlmsvc_users++; return 0; } static void lockd_put(void) { if (WARN(nlmsvc_users <= 0, "lockd_down: no users!\n")) return; if (--nlmsvc_users) return; unregister_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif svc_set_num_threads(nlmsvc_serv, NULL, 0); timer_delete_sync(&nlmsvc_retry); svc_destroy(&nlmsvc_serv); dprintk("lockd_down: service destroyed\n"); } /* * Bring up the lockd process if it's not already up. */ int lockd_up(struct net *net, const struct cred *cred) { int error; mutex_lock(&nlmsvc_mutex); error = lockd_get(); if (error) goto err; error = lockd_up_net(nlmsvc_serv, net, cred); if (error < 0) { lockd_put(); goto err; } err: mutex_unlock(&nlmsvc_mutex); return error; } EXPORT_SYMBOL_GPL(lockd_up); /* * Decrement the user count and bring down lockd if we're the last. */ void lockd_down(struct net *net) { mutex_lock(&nlmsvc_mutex); lockd_down_net(nlmsvc_serv, net); lockd_put(); mutex_unlock(&nlmsvc_mutex); } EXPORT_SYMBOL_GPL(lockd_down); #ifdef CONFIG_SYSCTL /* * Sysctl parameters (same as module parameters, different interface). */ static struct ctl_table nlm_sysctls[] = { { .procname = "nlm_grace_period", .data = &nlm_grace_period, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = (unsigned long *) &nlm_grace_period_min, .extra2 = (unsigned long *) &nlm_grace_period_max, }, { .procname = "nlm_timeout", .data = &nlm_timeout, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = (unsigned long *) &nlm_timeout_min, .extra2 = (unsigned long *) &nlm_timeout_max, }, { .procname = "nlm_udpport", .data = &nlm_udpport, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (int *) &nlm_port_min, .extra2 = (int *) &nlm_port_max, }, { .procname = "nlm_tcpport", .data = &nlm_tcpport, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (int *) &nlm_port_min, .extra2 = (int *) &nlm_port_max, }, { .procname = "nsm_use_hostnames", .data = &nsm_use_hostnames, .maxlen = sizeof(bool), .mode = 0644, .proc_handler = proc_dobool, }, { .procname = "nsm_local_state", .data = &nsm_local_state, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; #endif /* CONFIG_SYSCTL */ /* * Module (and sysfs) parameters. */ #define param_set_min_max(name, type, which_strtol, min, max) \ static int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ char *endp; \ __typeof__(type) num = which_strtol(val, &endp, 0); \ if (endp == val || *endp || num < (min) || num > (max)) \ return -EINVAL; \ *((type *) kp->arg) = num; \ return 0; \ } static inline int is_callback(u32 proc) { return proc == NLMPROC_GRANTED || proc == NLMPROC_GRANTED_MSG || proc == NLMPROC_TEST_RES || proc == NLMPROC_LOCK_RES || proc == NLMPROC_CANCEL_RES || proc == NLMPROC_UNLOCK_RES || proc == NLMPROC_NSM_NOTIFY; } static enum svc_auth_status lockd_authenticate(struct svc_rqst *rqstp) { rqstp->rq_client = NULL; switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: case RPC_AUTH_UNIX: rqstp->rq_auth_stat = rpc_auth_ok; if (rqstp->rq_proc == 0) return SVC_OK; if (is_callback(rqstp->rq_proc)) { /* Leave it to individual procedures to * call nlmsvc_lookup_host(rqstp) */ return SVC_OK; } return svc_set_client(rqstp); } rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } param_set_min_max(port, int, simple_strtol, 0, 65535) param_set_min_max(grace_period, unsigned long, simple_strtoul, nlm_grace_period_min, nlm_grace_period_max) param_set_min_max(timeout, unsigned long, simple_strtoul, nlm_timeout_min, nlm_timeout_max) MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); MODULE_DESCRIPTION("NFS file locking service version " LOCKD_VERSION "."); MODULE_LICENSE("GPL"); module_param_call(nlm_grace_period, param_set_grace_period, param_get_ulong, &nlm_grace_period, 0644); module_param_call(nlm_timeout, param_set_timeout, param_get_ulong, &nlm_timeout, 0644); module_param_call(nlm_udpport, param_set_port, param_get_int, &nlm_udpport, 0644); module_param_call(nlm_tcpport, param_set_port, param_get_int, &nlm_tcpport, 0644); module_param(nsm_use_hostnames, bool, 0644); module_param(nlm_max_connections, uint, 0644); static int lockd_init_net(struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); INIT_LIST_HEAD(&ln->lockd_manager.list); ln->lockd_manager.block_opens = false; INIT_LIST_HEAD(&ln->nsm_handles); return 0; } static void lockd_exit_net(struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); WARN_ONCE(!list_empty(&ln->lockd_manager.list), "net %x %s: lockd_manager.list is not empty\n", net->ns.inum, __func__); WARN_ONCE(!list_empty(&ln->nsm_handles), "net %x %s: nsm_handles list is not empty\n", net->ns.inum, __func__); WARN_ONCE(delayed_work_pending(&ln->grace_period_end), "net %x %s: grace_period_end was not cancelled\n", net->ns.inum, __func__); } static struct pernet_operations lockd_net_ops = { .init = lockd_init_net, .exit = lockd_exit_net, .id = &lockd_net_id, .size = sizeof(struct lockd_net), }; /* * Initialising and terminating the module. */ static int __init init_nlm(void) { int err; #ifdef CONFIG_SYSCTL err = -ENOMEM; nlm_sysctl_table = register_sysctl("fs/nfs", nlm_sysctls); if (nlm_sysctl_table == NULL) goto err_sysctl; #endif err = register_pernet_subsys(&lockd_net_ops); if (err) goto err_pernet; err = lockd_create_procfs(); if (err) goto err_procfs; return 0; err_procfs: unregister_pernet_subsys(&lockd_net_ops); err_pernet: #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); err_sysctl: #endif return err; } static void __exit exit_nlm(void) { /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); lockd_remove_procfs(); unregister_pernet_subsys(&lockd_net_ops); #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); #endif } module_init(init_nlm); module_exit(exit_nlm); /** * nlmsvc_dispatch - Process an NLM Request * @rqstp: incoming request * * Return values: * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ static int nlmsvc_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *procp = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; if (!procp->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; *statp = procp->pc_func(rqstp); if (*statp == rpc_drop_reply) return 0; if (*statp != rpc_success) return 1; if (!procp->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; return 1; out_decode_err: *statp = rpc_garbage_args; return 1; out_encode_err: *statp = rpc_system_err; return 1; } /* * Define NLM program and procedures */ static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version1_count[17]); static const struct svc_version nlmsvc_version1 = { .vs_vers = 1, .vs_nproc = 17, .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version1_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version3_count[ARRAY_SIZE(nlmsvc_procedures)]); static const struct svc_version nlmsvc_version3 = { .vs_vers = 3, .vs_nproc = ARRAY_SIZE(nlmsvc_procedures), .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version3_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #ifdef CONFIG_LOCKD_V4 static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version4_count[ARRAY_SIZE(nlmsvc_procedures4)]); static const struct svc_version nlmsvc_version4 = { .vs_vers = 4, .vs_nproc = ARRAY_SIZE(nlmsvc_procedures4), .vs_proc = nlmsvc_procedures4, .vs_count = nlmsvc_version4_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #endif static const struct svc_version *nlmsvc_version[] = { [1] = &nlmsvc_version1, [3] = &nlmsvc_version3, #ifdef CONFIG_LOCKD_V4 [4] = &nlmsvc_version4, #endif }; #define NLM_NRVERS ARRAY_SIZE(nlmsvc_version) static struct svc_program nlmsvc_program = { .pg_prog = NLM_PROGRAM, /* program number */ .pg_nvers = NLM_NRVERS, /* number of entries in nlmsvc_version */ .pg_vers = nlmsvc_version, /* version table */ .pg_name = "lockd", /* service name */ .pg_class = "nfsd", /* share authentication with nfsd */ .pg_authenticate = &lockd_authenticate, /* export authentication */ .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, };
3 90 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 /* SPDX-License-Identifier: GPL-2.0-only */ /* * VMware VMCI driver (vmciContext.h) * * Copyright (C) 2012 VMware, Inc. All rights reserved. */ #ifndef _VMCI_CONTEXT_H_ #define _VMCI_CONTEXT_H_ #include <linux/vmw_vmci_defs.h> #include <linux/atomic.h> #include <linux/kref.h> #include <linux/types.h> #include <linux/wait.h> #include "vmci_handle_array.h" #include "vmci_datagram.h" /* Used to determine what checkpoint state to get and set. */ enum { VMCI_NOTIFICATION_CPT_STATE = 1, VMCI_WELLKNOWN_CPT_STATE = 2, VMCI_DG_OUT_STATE = 3, VMCI_DG_IN_STATE = 4, VMCI_DG_IN_SIZE_STATE = 5, VMCI_DOORBELL_CPT_STATE = 6, }; /* Host specific struct used for signalling */ struct vmci_host { wait_queue_head_t wait_queue; }; struct vmci_handle_list { struct list_head node; struct vmci_handle handle; }; struct vmci_ctx { struct list_head list_item; /* For global VMCI list. */ u32 cid; struct kref kref; struct list_head datagram_queue; /* Head of per VM queue. */ u32 pending_datagrams; size_t datagram_queue_size; /* Size of datagram queue in bytes. */ /* * Version of the code that created * this context; e.g., VMX. */ int user_version; spinlock_t lock; /* Locks callQueue and handle_arrays. */ /* * queue_pairs attached to. The array of * handles for queue pairs is accessed * from the code for QP API, and there * it is protected by the QP lock. It * is also accessed from the context * clean up path, which does not * require a lock. VMCILock is not * used to protect the QP array field. */ struct vmci_handle_arr *queue_pair_array; /* Doorbells created by context. */ struct vmci_handle_arr *doorbell_array; /* Doorbells pending for context. */ struct vmci_handle_arr *pending_doorbell_array; /* Contexts current context is subscribing to. */ struct list_head notifier_list; unsigned int n_notifiers; struct vmci_host host_context; u32 priv_flags; const struct cred *cred; bool *notify; /* Notify flag pointer - hosted only. */ struct page *notify_page; /* Page backing the notify UVA. */ }; /* VMCINotifyAddRemoveInfo: Used to add/remove remote context notifications. */ struct vmci_ctx_info { u32 remote_cid; int result; }; /* VMCICptBufInfo: Used to set/get current context's checkpoint state. */ struct vmci_ctx_chkpt_buf_info { u64 cpt_buf; u32 cpt_type; u32 buf_size; s32 result; u32 _pad; }; /* * VMCINotificationReceiveInfo: Used to recieve pending notifications * for doorbells and queue pairs. */ struct vmci_ctx_notify_recv_info { u64 db_handle_buf_uva; u64 db_handle_buf_size; u64 qp_handle_buf_uva; u64 qp_handle_buf_size; s32 result; u32 _pad; }; /* * Utilility function that checks whether two entities are allowed * to interact. If one of them is restricted, the other one must * be trusted. */ static inline bool vmci_deny_interaction(u32 part_one, u32 part_two) { return ((part_one & VMCI_PRIVILEGE_FLAG_RESTRICTED) && !(part_two & VMCI_PRIVILEGE_FLAG_TRUSTED)) || ((part_two & VMCI_PRIVILEGE_FLAG_RESTRICTED) && !(part_one & VMCI_PRIVILEGE_FLAG_TRUSTED)); } struct vmci_ctx *vmci_ctx_create(u32 cid, u32 flags, uintptr_t event_hnd, int version, const struct cred *cred); void vmci_ctx_destroy(struct vmci_ctx *context); bool vmci_ctx_supports_host_qp(struct vmci_ctx *context); int vmci_ctx_enqueue_datagram(u32 cid, struct vmci_datagram *dg); int vmci_ctx_dequeue_datagram(struct vmci_ctx *context, size_t *max_size, struct vmci_datagram **dg); int vmci_ctx_pending_datagrams(u32 cid, u32 *pending); struct vmci_ctx *vmci_ctx_get(u32 cid); void vmci_ctx_put(struct vmci_ctx *context); bool vmci_ctx_exists(u32 cid); int vmci_ctx_add_notification(u32 context_id, u32 remote_cid); int vmci_ctx_remove_notification(u32 context_id, u32 remote_cid); int vmci_ctx_get_chkpt_state(u32 context_id, u32 cpt_type, u32 *num_cids, void **cpt_buf_ptr); int vmci_ctx_set_chkpt_state(u32 context_id, u32 cpt_type, u32 num_cids, void *cpt_buf); int vmci_ctx_qp_create(struct vmci_ctx *context, struct vmci_handle handle); int vmci_ctx_qp_destroy(struct vmci_ctx *context, struct vmci_handle handle); bool vmci_ctx_qp_exists(struct vmci_ctx *context, struct vmci_handle handle); void vmci_ctx_check_signal_notify(struct vmci_ctx *context); void vmci_ctx_unset_notify(struct vmci_ctx *context); int vmci_ctx_dbell_create(u32 context_id, struct vmci_handle handle); int vmci_ctx_dbell_destroy(u32 context_id, struct vmci_handle handle); int vmci_ctx_dbell_destroy_all(u32 context_id); int vmci_ctx_notify_dbell(u32 cid, struct vmci_handle handle, u32 src_priv_flags); int vmci_ctx_rcv_notifications_get(u32 context_id, struct vmci_handle_arr **db_handle_array, struct vmci_handle_arr **qp_handle_array); void vmci_ctx_rcv_notifications_release(u32 context_id, struct vmci_handle_arr *db_handle_array, struct vmci_handle_arr *qp_handle_array, bool success); static inline u32 vmci_ctx_get_id(struct vmci_ctx *context) { if (!context) return VMCI_INVALID_ID; return context->cid; } #endif /* _VMCI_CONTEXT_H_ */
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 /* SPDX-License-Identifier: GPL-2.0 */ /* * Internals of the DMA direct mapping implementation. Only for use by the * DMA mapping code and IOMMU drivers. */ #ifndef _LINUX_DMA_DIRECT_H #define _LINUX_DMA_DIRECT_H 1 #include <linux/dma-mapping.h> #include <linux/dma-map-ops.h> #include <linux/memblock.h> /* for min_low_pfn */ #include <linux/mem_encrypt.h> #include <linux/swiotlb.h> extern u64 zone_dma_limit; /* * Record the mapping of CPU physical to DMA addresses for a given region. */ struct bus_dma_region { phys_addr_t cpu_start; dma_addr_t dma_start; u64 size; }; static inline dma_addr_t translate_phys_to_dma(struct device *dev, phys_addr_t paddr) { const struct bus_dma_region *m; for (m = dev->dma_range_map; m->size; m++) { u64 offset = paddr - m->cpu_start; if (paddr >= m->cpu_start && offset < m->size) return m->dma_start + offset; } /* make sure dma_capable fails when no translation is available */ return DMA_MAPPING_ERROR; } static inline phys_addr_t translate_dma_to_phys(struct device *dev, dma_addr_t dma_addr) { const struct bus_dma_region *m; for (m = dev->dma_range_map; m->size; m++) { u64 offset = dma_addr - m->dma_start; if (dma_addr >= m->dma_start && offset < m->size) return m->cpu_start + offset; } return (phys_addr_t)-1; } static inline dma_addr_t dma_range_map_min(const struct bus_dma_region *map) { dma_addr_t ret = (dma_addr_t)U64_MAX; for (; map->size; map++) ret = min(ret, map->dma_start); return ret; } static inline dma_addr_t dma_range_map_max(const struct bus_dma_region *map) { dma_addr_t ret = 0; for (; map->size; map++) ret = max(ret, map->dma_start + map->size - 1); return ret; } #ifdef CONFIG_ARCH_HAS_PHYS_TO_DMA #include <asm/dma-direct.h> #ifndef phys_to_dma_unencrypted #define phys_to_dma_unencrypted phys_to_dma #endif #else static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, phys_addr_t paddr) { if (dev->dma_range_map) return translate_phys_to_dma(dev, paddr); return paddr; } /* * If memory encryption is supported, phys_to_dma will set the memory encryption * bit in the DMA address, and dma_to_phys will clear it. * phys_to_dma_unencrypted is for use on special unencrypted memory like swiotlb * buffers. */ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { return __sme_set(phys_to_dma_unencrypted(dev, paddr)); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr; if (dev->dma_range_map) paddr = translate_dma_to_phys(dev, dma_addr); else paddr = dma_addr; return __sme_clr(paddr); } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ #ifdef CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED bool force_dma_unencrypted(struct device *dev); #else static inline bool force_dma_unencrypted(struct device *dev) { return false; } #endif /* CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED */ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, bool is_ram) { dma_addr_t end = addr + size - 1; if (addr == DMA_MAPPING_ERROR) return false; if (is_ram && !IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn))) return false; return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_limit); } u64 dma_direct_get_required_mask(struct device *dev); void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); struct page *dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); void dma_direct_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_addr, enum dma_data_direction dir); int dma_direct_supported(struct device *dev, u64 mask); dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs); #endif /* _LINUX_DMA_DIRECT_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2023 Western Digital Corporation or its affiliates. */ #include <linux/btrfs_tree.h> #include "ctree.h" #include "fs.h" #include "accessors.h" #include "transaction.h" #include "disk-io.h" #include "raid-stripe-tree.h" #include "volumes.h" #include "print-tree.h" static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *oldkey, u64 newlen, u64 frontpad) { struct btrfs_stripe_extent *extent; struct extent_buffer *leaf; int slot; size_t item_size; struct btrfs_key newkey = { .objectid = oldkey->objectid + frontpad, .type = BTRFS_RAID_STRIPE_KEY, .offset = newlen, }; ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); leaf = path->nodes[0]; slot = path->slots[0]; item_size = btrfs_item_size(leaf, slot); extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { struct btrfs_raid_stride *stride = &extent->strides[i]; u64 phys; phys = btrfs_raid_stride_physical(leaf, stride); btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); } btrfs_set_item_key_safe(trans, path, &newkey); } int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *stripe_root = fs_info->stripe_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf; u64 found_start; u64 found_end; u64 end = start + length; int slot; int ret; if (!stripe_root) return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; while (1) { key.objectid = start; key.type = BTRFS_RAID_STRIPE_KEY; key.offset = 0; ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); if (ret < 0) break; if (path->slots[0] == btrfs_header_nritems(path->nodes[0])) path->slots[0]--; leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); found_start = key.objectid; found_end = found_start + key.offset; ret = 0; if (key.type != BTRFS_RAID_STRIPE_KEY) break; /* That stripe ends before we start, we're done. */ if (found_end <= start) break; trace_btrfs_raid_extent_delete(fs_info, start, end, found_start, found_end); /* * The stripe extent starts before the range we want to delete: * * |--- RAID Stripe Extent ---| * |--- keep ---|--- drop ---| * * This means we have to duplicate the tree item, truncate the * length to the new size and then re-insert the item. */ if (found_start < start) { u64 diff = start - found_start; btrfs_partially_delete_raid_extent(trans, path, &key, diff, 0); break; } /* * The stripe extent ends after the range we want to delete: * * |--- RAID Stripe Extent ---| * |--- drop ---|--- keep ---| * * This means we have to duplicate the tree item, truncate the * length to the new size and then re-insert the item. */ if (found_end > end) { u64 diff = found_end - end; btrfs_partially_delete_raid_extent(trans, path, &key, diff, diff); break; } ret = btrfs_del_item(trans, stripe_root, path); if (ret) break; start += key.offset; length -= key.offset; if (length == 0) break; btrfs_release_path(path); } btrfs_free_path(path); return ret; } static int update_raid_extent_item(struct btrfs_trans_handle *trans, struct btrfs_key *key, struct btrfs_stripe_extent *stripe_extent, const size_t item_size) { struct btrfs_path *path; struct extent_buffer *leaf; int ret; int slot; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path, 0, 1); if (ret) return (ret == 1 ? ret : -EINVAL); leaf = path->nodes[0]; slot = path->slots[0]; write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), item_size); btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return ret; } EXPORT_FOR_TESTS int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key stripe_key; struct btrfs_root *stripe_root = fs_info->stripe_root; const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type); struct btrfs_stripe_extent *stripe_extent; const size_t item_size = struct_size(stripe_extent, strides, num_stripes); int ret; stripe_extent = kzalloc(item_size, GFP_NOFS); if (!stripe_extent) { btrfs_abort_transaction(trans, -ENOMEM); btrfs_end_transaction(trans); return -ENOMEM; } trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size, num_stripes); for (int i = 0; i < num_stripes; i++) { u64 devid = bioc->stripes[i].dev->devid; u64 physical = bioc->stripes[i].physical; u64 length = bioc->stripes[i].length; struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i]; if (length == 0) length = bioc->size; btrfs_set_stack_raid_stride_devid(raid_stride, devid); btrfs_set_stack_raid_stride_physical(raid_stride, physical); } stripe_key.objectid = bioc->logical; stripe_key.type = BTRFS_RAID_STRIPE_KEY; stripe_key.offset = bioc->size; ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent, item_size); if (ret == -EEXIST) ret = update_raid_extent_item(trans, &stripe_key, stripe_extent, item_size); if (ret) btrfs_abort_transaction(trans, ret); kfree(stripe_extent); return ret; } int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered_extent) { struct btrfs_io_context *bioc; int ret; if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE)) return 0; list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) { ret = btrfs_insert_one_raid_extent(trans, bioc); if (ret) return ret; } while (!list_empty(&ordered_extent->bioc_list)) { bioc = list_first_entry(&ordered_extent->bioc_list, typeof(*bioc), rst_ordered_entry); list_del(&bioc->rst_ordered_entry); btrfs_put_bioc(bioc); } return 0; } int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, u64 logical, u64 *length, u64 map_type, u32 stripe_index, struct btrfs_io_stripe *stripe) { struct btrfs_root *stripe_root = fs_info->stripe_root; struct btrfs_stripe_extent *stripe_extent; struct btrfs_key stripe_key; struct btrfs_key found_key; struct btrfs_path *path; struct extent_buffer *leaf; const u64 end = logical + *length; int num_stripes; u64 offset; u64 found_logical; u64 found_length; u64 found_end; int slot; int ret; stripe_key.objectid = logical; stripe_key.type = BTRFS_RAID_STRIPE_KEY; stripe_key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (stripe->rst_search_commit_root) { path->skip_locking = 1; path->search_commit_root = 1; } ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0); if (ret < 0) goto free_path; if (ret) { if (path->slots[0] != 0) path->slots[0]--; } while (1) { leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); found_logical = found_key.objectid; found_length = found_key.offset; found_end = found_logical + found_length; if (found_logical > end) { ret = -ENODATA; goto out; } if (in_range(logical, found_logical, found_length)) break; ret = btrfs_next_item(stripe_root, path); if (ret) goto out; } offset = logical - found_logical; /* * If we have a logically contiguous, but physically non-continuous * range, we need to split the bio. Record the length after which we * must split the bio. */ if (end > found_end) *length -= end - found_end; num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot)); stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); for (int i = 0; i < num_stripes; i++) { struct btrfs_raid_stride *stride = &stripe_extent->strides[i]; u64 devid = btrfs_raid_stride_devid(leaf, stride); u64 physical = btrfs_raid_stride_physical(leaf, stride); if (devid != stripe->dev->devid) continue; if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i) continue; stripe->physical = physical + offset; trace_btrfs_get_raid_extent_offset(fs_info, logical, *length, stripe->physical, devid); ret = 0; goto free_path; } /* If we're here, we haven't found the requested devid in the stripe. */ ret = -ENODATA; out: if (ret > 0) ret = -ENODATA; if (ret && ret != -EIO && !stripe->rst_search_commit_root) { btrfs_debug(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", logical, logical + *length, stripe->dev->devid, btrfs_bg_type_to_raid_name(map_type)); } free_path: btrfs_free_path(path); return ret; }
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * ebtables * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * ebtables.c,v 2.0, April, 2002 * * This code is strongly inspired by the iptables code which is * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling */ #ifndef _UAPI__LINUX_BRIDGE_EFF_H #define _UAPI__LINUX_BRIDGE_EFF_H #include <linux/types.h> #include <linux/if.h> #include <linux/netfilter_bridge.h> #define EBT_TABLE_MAXNAMELEN 32 #define EBT_CHAIN_MAXNAMELEN EBT_TABLE_MAXNAMELEN #define EBT_FUNCTION_MAXNAMELEN EBT_TABLE_MAXNAMELEN #define EBT_EXTENSION_MAXNAMELEN 31 /* verdicts >0 are "branches" */ #define EBT_ACCEPT -1 #define EBT_DROP -2 #define EBT_CONTINUE -3 #define EBT_RETURN -4 #define NUM_STANDARD_TARGETS 4 /* ebtables target modules store the verdict inside an int. We can * reclaim a part of this int for backwards compatible extensions. * The 4 lsb are more than enough to store the verdict. */ #define EBT_VERDICT_BITS 0x0000000F struct xt_match; struct xt_target; struct ebt_counter { __u64 pcnt; __u64 bcnt; }; struct ebt_replace { char name[EBT_TABLE_MAXNAMELEN]; unsigned int valid_hooks; /* nr of rules in the table */ unsigned int nentries; /* total size of the entries */ unsigned int entries_size; /* start of the chains */ struct ebt_entries __user *hook_entry[NF_BR_NUMHOOKS]; /* nr of counters userspace expects back */ unsigned int num_counters; /* where the kernel will put the old counters */ struct ebt_counter __user *counters; char __user *entries; }; struct ebt_replace_kernel { char name[EBT_TABLE_MAXNAMELEN]; unsigned int valid_hooks; /* nr of rules in the table */ unsigned int nentries; /* total size of the entries */ unsigned int entries_size; /* start of the chains */ struct ebt_entries *hook_entry[NF_BR_NUMHOOKS]; /* nr of counters userspace expects back */ unsigned int num_counters; /* where the kernel will put the old counters */ struct ebt_counter *counters; char *entries; }; struct ebt_entries { /* this field is always set to zero * See EBT_ENTRY_OR_ENTRIES. * Must be same size as ebt_entry.bitmask */ unsigned int distinguisher; /* the chain name */ char name[EBT_CHAIN_MAXNAMELEN]; /* counter offset for this chain */ unsigned int counter_offset; /* one standard (accept, drop, return) per hook */ int policy; /* nr. of entries */ unsigned int nentries; /* entry list */ char data[] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; /* used for the bitmask of struct ebt_entry */ /* This is a hack to make a difference between an ebt_entry struct and an * ebt_entries struct when traversing the entries from start to end. * Using this simplifies the code a lot, while still being able to use * ebt_entries. * Contrary, iptables doesn't use something like ebt_entries and therefore uses * different techniques for naming the policy and such. So, iptables doesn't * need a hack like this. */ #define EBT_ENTRY_OR_ENTRIES 0x01 /* these are the normal masks */ #define EBT_NOPROTO 0x02 #define EBT_802_3 0x04 #define EBT_SOURCEMAC 0x08 #define EBT_DESTMAC 0x10 #define EBT_F_MASK (EBT_NOPROTO | EBT_802_3 | EBT_SOURCEMAC | EBT_DESTMAC \ | EBT_ENTRY_OR_ENTRIES) #define EBT_IPROTO 0x01 #define EBT_IIN 0x02 #define EBT_IOUT 0x04 #define EBT_ISOURCE 0x8 #define EBT_IDEST 0x10 #define EBT_ILOGICALIN 0x20 #define EBT_ILOGICALOUT 0x40 #define EBT_INV_MASK (EBT_IPROTO | EBT_IIN | EBT_IOUT | EBT_ILOGICALIN \ | EBT_ILOGICALOUT | EBT_ISOURCE | EBT_IDEST) struct ebt_entry_match { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; __u8 revision; }; struct xt_match *match; } u; /* size of data */ unsigned int match_size; unsigned char data[] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; struct ebt_entry_watcher { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; __u8 revision; }; struct xt_target *watcher; } u; /* size of data */ unsigned int watcher_size; unsigned char data[] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; struct ebt_entry_target { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; __u8 revision; }; struct xt_target *target; } u; /* size of data */ unsigned int target_size; unsigned char data[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; #define EBT_STANDARD_TARGET "standard" struct ebt_standard_target { struct ebt_entry_target target; int verdict; }; /* one entry */ struct ebt_entry { /* this needs to be the first field */ unsigned int bitmask; unsigned int invflags; __be16 ethproto; /* the physical in-dev */ char in[IFNAMSIZ]; /* the logical in-dev */ char logical_in[IFNAMSIZ]; /* the physical out-dev */ char out[IFNAMSIZ]; /* the logical out-dev */ char logical_out[IFNAMSIZ]; unsigned char sourcemac[ETH_ALEN]; unsigned char sourcemsk[ETH_ALEN]; unsigned char destmac[ETH_ALEN]; unsigned char destmsk[ETH_ALEN]; __struct_group(/* no tag */, offsets, /* no attrs */, /* sizeof ebt_entry + matches */ unsigned int watchers_offset; /* sizeof ebt_entry + matches + watchers */ unsigned int target_offset; /* sizeof ebt_entry + matches + watchers + target */ unsigned int next_offset; ); unsigned char elems[] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; static __inline__ struct ebt_entry_target * ebt_get_target(struct ebt_entry *e) { return (struct ebt_entry_target *)((char *)e + e->target_offset); } /* {g,s}etsockopt numbers */ #define EBT_BASE_CTL 128 #define EBT_SO_SET_ENTRIES (EBT_BASE_CTL) #define EBT_SO_SET_COUNTERS (EBT_SO_SET_ENTRIES+1) #define EBT_SO_SET_MAX (EBT_SO_SET_COUNTERS+1) #define EBT_SO_GET_INFO (EBT_BASE_CTL) #define EBT_SO_GET_ENTRIES (EBT_SO_GET_INFO+1) #define EBT_SO_GET_INIT_INFO (EBT_SO_GET_ENTRIES+1) #define EBT_SO_GET_INIT_ENTRIES (EBT_SO_GET_INIT_INFO+1) #define EBT_SO_GET_MAX (EBT_SO_GET_INIT_ENTRIES+1) /* blatently stolen from ip_tables.h * fn returns 0 to continue iteration */ #define EBT_MATCH_ITERATE(e, fn, args...) \ ({ \ unsigned int __i; \ int __ret = 0; \ struct ebt_entry_match *__match; \ \ for (__i = sizeof(struct ebt_entry); \ __i < (e)->watchers_offset; \ __i += __match->match_size + \ sizeof(struct ebt_entry_match)) { \ __match = (void *)(e) + __i; \ \ __ret = fn(__match , ## args); \ if (__ret != 0) \ break; \ } \ if (__ret == 0) { \ if (__i != (e)->watchers_offset) \ __ret = -EINVAL; \ } \ __ret; \ }) #define EBT_WATCHER_ITERATE(e, fn, args...) \ ({ \ unsigned int __i; \ int __ret = 0; \ struct ebt_entry_watcher *__watcher; \ \ for (__i = e->watchers_offset; \ __i < (e)->target_offset; \ __i += __watcher->watcher_size + \ sizeof(struct ebt_entry_watcher)) { \ __watcher = (void *)(e) + __i; \ \ __ret = fn(__watcher , ## args); \ if (__ret != 0) \ break; \ } \ if (__ret == 0) { \ if (__i != (e)->target_offset) \ __ret = -EINVAL; \ } \ __ret; \ }) #define EBT_ENTRY_ITERATE(entries, size, fn, args...) \ ({ \ unsigned int __i; \ int __ret = 0; \ struct ebt_entry *__entry; \ \ for (__i = 0; __i < (size);) { \ __entry = (void *)(entries) + __i; \ __ret = fn(__entry , ## args); \ if (__ret != 0) \ break; \ if (__entry->bitmask != 0) \ __i += __entry->next_offset; \ else \ __i += sizeof(struct ebt_entries); \ } \ if (__ret == 0) { \ if (__i != (size)) \ __ret = -EINVAL; \ } \ __ret; \ }) #endif /* _UAPI__LINUX_BRIDGE_EFF_H */
138 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 // SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/bug.h> #include <linux/export.h> #include <linux/limits.h> #include <linux/math.h> #include <linux/minmax.h> #include <linux/types.h> #include <linux/reciprocal_div.h> /* * For a description of the algorithm please have a look at * include/linux/reciprocal_div.h */ struct reciprocal_value reciprocal_value(u32 d) { struct reciprocal_value R; u64 m; int l; l = fls(d - 1); m = ((1ULL << 32) * ((1ULL << l) - d)); do_div(m, d); ++m; R.m = (u32)m; R.sh1 = min(l, 1); R.sh2 = max(l - 1, 0); return R; } EXPORT_SYMBOL(reciprocal_value); struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec) { struct reciprocal_value_adv R; u32 l, post_shift; u64 mhigh, mlow; /* ceil(log2(d)) */ l = fls(d - 1); /* NOTE: mlow/mhigh could overflow u64 when l == 32. This case needs to * be handled before calling "reciprocal_value_adv", please see the * comment at include/linux/reciprocal_div.h. */ WARN(l == 32, "ceil(log2(0x%08x)) == 32, %s doesn't support such divisor", d, __func__); post_shift = l; mlow = 1ULL << (32 + l); do_div(mlow, d); mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec)); do_div(mhigh, d); for (; post_shift > 0; post_shift--) { u64 lo = mlow >> 1, hi = mhigh >> 1; if (lo >= hi) break; mlow = lo; mhigh = hi; } R.m = (u32)mhigh; R.sh = post_shift; R.exp = l; R.is_wide_m = mhigh > U32_MAX; return R; } EXPORT_SYMBOL(reciprocal_value_adv);
1 1 1 20 7 13 13 13 5 3 2 2 1 1 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 // SPDX-License-Identifier: GPL-2.0 /* * Support for async notification of waitid */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "cancel.h" #include "waitid.h" #include "../kernel/exit.h" static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); #define IO_WAITID_CANCEL_FLAG BIT(31) #define IO_WAITID_REF_MASK GENMASK(30, 0) struct io_waitid { struct file *file; int which; pid_t upid; int options; atomic_t refs; struct wait_queue_head *head; struct siginfo __user *infop; struct waitid_info info; }; static void io_waitid_free(struct io_kiocb *req) { struct io_waitid_async *iwa = req->async_data; put_pid(iwa->wo.wo_pid); kfree(req->async_data); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } #ifdef CONFIG_COMPAT static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) { struct compat_siginfo __user *infop; bool ret; infop = (struct compat_siginfo __user *) iw->infop; if (!user_write_access_begin(infop, sizeof(*infop))) return false; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(iw->info.cause, &infop->si_code, Efault); unsafe_put_user(iw->info.pid, &infop->si_pid, Efault); unsafe_put_user(iw->info.uid, &infop->si_uid, Efault); unsafe_put_user(iw->info.status, &infop->si_status, Efault); ret = true; done: user_write_access_end(); return ret; Efault: ret = false; goto done; } #endif static bool io_waitid_copy_si(struct io_kiocb *req, int signo) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); bool ret; if (!iw->infop) return true; #ifdef CONFIG_COMPAT if (req->ctx->compat) return io_waitid_compat_copy_si(iw, signo); #endif if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) return false; unsafe_put_user(signo, &iw->infop->si_signo, Efault); unsafe_put_user(0, &iw->infop->si_errno, Efault); unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault); unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault); unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault); unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault); ret = true; done: user_write_access_end(); return ret; Efault: ret = false; goto done; } static int io_waitid_finish(struct io_kiocb *req, int ret) { int signo = 0; if (ret > 0) { signo = SIGCHLD; ret = 0; } if (!io_waitid_copy_si(req, signo)) ret = -EFAULT; io_waitid_free(req); return ret; } static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_tw_state ts = {}; /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); lockdep_assert_held(&req->ctx->uring_lock); hlist_del_init(&req->hash_node); ret = io_waitid_finish(req, ret); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); io_req_task_complete(req, &ts); } static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa = req->async_data; /* * Mark us canceled regardless of ownership. This will prevent a * potential retry from a spurious wakeup. */ atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs); /* claim ownership */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return false; spin_lock_irq(&iw->head->lock); list_del_init(&iwa->wo.child_wait.entry); spin_unlock_irq(&iw->head->lock); io_waitid_complete(req, -ECANCELED); return true; } int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { struct hlist_node *tmp; struct io_kiocb *req; int nr = 0; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) return -ENOENT; io_ring_submit_lock(ctx, issue_flags); hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { if (req->cqe.user_data != cd->data && !(cd->flags & IORING_ASYNC_CANCEL_ANY)) continue; if (__io_waitid_cancel(ctx, req)) nr++; if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) break; } io_ring_submit_unlock(ctx, issue_flags); if (nr) return nr; return -ENOENT; } bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; bool found = false; lockdep_assert_held(&ctx->uring_lock); hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { if (!io_match_task_safe(req, tctx, cancel_all)) continue; hlist_del_init(&req->hash_node); __io_waitid_cancel(ctx, req); found = true; } return found; } static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa = req->async_data; if (!atomic_sub_return(1, &iw->refs)) return false; /* * Wakeup triggered, racing with us. It was prevented from * completing because of that, queue up the tw to do that. */ req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); remove_wait_queue(iw->head, &iwa->wo.child_wait); return true; } static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) { struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; io_tw_lock(ctx, ts); ret = __do_wait(&iwa->wo); /* * If we get -ERESTARTSYS here, we need to re-arm and check again * to ensure we get another callback. If the retry works, then we can * just remove ourselves from the waitqueue again and finish the * request. */ if (unlikely(ret == -ERESTARTSYS)) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); /* Don't retry if cancel found it meanwhile */ ret = -ECANCELED; if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) { iw->head = &current->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); ret = __do_wait(&iwa->wo); if (ret == -ERESTARTSYS) { /* retry armed, drop our ref */ io_waitid_drop_issue_ref(req); return; } remove_wait_queue(iw->head, &iwa->wo.child_wait); } } io_waitid_complete(req, ret); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo); struct io_kiocb *req = iwa->req; struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct task_struct *p = key; if (!pid_child_should_wake(wo, p)) return 0; /* cancel is in progress */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return 1; req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); list_del_init(&wait->entry); return 1; } int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) return -EINVAL; iw->which = READ_ONCE(sqe->len); iw->upid = READ_ONCE(sqe->fd); iw->options = READ_ONCE(sqe->file_index); iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); return 0; } int io_waitid(struct io_kiocb *req, unsigned int issue_flags) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_ring_ctx *ctx = req->ctx; struct io_waitid_async *iwa; int ret; if (io_alloc_async_data(req)) return -ENOMEM; iwa = req->async_data; iwa->req = req; ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, iw->options, NULL); if (ret) goto done; /* * Mark the request as busy upfront, in case we're racing with the * wakeup. If we are, then we'll notice when we drop this initial * reference again after arming. */ atomic_set(&iw->refs, 1); /* * Cancel must hold the ctx lock, so there's no risk of cancelation * finding us until a) we remain on the list, and b) the lock is * dropped. We only need to worry about racing with the wakeup * callback. */ io_ring_submit_lock(ctx, issue_flags); hlist_add_head(&req->hash_node, &ctx->waitid_list); init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); iwa->wo.child_wait.private = req->tctx->task; iw->head = &current->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); ret = __do_wait(&iwa->wo); if (ret == -ERESTARTSYS) { /* * Nobody else grabbed a reference, it'll complete when we get * a waitqueue callback, or if someone cancels it. */ if (!io_waitid_drop_issue_ref(req)) { io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } /* * Wakeup triggered, racing with us. It was prevented from * completing because of that, queue up the tw to do that. */ io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } hlist_del_init(&req->hash_node); remove_wait_queue(iw->head, &iwa->wo.child_wait); ret = io_waitid_finish(req, ret); io_ring_submit_unlock(ctx, issue_flags); done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; }
360 360 360 360 361 361 356 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_X86_MMU_INTERNAL_H #define __KVM_X86_MMU_INTERNAL_H #include <linux/types.h> #include <linux/kvm_host.h> #include <asm/kvm_host.h> #ifdef CONFIG_KVM_PROVE_MMU #define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x) #else #define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x) #endif /* Page table builder macros common to shadow (host) PTEs and guest PTEs. */ #define __PT_BASE_ADDR_MASK GENMASK_ULL(51, 12) #define __PT_LEVEL_SHIFT(level, bits_per_level) \ (PAGE_SHIFT + ((level) - 1) * (bits_per_level)) #define __PT_INDEX(address, level, bits_per_level) \ (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1)) #define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \ ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) #define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \ ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) #define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level)) /* * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT * bit, and thus are guaranteed to be non-zero when valid. And, when a guest * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE, * as the CPU would treat that as PRESENT PDPTR with reserved bits set. Use * '0' instead of INVALID_PAGE to indicate an invalid PAE root. */ #define INVALID_PAE_ROOT 0 #define IS_VALID_PAE_ROOT(x) (!!(x)) static inline hpa_t kvm_mmu_get_dummy_root(void) { return my_zero_pfn(0) << PAGE_SHIFT; } static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page) { return is_zero_pfn(shadow_page >> PAGE_SHIFT); } typedef u64 __rcu *tdp_ptep_t; struct kvm_mmu_page { /* * Note, "link" through "spt" fit in a single 64 byte cache line on * 64-bit kernels, keep it that way unless there's a reason not to. */ struct list_head link; struct hlist_node hash_link; bool tdp_mmu_page; bool unsync; union { u8 mmu_valid_gen; /* Only accessed under slots_lock. */ bool tdp_mmu_scheduled_root_to_zap; }; /* * The shadow page can't be replaced by an equivalent huge page * because it is being used to map an executable page in the guest * and the NX huge page mitigation is enabled. */ bool nx_huge_page_disallowed; /* * The following two entries are used to key the shadow page in the * hash table. */ union kvm_mmu_page_role role; gfn_t gfn; u64 *spt; /* * Stores the result of the guest translation being shadowed by each * SPTE. KVM shadows two types of guest translations: nGPA -> GPA * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both * cases the result of the translation is a GPA and a set of access * constraints. * * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed * access permissions are stored in the lower bits. Note, for * convenience and uniformity across guests, the access permissions are * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format. */ u64 *shadowed_translation; /* Currently serving as active root */ union { int root_count; refcount_t tdp_mmu_root_count; }; unsigned int unsync_children; union { struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ tdp_ptep_t ptep; }; DECLARE_BITMAP(unsync_child_bitmap, 512); /* * Tracks shadow pages that, if zapped, would allow KVM to create an NX * huge page. A shadow page will have nx_huge_page_disallowed set but * not be on the list if a huge page is disallowed for other reasons, * e.g. because KVM is shadowing a PTE at the same gfn, the memslot * isn't properly aligned, etc... */ struct list_head possible_nx_huge_page_link; #ifdef CONFIG_X86_32 /* * Used out of the mmu-lock to avoid reading spte values while an * update is in progress; see the comments in __get_spte_lockless(). */ int clear_spte_count; #endif /* Number of writes since the last time traversal visited this page. */ atomic_t write_flooding_count; #ifdef CONFIG_X86_64 /* Used for freeing the page asynchronously if it is a TDP MMU page. */ struct rcu_head rcu_head; #endif }; extern struct kmem_cache *mmu_page_header_cache; static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role) { return role.smm ? 1 : 0; } static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) { return kvm_mmu_role_as_id(sp->role); } static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) { /* * When using the EPT page-modification log, the GPAs in the CPU dirty * log would come from L2 rather than L1. Therefore, we need to rely * on write protection to record dirty pages, which bypasses PML, since * writes now result in a vmexit. Note, the check on CPU dirty logging * being enabled is mandatory as the bits used to denote WP-only SPTEs * are reserved for PAE paging (32-bit KVM). */ return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; } static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) { return gfn & -KVM_PAGES_PER_HPAGE(level); } int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, bool synchronizing, bool prefetch); void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); /* Flush the given page (huge or not) of guest memory. */ static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level) { kvm_flush_remote_tlbs_range(kvm, gfn_round_for_level(gfn, level), KVM_PAGES_PER_HPAGE(level)); } unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; static inline bool is_nx_huge_page_enabled(struct kvm *kvm) { return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages; } struct kvm_page_fault { /* arguments to kvm_mmu_do_page_fault. */ const gpa_t addr; const u64 error_code; const bool prefetch; /* Derived from error_code. */ const bool exec; const bool write; const bool present; const bool rsvd; const bool user; /* Derived from mmu and global state. */ const bool is_tdp; const bool is_private; const bool nx_huge_page_workaround_enabled; /* * Whether a >4KB mapping can be created or is forbidden due to NX * hugepages. */ bool huge_page_disallowed; /* * Maximum page size that can be created for this fault; input to * FNAME(fetch), direct_map() and kvm_tdp_mmu_map(). */ u8 max_level; /* * Page size that can be created based on the max_level and the * page size used by the host mapping. */ u8 req_level; /* * Page size that will be created based on the req_level and * huge_page_disallowed. */ u8 goal_level; /* Shifted addr, or result of guest page table walk if addr is a gva. */ gfn_t gfn; /* The memslot containing gfn. May be NULL. */ struct kvm_memory_slot *slot; /* Outputs of kvm_mmu_faultin_pfn(). */ unsigned long mmu_seq; kvm_pfn_t pfn; struct page *refcounted_page; bool map_writable; /* * Indicates the guest is trying to write a gfn that contains one or * more of the PTEs used to translate the write itself, i.e. the access * is changing its own translation in the guest page tables. */ bool write_fault_to_shadow_pgtable; }; int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); /* * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(), * and of course kvm_mmu_do_page_fault(). * * RET_PF_CONTINUE: So far, so good, keep handling the page fault. * RET_PF_RETRY: let CPU fault again on the address. * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the * gfn and retry, or emulate the instruction directly. * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. * * Any names added to this enum should be exported to userspace for use in * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h * * Note, all values must be greater than or equal to zero so as not to encroach * on -errno return values. Somewhat arbitrarily use '0' for CONTINUE, which * will allow for efficient machine code when checking for CONTINUE, e.g. * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero. */ enum { RET_PF_CONTINUE = 0, RET_PF_RETRY, RET_PF_EMULATE, RET_PF_WRITE_PROTECTED, RET_PF_INVALID, RET_PF_FIXED, RET_PF_SPURIOUS, }; static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, PAGE_SIZE, fault->write, fault->exec, fault->is_private); } static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err, bool prefetch, int *emulation_type, u8 *level) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, .error_code = err, .exec = err & PFERR_FETCH_MASK, .write = err & PFERR_WRITE_MASK, .present = err & PFERR_PRESENT_MASK, .rsvd = err & PFERR_RSVD_MASK, .user = err & PFERR_USER_MASK, .prefetch = prefetch, .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm), .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, .is_private = err & PFERR_PRIVATE_ACCESS, .pfn = KVM_PFN_ERR_FAULT, }; int r; if (vcpu->arch.mmu->root_role.direct) { fault.gfn = fault.addr >> PAGE_SHIFT; fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); } if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp) r = kvm_tdp_page_fault(vcpu, &fault); else r = vcpu->arch.mmu->page_fault(vcpu, &fault); /* * Not sure what's happening, but punt to userspace and hope that * they can fix it by changing memory to shared, or they can * provide a better error. */ if (r == RET_PF_EMULATE && fault.is_private) { pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n"); kvm_mmu_prepare_memory_fault_exit(vcpu, &fault); return -EFAULT; } if (fault.write_fault_to_shadow_pgtable && emulation_type) *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; if (level) *level = fault.goal_level; return r; } int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); #endif /* __KVM_X86_MMU_INTERNAL_H */
2 6 6 2 4 2 2 11 4 4 4 9 9 9 2 7 7 7 7 7 6 2 6 9 9 16 13 2 2 14 20 20 2 1 2 1 15 16 1 1 9 9 9 9 8 8 1 1 2 2 7 7 7 2 2 2 2 28 1 12 3 1 11 5 6 4 4 4 4 6 6 5 4 9 9 1 1 1 1 1 3 1 3 3 4 2 2 4 7 9 429 431 430 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 /* * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/bug.h> #include <linux/kdev_t.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/proc_ns.h> #include <linux/rhashtable.h> #include <linux/rtnetlink.h> #include <linux/rwsem.h> #include <net/xdp.h> /* Protects offdevs, members of bpf_offload_netdev and offload members * of all progs. * RTNL lock cannot be taken when holding this lock. */ static DECLARE_RWSEM(bpf_devs_lock); struct bpf_offload_dev { const struct bpf_prog_offload_ops *ops; struct list_head netdevs; void *priv; }; struct bpf_offload_netdev { struct rhash_head l; struct net_device *netdev; struct bpf_offload_dev *offdev; /* NULL when bound-only */ struct list_head progs; struct list_head maps; struct list_head offdev_netdevs; }; static const struct rhashtable_params offdevs_params = { .nelem_hint = 4, .key_len = sizeof(struct net_device *), .key_offset = offsetof(struct bpf_offload_netdev, netdev), .head_offset = offsetof(struct bpf_offload_netdev, l), .automatic_shrinking = true, }; static struct rhashtable offdevs; static int bpf_dev_offload_check(struct net_device *netdev) { if (!netdev) return -EINVAL; if (!netdev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; return 0; } static struct bpf_offload_netdev * bpf_offload_find_netdev(struct net_device *netdev) { lockdep_assert_held(&bpf_devs_lock); return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); } static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev) { struct bpf_offload_netdev *ondev; int err; ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); if (!ondev) return -ENOMEM; ondev->netdev = netdev; ondev->offdev = offdev; INIT_LIST_HEAD(&ondev->progs); INIT_LIST_HEAD(&ondev->maps); err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); if (err) { netdev_warn(netdev, "failed to register for BPF offload\n"); goto err_free; } if (offdev) list_add(&ondev->offdev_netdevs, &offdev->netdevs); return 0; err_free: kfree(ondev); return err; } static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_prog_offload *offload = prog->aux->offload; if (offload->dev_state) offload->offdev->ops->destroy(prog); list_del_init(&offload->offloads); kfree(offload); prog->aux->offload = NULL; } static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, enum bpf_netdev_command cmd) { struct netdev_bpf data = {}; struct net_device *netdev; ASSERT_RTNL(); data.command = cmd; data.offmap = offmap; /* Caller must make sure netdev is valid */ netdev = offmap->netdev; return netdev->netdev_ops->ndo_bpf(netdev, &data); } static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) { WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ bpf_map_free_id(&offmap->map); list_del_init(&offmap->offloads); offmap->netdev = NULL; } static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev) { struct bpf_offload_netdev *ondev, *altdev = NULL; struct bpf_offloaded_map *offmap, *mtmp; struct bpf_prog_offload *offload, *ptmp; ASSERT_RTNL(); ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); if (WARN_ON(!ondev)) return; WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); /* Try to move the objects to another netdev of the device */ if (offdev) { list_del(&ondev->offdev_netdevs); altdev = list_first_entry_or_null(&offdev->netdevs, struct bpf_offload_netdev, offdev_netdevs); } if (altdev) { list_for_each_entry(offload, &ondev->progs, offloads) offload->netdev = altdev->netdev; list_splice_init(&ondev->progs, &altdev->progs); list_for_each_entry(offmap, &ondev->maps, offloads) offmap->netdev = altdev->netdev; list_splice_init(&ondev->maps, &altdev->maps); } else { list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) __bpf_prog_offload_destroy(offload->prog); list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) __bpf_map_offload_destroy(offmap); } WARN_ON(!list_empty(&ondev->progs)); WARN_ON(!list_empty(&ondev->maps)); kfree(ondev); } static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev) { struct bpf_offload_netdev *ondev; struct bpf_prog_offload *offload; int err; offload = kzalloc(sizeof(*offload), GFP_USER); if (!offload) return -ENOMEM; offload->prog = prog; offload->netdev = netdev; ondev = bpf_offload_find_netdev(offload->netdev); /* When program is offloaded require presence of "true" * bpf_offload_netdev, avoid the one created for !ondev case below. */ if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) { err = -EINVAL; goto err_free; } if (!ondev) { /* When only binding to the device, explicitly * create an entry in the hashtable. */ err = __bpf_offload_dev_netdev_register(NULL, offload->netdev); if (err) goto err_free; ondev = bpf_offload_find_netdev(offload->netdev); } offload->offdev = ondev->offdev; prog->aux->offload = offload; list_add_tail(&offload->offloads, &ondev->progs); return 0; err_free: kfree(offload); return err; } int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) { struct net_device *netdev; int err; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) return -EINVAL; if (attr->prog_flags & ~(BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_XDP_HAS_FRAGS)) return -EINVAL; /* Frags are allowed only if program is dev-bound-only, but not * if it is requesting bpf offload. */ if (attr->prog_flags & BPF_F_XDP_HAS_FRAGS && !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)) return -EINVAL; if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS && attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY) return -EINVAL; netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); if (!netdev) return -EINVAL; err = bpf_dev_offload_check(netdev); if (err) goto out; prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY); down_write(&bpf_devs_lock); err = __bpf_prog_dev_bound_init(prog, netdev); up_write(&bpf_devs_lock); out: dev_put(netdev); return err; } int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog) { int err; if (!bpf_prog_is_dev_bound(old_prog->aux)) return 0; if (bpf_prog_is_offloaded(old_prog->aux)) return -EINVAL; new_prog->aux->dev_bound = old_prog->aux->dev_bound; new_prog->aux->offload_requested = old_prog->aux->offload_requested; down_write(&bpf_devs_lock); if (!old_prog->aux->offload) { err = -EINVAL; goto out; } err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev); out: up_write(&bpf_devs_lock); return err; } int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); offload = prog->aux->offload; if (offload) { ret = offload->offdev->ops->prepare(prog); offload->dev_state = !ret; } up_read(&bpf_devs_lock); return ret; } int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) ret = offload->offdev->ops->insn_hook(env, insn_idx, prev_insn_idx); up_read(&bpf_devs_lock); return ret; } int bpf_prog_offload_finalize(struct bpf_verifier_env *env) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) { if (offload->offdev->ops->finalize) ret = offload->offdev->ops->finalize(env); else ret = 0; } up_read(&bpf_devs_lock); return ret; } void bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, struct bpf_insn *insn) { const struct bpf_prog_offload_ops *ops; struct bpf_prog_offload *offload; int ret = -EOPNOTSUPP; down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) { ops = offload->offdev->ops; if (!offload->opt_failed && ops->replace_insn) ret = ops->replace_insn(env, off, insn); offload->opt_failed |= ret; } up_read(&bpf_devs_lock); } void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) { struct bpf_prog_offload *offload; int ret = -EOPNOTSUPP; down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) { if (!offload->opt_failed && offload->offdev->ops->remove_insns) ret = offload->offdev->ops->remove_insns(env, off, cnt); offload->opt_failed |= ret; } up_read(&bpf_devs_lock); } void bpf_prog_dev_bound_destroy(struct bpf_prog *prog) { struct bpf_offload_netdev *ondev; struct net_device *netdev; rtnl_lock(); down_write(&bpf_devs_lock); if (prog->aux->offload) { list_del_init(&prog->aux->offload->offloads); netdev = prog->aux->offload->netdev; __bpf_prog_offload_destroy(prog); ondev = bpf_offload_find_netdev(netdev); if (!ondev->offdev && list_empty(&ondev->progs)) __bpf_offload_dev_netdev_unregister(NULL, netdev); } up_write(&bpf_devs_lock); rtnl_unlock(); } static int bpf_prog_offload_translate(struct bpf_prog *prog) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); offload = prog->aux->offload; if (offload) ret = offload->offdev->ops->translate(prog); up_read(&bpf_devs_lock); return ret; } static unsigned int bpf_prog_warn_on_exec(const void *ctx, const struct bpf_insn *insn) { WARN(1, "attempt to execute device eBPF program on the host!"); return 0; } int bpf_prog_offload_compile(struct bpf_prog *prog) { prog->bpf_func = bpf_prog_warn_on_exec; return bpf_prog_offload_translate(prog); } struct ns_get_path_bpf_prog_args { struct bpf_prog *prog; struct bpf_prog_info *info; }; static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) { struct ns_get_path_bpf_prog_args *args = private_data; struct bpf_prog_aux *aux = args->prog->aux; struct ns_common *ns; struct net *net; rtnl_lock(); down_read(&bpf_devs_lock); if (aux->offload) { args->info->ifindex = aux->offload->netdev->ifindex; net = dev_net(aux->offload->netdev); get_net(net); ns = &net->ns; } else { args->info->ifindex = 0; ns = NULL; } up_read(&bpf_devs_lock); rtnl_unlock(); return ns; } int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog) { struct ns_get_path_bpf_prog_args args = { .prog = prog, .info = info, }; struct bpf_prog_aux *aux = prog->aux; struct inode *ns_inode; struct path ns_path; char __user *uinsns; int res; u32 ulen; res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); if (res) { if (!info->ifindex) return -ENODEV; return res; } down_read(&bpf_devs_lock); if (!aux->offload) { up_read(&bpf_devs_lock); return -ENODEV; } ulen = info->jited_prog_len; info->jited_prog_len = aux->offload->jited_len; if (info->jited_prog_len && ulen) { uinsns = u64_to_user_ptr(info->jited_prog_insns); ulen = min_t(u32, info->jited_prog_len, ulen); if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { up_read(&bpf_devs_lock); return -EFAULT; } } up_read(&bpf_devs_lock); ns_inode = ns_path.dentry->d_inode; info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); info->netns_ino = ns_inode->i_ino; path_put(&ns_path); return 0; } const struct bpf_prog_ops bpf_offload_prog_ops = { }; struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { struct net *net = current->nsproxy->net_ns; struct bpf_offload_netdev *ondev; struct bpf_offloaded_map *offmap; int err; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if (attr->map_type != BPF_MAP_TYPE_ARRAY && attr->map_type != BPF_MAP_TYPE_HASH) return ERR_PTR(-EINVAL); offmap = bpf_map_area_alloc(sizeof(*offmap), NUMA_NO_NODE); if (!offmap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&offmap->map, attr); rtnl_lock(); down_write(&bpf_devs_lock); offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); err = bpf_dev_offload_check(offmap->netdev); if (err) goto err_unlock; ondev = bpf_offload_find_netdev(offmap->netdev); if (!ondev) { err = -EINVAL; goto err_unlock; } err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); if (err) goto err_unlock; list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); rtnl_unlock(); return &offmap->map; err_unlock: up_write(&bpf_devs_lock); rtnl_unlock(); bpf_map_area_free(offmap); return ERR_PTR(err); } void bpf_map_offload_map_free(struct bpf_map *map) { struct bpf_offloaded_map *offmap = map_to_offmap(map); rtnl_lock(); down_write(&bpf_devs_lock); if (offmap->netdev) __bpf_map_offload_destroy(offmap); up_write(&bpf_devs_lock); rtnl_unlock(); bpf_map_area_free(offmap); } u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map) { /* The memory dynamically allocated in netdev dev_ops is not counted */ return sizeof(struct bpf_offloaded_map); } int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) { struct bpf_offloaded_map *offmap = map_to_offmap(map); int ret = -ENODEV; down_read(&bpf_devs_lock); if (offmap->netdev) ret = offmap->dev_ops->map_lookup_elem(offmap, key, value); up_read(&bpf_devs_lock); return ret; } int bpf_map_offload_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct bpf_offloaded_map *offmap = map_to_offmap(map); int ret = -ENODEV; if (unlikely(flags > BPF_EXIST)) return -EINVAL; down_read(&bpf_devs_lock); if (offmap->netdev) ret = offmap->dev_ops->map_update_elem(offmap, key, value, flags); up_read(&bpf_devs_lock); return ret; } int bpf_map_offload_delete_elem(struct bpf_map *map, void *key) { struct bpf_offloaded_map *offmap = map_to_offmap(map); int ret = -ENODEV; down_read(&bpf_devs_lock); if (offmap->netdev) ret = offmap->dev_ops->map_delete_elem(offmap, key); up_read(&bpf_devs_lock); return ret; } int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_offloaded_map *offmap = map_to_offmap(map); int ret = -ENODEV; down_read(&bpf_devs_lock); if (offmap->netdev) ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key); up_read(&bpf_devs_lock); return ret; } struct ns_get_path_bpf_map_args { struct bpf_offloaded_map *offmap; struct bpf_map_info *info; }; static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) { struct ns_get_path_bpf_map_args *args = private_data; struct ns_common *ns; struct net *net; rtnl_lock(); down_read(&bpf_devs_lock); if (args->offmap->netdev) { args->info->ifindex = args->offmap->netdev->ifindex; net = dev_net(args->offmap->netdev); get_net(net); ns = &net->ns; } else { args->info->ifindex = 0; ns = NULL; } up_read(&bpf_devs_lock); rtnl_unlock(); return ns; } int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) { struct ns_get_path_bpf_map_args args = { .offmap = map_to_offmap(map), .info = info, }; struct inode *ns_inode; struct path ns_path; int res; res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); if (res) { if (!info->ifindex) return -ENODEV; return res; } ns_inode = ns_path.dentry->d_inode; info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); info->netns_ino = ns_inode->i_ino; path_put(&ns_path); return 0; } static bool __bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) { struct bpf_offload_netdev *ondev1, *ondev2; struct bpf_prog_offload *offload; if (!bpf_prog_is_dev_bound(prog->aux)) return false; offload = prog->aux->offload; if (!offload) return false; if (offload->netdev == netdev) return true; ondev1 = bpf_offload_find_netdev(offload->netdev); ondev2 = bpf_offload_find_netdev(netdev); return ondev1 && ondev2 && ondev1->offdev == ondev2->offdev; } bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) { bool ret; down_read(&bpf_devs_lock); ret = __bpf_offload_dev_match(prog, netdev); up_read(&bpf_devs_lock); return ret; } EXPORT_SYMBOL_GPL(bpf_offload_dev_match); bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs) { bool ret; if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux)) return false; down_read(&bpf_devs_lock); ret = lhs->aux->offload && rhs->aux->offload && lhs->aux->offload->netdev && lhs->aux->offload->netdev == rhs->aux->offload->netdev; up_read(&bpf_devs_lock); return ret; } bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; bool ret; if (!bpf_map_is_offloaded(map)) return bpf_map_offload_neutral(map); offmap = map_to_offmap(map); down_read(&bpf_devs_lock); ret = __bpf_offload_dev_match(prog, offmap->netdev); up_read(&bpf_devs_lock); return ret; } int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev) { int err; down_write(&bpf_devs_lock); err = __bpf_offload_dev_netdev_register(offdev, netdev); up_write(&bpf_devs_lock); return err; } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev) { down_write(&bpf_devs_lock); __bpf_offload_dev_netdev_unregister(offdev, netdev); up_write(&bpf_devs_lock); } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); struct bpf_offload_dev * bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) { struct bpf_offload_dev *offdev; offdev = kzalloc(sizeof(*offdev), GFP_KERNEL); if (!offdev) return ERR_PTR(-ENOMEM); offdev->ops = ops; offdev->priv = priv; INIT_LIST_HEAD(&offdev->netdevs); return offdev; } EXPORT_SYMBOL_GPL(bpf_offload_dev_create); void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) { WARN_ON(!list_empty(&offdev->netdevs)); kfree(offdev); } EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev) { return offdev->priv; } EXPORT_SYMBOL_GPL(bpf_offload_dev_priv); void bpf_dev_bound_netdev_unregister(struct net_device *dev) { struct bpf_offload_netdev *ondev; ASSERT_RTNL(); down_write(&bpf_devs_lock); ondev = bpf_offload_find_netdev(dev); if (ondev && !ondev->offdev) __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev); up_write(&bpf_devs_lock); } int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, struct bpf_prog_aux *prog_aux) { if (!bpf_prog_is_dev_bound(prog_aux)) { bpf_log(log, "metadata kfuncs require device-bound program\n"); return -EINVAL; } if (bpf_prog_is_offloaded(prog_aux)) { bpf_log(log, "metadata kfuncs can't be offloaded\n"); return -EINVAL; } return 0; } void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) { const struct xdp_metadata_ops *ops; void *p = NULL; /* We don't hold bpf_devs_lock while resolving several * kfuncs and can race with the unregister_netdevice(). * We rely on bpf_dev_bound_match() check at attach * to render this program unusable. */ down_read(&bpf_devs_lock); if (!prog->aux->offload) goto out; ops = prog->aux->offload->netdev->xdp_metadata_ops; if (!ops) goto out; #define XDP_METADATA_KFUNC(name, _, __, xmo) \ if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo; XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC out: up_read(&bpf_devs_lock); return p; } static int __init bpf_offload_init(void) { return rhashtable_init(&offdevs, &offdevs_params); } core_initcall(bpf_offload_init);
21 21 19 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 // SPDX-License-Identifier: GPL-2.0-or-later /* * xt_connmark - Netfilter module to operate on connection marks * * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com> * by Henrik Nordstrom <hno@marasystems.com> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * Jan Engelhardt <jengelh@medozas.de> */ #include <linux/module.h> #include <linux/skbuff.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connmark.h> MODULE_AUTHOR("Henrik Nordstrom <hno@marasystems.com>"); MODULE_DESCRIPTION("Xtables: connection mark operations"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_CONNMARK"); MODULE_ALIAS("ip6t_CONNMARK"); MODULE_ALIAS("ipt_connmark"); MODULE_ALIAS("ip6t_connmark"); static unsigned int connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) { enum ip_conntrack_info ctinfo; u_int32_t new_targetmark; struct nf_conn *ct; u_int32_t newmark; u_int32_t oldmark; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return XT_CONTINUE; switch (info->mode) { case XT_CONNMARK_SET: oldmark = READ_ONCE(ct->mark); newmark = (oldmark & ~info->ctmask) ^ info->ctmark; if (info->shift_dir == D_SHIFT_RIGHT) newmark >>= info->shift_bits; else newmark <<= info->shift_bits; if (READ_ONCE(ct->mark) != newmark) { WRITE_ONCE(ct->mark, newmark); nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_SAVE: new_targetmark = (skb->mark & info->nfmask); if (info->shift_dir == D_SHIFT_RIGHT) new_targetmark >>= info->shift_bits; else new_targetmark <<= info->shift_bits; newmark = (READ_ONCE(ct->mark) & ~info->ctmask) ^ new_targetmark; if (READ_ONCE(ct->mark) != newmark) { WRITE_ONCE(ct->mark, newmark); nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_RESTORE: new_targetmark = (READ_ONCE(ct->mark) & info->ctmask); if (info->shift_dir == D_SHIFT_RIGHT) new_targetmark >>= info->shift_bits; else new_targetmark <<= info->shift_bits; newmark = (skb->mark & ~info->nfmask) ^ new_targetmark; skb->mark = newmark; break; } return XT_CONTINUE; } static unsigned int connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_connmark_tginfo1 *info = par->targinfo; const struct xt_connmark_tginfo2 info2 = { .ctmark = info->ctmark, .ctmask = info->ctmask, .nfmask = info->nfmask, .mode = info->mode, }; return connmark_tg_shift(skb, &info2); } static unsigned int connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_connmark_tginfo2 *info = par->targinfo; return connmark_tg_shift(skb, info); } static int connmark_tg_check(const struct xt_tgchk_param *par) { int ret; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } static void connmark_tg_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static bool connmark_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_connmark_mtinfo1 *info = par->matchinfo; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return false; return ((READ_ONCE(ct->mark) & info->mask) == info->mark) ^ info->invert; } static int connmark_mt_check(const struct xt_mtchk_param *par) { int ret; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } static void connmark_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 1, .family = NFPROTO_IPV4, .checkentry = connmark_tg_check, .target = connmark_tg, .targetsize = sizeof(struct xt_connmark_tginfo1), .destroy = connmark_tg_destroy, .me = THIS_MODULE, }, { .name = "CONNMARK", .revision = 2, .family = NFPROTO_IPV4, .checkentry = connmark_tg_check, .target = connmark_tg_v2, .targetsize = sizeof(struct xt_connmark_tginfo2), .destroy = connmark_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "CONNMARK", .revision = 1, .family = NFPROTO_IPV6, .checkentry = connmark_tg_check, .target = connmark_tg, .targetsize = sizeof(struct xt_connmark_tginfo1), .destroy = connmark_tg_destroy, .me = THIS_MODULE, }, { .name = "CONNMARK", .revision = 2, .family = NFPROTO_IPV6, .checkentry = connmark_tg_check, .target = connmark_tg_v2, .targetsize = sizeof(struct xt_connmark_tginfo2), .destroy = connmark_tg_destroy, .me = THIS_MODULE, }, #endif }; static struct xt_match connmark_mt_reg __read_mostly = { .name = "connmark", .revision = 1, .family = NFPROTO_UNSPEC, .checkentry = connmark_mt_check, .match = connmark_mt, .matchsize = sizeof(struct xt_connmark_mtinfo1), .destroy = connmark_mt_destroy, .me = THIS_MODULE, }; static int __init connmark_mt_init(void) { int ret; ret = xt_register_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg)); if (ret < 0) return ret; ret = xt_register_match(&connmark_mt_reg); if (ret < 0) { xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg)); return ret; } return 0; } static void __exit connmark_mt_exit(void) { xt_unregister_match(&connmark_mt_reg); xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg)); } module_init(connmark_mt_init); module_exit(connmark_mt_exit);
7 5 2 7 5 5 2 1 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 // SPDX-License-Identifier: GPL-2.0+ /* * REINER SCT cyberJack pinpad/e-com USB Chipcard Reader Driver * * Copyright (C) 2001 REINER SCT * Author: Matthias Bruestle * * Contact: support@reiner-sct.com (see MAINTAINERS) * * This program is largely derived from work by the linux-usb group * and associated source files. Please see the usb/serial files for * individual credits and copyrights. * * Thanks to Greg Kroah-Hartman (greg@kroah.com) for his help and * patience. * * In case of problems, please write to the contact e-mail address * mentioned above. * * Please note that later models of the cyberjack reader family are * supported by a libusb-based userspace device driver. * * Homepage: http://www.reiner-sct.de/support/treiber_cyberjack.php#linux */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/serial.h> #define CYBERJACK_LOCAL_BUF_SIZE 32 #define DRIVER_AUTHOR "Matthias Bruestle" #define DRIVER_DESC "REINER SCT cyberJack pinpad/e-com USB Chipcard Reader Driver" #define CYBERJACK_VENDOR_ID 0x0C4B #define CYBERJACK_PRODUCT_ID 0x0100 /* Function prototypes */ static int cyberjack_port_probe(struct usb_serial_port *port); static void cyberjack_port_remove(struct usb_serial_port *port); static int cyberjack_open(struct tty_struct *tty, struct usb_serial_port *port); static void cyberjack_close(struct usb_serial_port *port); static int cyberjack_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); static unsigned int cyberjack_write_room(struct tty_struct *tty); static void cyberjack_read_int_callback(struct urb *urb); static void cyberjack_read_bulk_callback(struct urb *urb); static void cyberjack_write_bulk_callback(struct urb *urb); static const struct usb_device_id id_table[] = { { USB_DEVICE(CYBERJACK_VENDOR_ID, CYBERJACK_PRODUCT_ID) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table); static struct usb_serial_driver cyberjack_device = { .driver = { .name = "cyberjack", }, .description = "Reiner SCT Cyberjack USB card reader", .id_table = id_table, .num_ports = 1, .num_bulk_out = 1, .port_probe = cyberjack_port_probe, .port_remove = cyberjack_port_remove, .open = cyberjack_open, .close = cyberjack_close, .write = cyberjack_write, .write_room = cyberjack_write_room, .read_int_callback = cyberjack_read_int_callback, .read_bulk_callback = cyberjack_read_bulk_callback, .write_bulk_callback = cyberjack_write_bulk_callback, }; static struct usb_serial_driver * const serial_drivers[] = { &cyberjack_device, NULL }; struct cyberjack_private { spinlock_t lock; /* Lock for SMP */ short rdtodo; /* Bytes still to read */ unsigned char wrbuf[5*64]; /* Buffer for collecting data to write */ short wrfilled; /* Overall data size we already got */ short wrsent; /* Data already sent */ }; static int cyberjack_port_probe(struct usb_serial_port *port) { struct cyberjack_private *priv; int result; priv = kmalloc(sizeof(struct cyberjack_private), GFP_KERNEL); if (!priv) return -ENOMEM; spin_lock_init(&priv->lock); priv->rdtodo = 0; priv->wrfilled = 0; priv->wrsent = 0; usb_set_serial_port_data(port, priv); result = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); if (result) dev_err(&port->dev, "usb_submit_urb(read int) failed\n"); return 0; } static void cyberjack_port_remove(struct usb_serial_port *port) { struct cyberjack_private *priv; usb_kill_urb(port->interrupt_in_urb); priv = usb_get_serial_port_data(port); kfree(priv); } static int cyberjack_open(struct tty_struct *tty, struct usb_serial_port *port) { struct cyberjack_private *priv; unsigned long flags; dev_dbg(&port->dev, "%s - usb_clear_halt\n", __func__); usb_clear_halt(port->serial->dev, port->write_urb->pipe); priv = usb_get_serial_port_data(port); spin_lock_irqsave(&priv->lock, flags); priv->rdtodo = 0; priv->wrfilled = 0; priv->wrsent = 0; spin_unlock_irqrestore(&priv->lock, flags); return 0; } static void cyberjack_close(struct usb_serial_port *port) { usb_kill_urb(port->write_urb); usb_kill_urb(port->read_urb); } static int cyberjack_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count) { struct device *dev = &port->dev; struct cyberjack_private *priv = usb_get_serial_port_data(port); unsigned long flags; int result; int wrexpected; if (count == 0) { dev_dbg(dev, "%s - write request of 0 bytes\n", __func__); return 0; } if (!test_and_clear_bit(0, &port->write_urbs_free)) { dev_dbg(dev, "%s - already writing\n", __func__); return 0; } spin_lock_irqsave(&priv->lock, flags); if (count+priv->wrfilled > sizeof(priv->wrbuf)) { /* To much data for buffer. Reset buffer. */ priv->wrfilled = 0; spin_unlock_irqrestore(&priv->lock, flags); set_bit(0, &port->write_urbs_free); return 0; } /* Copy data */ memcpy(priv->wrbuf + priv->wrfilled, buf, count); usb_serial_debug_data(dev, __func__, count, priv->wrbuf + priv->wrfilled); priv->wrfilled += count; if (priv->wrfilled >= 3) { wrexpected = ((int)priv->wrbuf[2]<<8)+priv->wrbuf[1]+3; dev_dbg(dev, "%s - expected data: %d\n", __func__, wrexpected); } else wrexpected = sizeof(priv->wrbuf); if (priv->wrfilled >= wrexpected) { /* We have enough data to begin transmission */ int length; dev_dbg(dev, "%s - transmitting data (frame 1)\n", __func__); length = (wrexpected > port->bulk_out_size) ? port->bulk_out_size : wrexpected; memcpy(port->write_urb->transfer_buffer, priv->wrbuf, length); priv->wrsent = length; /* set up our urb */ port->write_urb->transfer_buffer_length = length; /* send the data out the bulk port */ result = usb_submit_urb(port->write_urb, GFP_ATOMIC); if (result) { dev_err(&port->dev, "%s - failed submitting write urb, error %d\n", __func__, result); /* Throw away data. No better idea what to do with it. */ priv->wrfilled = 0; priv->wrsent = 0; spin_unlock_irqrestore(&priv->lock, flags); set_bit(0, &port->write_urbs_free); return 0; } dev_dbg(dev, "%s - priv->wrsent=%d\n", __func__, priv->wrsent); dev_dbg(dev, "%s - priv->wrfilled=%d\n", __func__, priv->wrfilled); if (priv->wrsent >= priv->wrfilled) { dev_dbg(dev, "%s - buffer cleaned\n", __func__); memset(priv->wrbuf, 0, sizeof(priv->wrbuf)); priv->wrfilled = 0; priv->wrsent = 0; } } spin_unlock_irqrestore(&priv->lock, flags); return count; } static unsigned int cyberjack_write_room(struct tty_struct *tty) { /* FIXME: .... */ return CYBERJACK_LOCAL_BUF_SIZE; } static void cyberjack_read_int_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; struct cyberjack_private *priv = usb_get_serial_port_data(port); struct device *dev = &port->dev; unsigned char *data = urb->transfer_buffer; int status = urb->status; unsigned long flags; int result; /* the urb might have been killed. */ if (status) return; usb_serial_debug_data(dev, __func__, urb->actual_length, data); /* React only to interrupts signaling a bulk_in transfer */ if (urb->actual_length == 4 && data[0] == 0x01) { short old_rdtodo; /* This is a announcement of coming bulk_ins. */ unsigned short size = ((unsigned short)data[3]<<8)+data[2]+3; spin_lock_irqsave(&priv->lock, flags); old_rdtodo = priv->rdtodo; if (old_rdtodo > SHRT_MAX - size) { dev_dbg(dev, "Too many bulk_in urbs to do.\n"); spin_unlock_irqrestore(&priv->lock, flags); goto resubmit; } /* "+=" is probably more fault tolerant than "=" */ priv->rdtodo += size; dev_dbg(dev, "%s - rdtodo: %d\n", __func__, priv->rdtodo); spin_unlock_irqrestore(&priv->lock, flags); if (!old_rdtodo) { result = usb_submit_urb(port->read_urb, GFP_ATOMIC); if (result) dev_err(dev, "%s - failed resubmitting read urb, error %d\n", __func__, result); dev_dbg(dev, "%s - usb_submit_urb(read urb)\n", __func__); } } resubmit: result = usb_submit_urb(port->interrupt_in_urb, GFP_ATOMIC); if (result) dev_err(&port->dev, "usb_submit_urb(read int) failed\n"); dev_dbg(dev, "%s - usb_submit_urb(int urb)\n", __func__); } static void cyberjack_read_bulk_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; struct cyberjack_private *priv = usb_get_serial_port_data(port); struct device *dev = &port->dev; unsigned char *data = urb->transfer_buffer; unsigned long flags; short todo; int result; int status = urb->status; usb_serial_debug_data(dev, __func__, urb->actual_length, data); if (status) { dev_dbg(dev, "%s - nonzero read bulk status received: %d\n", __func__, status); return; } if (urb->actual_length) { tty_insert_flip_string(&port->port, data, urb->actual_length); tty_flip_buffer_push(&port->port); } spin_lock_irqsave(&priv->lock, flags); /* Reduce urbs to do by one. */ priv->rdtodo -= urb->actual_length; /* Just to be sure */ if (priv->rdtodo < 0) priv->rdtodo = 0; todo = priv->rdtodo; spin_unlock_irqrestore(&priv->lock, flags); dev_dbg(dev, "%s - rdtodo: %d\n", __func__, todo); /* Continue to read if we have still urbs to do. */ if (todo /* || (urb->actual_length==port->bulk_in_endpointAddress)*/) { result = usb_submit_urb(port->read_urb, GFP_ATOMIC); if (result) dev_err(dev, "%s - failed resubmitting read urb, error %d\n", __func__, result); dev_dbg(dev, "%s - usb_submit_urb(read urb)\n", __func__); } } static void cyberjack_write_bulk_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; struct cyberjack_private *priv = usb_get_serial_port_data(port); struct device *dev = &port->dev; int status = urb->status; unsigned long flags; bool resubmitted = false; if (status) { dev_dbg(dev, "%s - nonzero write bulk status received: %d\n", __func__, status); set_bit(0, &port->write_urbs_free); return; } spin_lock_irqsave(&priv->lock, flags); /* only do something if we have more data to send */ if (priv->wrfilled) { int length, blksize, result; dev_dbg(dev, "%s - transmitting data (frame n)\n", __func__); length = ((priv->wrfilled - priv->wrsent) > port->bulk_out_size) ? port->bulk_out_size : (priv->wrfilled - priv->wrsent); memcpy(port->write_urb->transfer_buffer, priv->wrbuf + priv->wrsent, length); priv->wrsent += length; /* set up our urb */ port->write_urb->transfer_buffer_length = length; /* send the data out the bulk port */ result = usb_submit_urb(port->write_urb, GFP_ATOMIC); if (result) { dev_err(dev, "%s - failed submitting write urb, error %d\n", __func__, result); /* Throw away data. No better idea what to do with it. */ priv->wrfilled = 0; priv->wrsent = 0; goto exit; } resubmitted = true; dev_dbg(dev, "%s - priv->wrsent=%d\n", __func__, priv->wrsent); dev_dbg(dev, "%s - priv->wrfilled=%d\n", __func__, priv->wrfilled); blksize = ((int)priv->wrbuf[2]<<8)+priv->wrbuf[1]+3; if (priv->wrsent >= priv->wrfilled || priv->wrsent >= blksize) { dev_dbg(dev, "%s - buffer cleaned\n", __func__); memset(priv->wrbuf, 0, sizeof(priv->wrbuf)); priv->wrfilled = 0; priv->wrsent = 0; } } exit: spin_unlock_irqrestore(&priv->lock, flags); if (!resubmitted) set_bit(0, &port->write_urbs_free); usb_serial_port_softint(port); } module_usb_serial_driver(serial_drivers, id_table); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 // SPDX-License-Identifier: GPL-2.0 #include <linux/blkdev.h> #include <linux/iversion.h> #include "ctree.h" #include "fs.h" #include "messages.h" #include "compression.h" #include "delalloc-space.h" #include "disk-io.h" #include "reflink.h" #include "transaction.h" #include "subpage.h" #include "accessors.h" #include "file-item.h" #include "file.h" #include "super.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, const u64 destoff, const u64 olen, int no_time_update) { int ret; inode_inc_iversion(inode); if (!no_time_update) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. */ if (endoff > destoff + olen) endoff = destoff + olen; if (endoff > inode->i_size) { i_size_write(inode, endoff); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; } ret = btrfs_end_transaction(trans); out: return ret; } static int copy_inline_to_page(struct btrfs_inode *inode, const u64 file_offset, char *inline_data, const u64 size, const u64 datal, const u8 comp_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 block_size = fs_info->sectorsize; const u64 range_end = file_offset + block_size - 1; const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; struct folio *folio = NULL; struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; ASSERT(IS_ALIGNED(file_offset, block_size)); /* * We have flushed and locked the ranges of the source and destination * inodes, we also have locked the inodes, so we are safe to do a * reservation here. Also we must not do the reservation while holding * a transaction open, otherwise we would deadlock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, block_size); if (ret) goto out; folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, btrfs_alloc_write_mask(mapping)); if (IS_ERR(folio)) { ret = -ENOMEM; goto out_unlock; } ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL); ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; /* * After dirtying the page our caller will need to start a transaction, * and if we are low on metadata free space, that can cause flushing of * delalloc for all inodes in order to get metadata space released. * However we are holding the range locked for the whole duration of * the clone/dedupe operation, so we may deadlock if that happens and no * other task releases enough space. So mark this inode as not being * possible to flush to avoid such deadlock. We will clear that flag * when we finish cloning all extents, since a transaction is started * after finding each extent to clone. */ set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start, datal); } else { ret = btrfs_decompress(comp_type, data_start, folio, offset_in_folio(folio, file_offset), inline_size, datal); if (ret) goto out_unlock; flush_dcache_folio(folio); } /* * If our inline data is smaller then the block/page size, then the * remaining of the block/page is equivalent to zeroes. We had something * like the following done: * * $ xfs_io -f -c "pwrite -S 0xab 0 500" file * $ sync # (or fsync) * $ xfs_io -c "falloc 0 4K" file * $ xfs_io -c "pwrite -S 0xcd 4K 4K" * * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) folio_zero_range(folio, datal, block_size - datal); btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size); btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size); btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size); out_unlock: if (!IS_ERR(folio)) { folio_unlock(folio); folio_put(folio); } if (ret) btrfs_delalloc_release_space(inode, data_reserved, file_offset, block_size, true); btrfs_delalloc_release_extents(inode, block_size); out: extent_changeset_free(data_reserved); return ret; } /* * Deal with cloning of inline extents. We try to copy the inline extent from * the source inode to destination inode when possible. When not possible we * copy the inline extent's data into the respective page of the inode. */ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_path *path, struct btrfs_key *new_key, const u64 drop_start, const u64 datal, const u64 size, const u8 comp_type, char *inline_data, struct btrfs_trans_handle **trans_out) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); struct btrfs_root *root = BTRFS_I(dst)->root; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; struct btrfs_drop_extents_args drop_args = { 0 }; int ret; struct btrfs_key key; if (new_key->offset > 0) { ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, inline_data, size, datal, comp_type); goto out; } key.objectid = btrfs_ino(BTRFS_I(dst)); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { return ret; } else if (ret > 0) { if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; else if (ret > 0) goto copy_inline_extent; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == btrfs_ino(BTRFS_I(dst)) && key.type == BTRFS_EXTENT_DATA_KEY) { /* * There's an implicit hole at file offset 0, copy the * inline extent's data to the page. */ ASSERT(key.offset > 0); goto copy_to_page; } } else if (i_size_read(dst) <= datal) { struct btrfs_file_extent_item *ei; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); /* * If it's an inline extent replace it with the source inline * extent, otherwise copy the source inline extent data into * the respective page at the destination inode. */ if (btrfs_file_extent_type(path->nodes[0], ei) == BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; goto copy_to_page; } copy_inline_extent: /* * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. */ if (i_size_read(dst) > datal) { /* * At the destination offset 0 we have either a hole, a regular * extent or an inline extent larger then the one we want to * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. */ goto copy_to_page; } /* * Release path before starting a new transaction so we don't hold locks * that would confuse lockdep. */ btrfs_release_path(path); /* * If we end up here it means were copy the inline extent into a leaf * of the destination inode. We know we will drop or adjust at most one * extent item in the destination root. * * 1 unit - adjusting old extent (we may have to split it) * 1 unit - add new extent * 1 unit - inode update */ trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } drop_args.path = path; drop_args.start = drop_start; drop_args.end = aligned_end; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); if (ret) goto out; ret = btrfs_insert_empty_item(trans, root, path, new_key, size); if (ret) goto out; write_extent_buffer(path->nodes[0], inline_data, btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), size); btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); btrfs_set_inode_full_sync(BTRFS_I(dst)); ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); out: if (!ret && !trans) { /* * No transaction here means we copied the inline extent into a * page of the destination inode. * * 1 unit to update inode item */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; } } if (ret && trans) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); } if (!ret) *trans_out = trans; return ret; copy_to_page: /* * Release our path because we don't need it anymore and also because * copy_inline_to_page() needs to reserve data and metadata, which may * need to flush delalloc when we are low on available space and * therefore cause a deadlock if writeback of an inline extent needs to * write to the same leaf or an ordered extent completion needs to write * to the same leaf. */ btrfs_release_path(path); ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, inline_data, size, datal, comp_type); goto out; } /* * Clone a range from inode file to another. * * @src: Inode to clone from * @inode: Inode to clone to * @off: Offset within source to start clone from * @olen: Original length, passed by user, of range to clone * @olen_aligned: Block-aligned value of olen * @destoff: Offset within @inode to start clone * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, const u64 destoff, int no_time_update) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path = NULL; struct extent_buffer *leaf; struct btrfs_trans_handle *trans; char *buf = NULL; struct btrfs_key key; u32 nritems; int slot; int ret; const u64 len = olen_aligned; u64 last_dest_end = destoff; u64 prev_extent_end = off; ret = -ENOMEM; buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); if (!buf) return ret; path = btrfs_alloc_path(); if (!path) { kvfree(buf); return ret; } path->reada = READA_FORWARD; /* Clone data */ key.objectid = btrfs_ino(BTRFS_I(src)); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = off; while (1) { struct btrfs_file_extent_item *extent; u64 extent_gen; int type; u32 size; struct btrfs_key new_key; u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; u64 drop_start; /* Note the key will change type as we walk through the tree */ ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 0, 0); if (ret < 0) goto out; /* * First search, if no extent item that starts at offset off was * found but the previous item is an extent item, it's possible * it might overlap our target range, therefore process it. */ if (key.offset == off && ret > 0 && path->slots[0] > 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); if (key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } nritems = btrfs_header_nritems(path->nodes[0]); process_slot: if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) goto out; if (ret > 0) break; nritems = btrfs_header_nritems(path->nodes[0]); } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.type > BTRFS_EXTENT_DATA_KEY || key.objectid != btrfs_ino(BTRFS_I(src))) break; ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); extent_gen = btrfs_file_extent_generation(leaf, extent); comp = btrfs_file_extent_compression(leaf, extent); type = btrfs_file_extent_type(leaf, extent); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { disko = btrfs_file_extent_disk_bytenr(leaf, extent); diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); datao = btrfs_file_extent_offset(leaf, extent); datal = btrfs_file_extent_num_bytes(leaf, extent); } else if (type == BTRFS_FILE_EXTENT_INLINE) { /* Take upper bound, may be compressed */ datal = btrfs_file_extent_ram_bytes(leaf, extent); } /* * The first search might have left us at an extent item that * ends before our target range's start, can happen if we have * holes and NO_HOLES feature enabled. * * Subsequent searches may leave us on a file range we have * processed before - this happens due to a race with ordered * extent completion for a file range that is outside our source * range, but that range was part of a file extent item that * also covered a leading part of our source range. */ if (key.offset + datal <= prev_extent_end) { path->slots[0]++; goto process_slot; } else if (key.offset >= off + len) { break; } prev_extent_end = key.offset + datal; size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); btrfs_release_path(path); memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = btrfs_ino(BTRFS_I(inode)); if (off <= key.offset) new_key.offset = key.offset + destoff - off; else new_key.offset = destoff; /* * Deal with a hole that doesn't have an extent item that * represents it (NO_HOLES feature enabled). * This hole is either in the middle of the cloning range or at * the beginning (fully overlaps it or partially overlaps it). */ if (new_key.offset != last_dest_end) drop_start = last_dest_end; else drop_start = new_key.offset; if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { struct btrfs_replace_extent_info clone_info; /* * a | --- range to clone ---| b * | ------------- extent ------------- | */ /* Subtract range b */ if (key.offset + datal > off + len) datal = off + len - key.offset; /* Subtract range a */ if (off > key.offset) { datao += off - key.offset; datal -= off - key.offset; } clone_info.disk_offset = disko; clone_info.disk_len = diskl; clone_info.data_offset = datao; clone_info.data_len = datal; clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; clone_info.update_times = !no_time_update; ret = btrfs_replace_file_extents(BTRFS_I(inode), path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); if (ret) goto out; } else { ASSERT(type == BTRFS_FILE_EXTENT_INLINE); /* * Inline extents always have to start at file offset 0 * and can never be bigger then the sector size. We can * never clone only parts of an inline extent, since all * reflink operations must start at a sector size aligned * offset, and the length must be aligned too or end at * the i_size (which implies the whole inlined data). */ ASSERT(key.offset == 0); ASSERT(datal <= fs_info->sectorsize); if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) || WARN_ON(key.offset != 0) || WARN_ON(datal > fs_info->sectorsize)) { ret = -EUCLEAN; goto out; } ret = clone_copy_inline_extent(inode, path, &new_key, drop_start, datal, size, comp, buf, &trans); if (ret) goto out; } btrfs_release_path(path); /* * Whenever we share an extent we update the last_reflink_trans * of each inode to the current transaction. This is needed to * make sure fsync does not log multiple checksum items with * overlapping ranges (because some extent items might refer * only to sections of the original extent). For the destination * inode we do this regardless of the generation of the extents * or even if they are inline extents or explicit holes, to make * sure a full fsync does not skip them. For the source inode, * we only need to update last_reflink_trans in case it's a new * extent that is not a hole or an inline extent, to deal with * the checksums problem on fsync. */ if (extent_gen == trans->transid && disko > 0) BTRFS_I(src)->last_reflink_trans = trans->transid; BTRFS_I(inode)->last_reflink_trans = trans->transid; last_dest_end = ALIGN(new_key.offset + datal, fs_info->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, destoff, olen, no_time_update); if (ret) goto out; if (new_key.offset + datal >= destoff + len) break; btrfs_release_path(path); key.offset = prev_extent_end; if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); } ret = 0; if (last_dest_end < destoff + len) { /* * We have an implicit hole that fully or partially overlaps our * cloning range at its end. This means that we either have the * NO_HOLES feature enabled or the implicit hole happened due to * mixing buffered and direct IO writes against this file. */ btrfs_release_path(path); /* * When using NO_HOLES and we are cloning a range that covers * only a hole (no extents) into a range beyond the current * i_size, punching a hole in the target range will not create * an extent map defining a hole, because the range starts at or * beyond current i_size. If the file previously had an i_size * greater than the new i_size set by this clone operation, we * need to make sure the next fsync is a full fsync, so that it * detects and logs a hole covering a range from the current * i_size to the new i_size. If the clone range covers extents, * besides a hole, then we know the full sync flag was already * set by previous calls to btrfs_replace_file_extents() that * replaced file extent items. */ if (last_dest_end >= i_size_read(inode)) btrfs_set_inode_full_sync(BTRFS_I(inode)); ret = btrfs_replace_file_extents(BTRFS_I(inode), path, last_dest_end, destoff + len - 1, NULL, &trans); if (ret) goto out; ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen, no_time_update); } out: btrfs_free_path(path); kvfree(buf); clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); return ret; } static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) { if (inode1 < inode2) swap(inode1, inode2); down_write(&BTRFS_I(inode1)->i_mmap_lock); down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); } static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) { up_write(&BTRFS_I(inode1)->i_mmap_lock); up_write(&BTRFS_I(inode2)->i_mmap_lock); } static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { const u64 end = dst_loff + len - 1; struct extent_state *cached_state = NULL; struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; const u64 bs = fs_info->sectorsize; int ret; /* * Lock destination range to serialize with concurrent readahead(), and * we are safe from concurrency with relocation of source extents * because we have already locked the inode's i_mmap_lock in exclusive * mode. */ lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); return ret; } static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret = 0; u64 i, tail_len, chunk_count; struct btrfs_root *root_dst = BTRFS_I(dst)->root; spin_lock(&root_dst->root_item_lock); if (root_dst->send_in_progress) { btrfs_warn_rl(root_dst->fs_info, "cannot deduplicate to root %llu while send operations are using it (%d in progress)", btrfs_root_id(root_dst), root_dst->send_in_progress); spin_unlock(&root_dst->root_item_lock); return -EAGAIN; } root_dst->dedupe_in_progress++; spin_unlock(&root_dst->root_item_lock); tail_len = olen % BTRFS_MAX_DEDUPE_LEN; chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); for (i = 0; i < chunk_count; i++) { ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, dst, dst_loff); if (ret) goto out; loff += BTRFS_MAX_DEDUPE_LEN; dst_loff += BTRFS_MAX_DEDUPE_LEN; } if (tail_len > 0) ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); out: spin_lock(&root_dst->root_item_lock); root_dst->dedupe_in_progress--; spin_unlock(&root_dst->root_item_lock); return ret; } static noinline int btrfs_clone_files(struct file *file, struct file *file_src, u64 off, u64 olen, u64 destoff) { struct extent_state *cached_state = NULL; struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int ret; int wb_ret; u64 len = olen; u64 bs = fs_info->sectorsize; u64 end; /* * VFS's generic_remap_file_range_prep() protects us from cloning the * eof block into the middle of a file, which would result in corruption * if the file size is not blocksize aligned. So we don't need to check * for that case here. */ if (off + len == src->i_size) len = ALIGN(src->i_size, bs) - off; if (destoff > inode->i_size) { const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff); if (ret) return ret; /* * We may have truncated the last block if the inode's size is * not sector size aligned, so we need to wait for writeback to * complete before proceeding further, otherwise we can race * with cloning and attempt to increment a reference to an * extent that no longer exists (writeback completed right after * we found the previous extent covering eof and before we * attempted to increment its reference count). */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), wb_start, destoff - wb_start); if (ret) return ret; } /* * Lock destination range to serialize with concurrent readahead(), and * we are safe from concurrency with relocation of source extents * because we have already locked the inode's i_mmap_lock in exclusive * mode. */ end = destoff + len - 1; lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); /* * We may have copied an inline extent into a page of the destination * range, so wait for writeback to complete before truncating pages * from the page cache. This is a rare case. */ wb_ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len); ret = ret ? ret : wb_ret; /* * Truncate page cache pages so that future reads will see the cloned * data immediately and not the previous data. */ truncate_inode_pages_range(&inode->i_data, round_down(destoff, PAGE_SIZE), round_up(destoff + len, PAGE_SIZE) - 1); btrfs_btree_balance_dirty(fs_info); return ret; } static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize; u64 wb_len; int ret; if (!(remap_flags & REMAP_FILE_DEDUP)) { struct btrfs_root *root_out = BTRFS_I(inode_out)->root; if (btrfs_root_readonly(root_out)) return -EROFS; ASSERT(inode_in->i_sb == inode_out->i_sb); } /* Don't make the dst file partly checksummed */ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { return -EINVAL; } /* * Now that the inodes are locked, we need to start writeback ourselves * and can not rely on the writeback from the VFS's generic helper * generic_remap_file_range_prep() because: * * 1) For compression we must call filemap_fdatawrite_range() range * twice (btrfs_fdatawrite_range() does it for us), and the generic * helper only calls it once; * * 2) filemap_fdatawrite_range(), called by the generic helper only * waits for the writeback to complete, i.e. for IO to be done, and * not for the ordered extents to complete. We need to wait for them * to complete so that new file extent items are in the fs tree. */ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); else wb_len = ALIGN(*len, bs); /* * Workaround to make sure NOCOW buffered write reach disk as NOCOW. * * Btrfs' back references do not have a block level granularity, they * work at the whole extent level. * NOCOW buffered write without data space reserved may not be able * to fall back to CoW due to lack of data space, thus could cause * data loss. * * Here we take a shortcut by flushing the whole inode, so that all * nocow write should reach disk as nocow before we increase the * reference of the extent. We could do better by only flushing NOCOW * data, but that needs extra accounting. * * Also we don't need to check ASYNC_EXTENT, as async extent will be * CoWed anyway, not affecting nocow part. */ ret = filemap_flush(inode_in->i_mapping); if (ret < 0) return ret; ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) return ret; ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) return ret; return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags); } static bool file_sync_write(const struct file *file) { if (file->f_flags & (__O_SYNC | O_DSYNC)) return true; if (IS_SYNC(file_inode(file))) return true; return false; } loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { struct inode *src_inode = file_inode(src_file); struct inode *dst_inode = file_inode(dst_file); bool same_inode = dst_inode == src_inode; int ret; if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; if (same_inode) { btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { lock_two_nondirectories(src_inode, dst_inode); btrfs_double_mmap_lock(src_inode, dst_inode); } ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, &len, remap_flags); if (ret < 0 || len == 0) goto out_unlock; if (remap_flags & REMAP_FILE_DEDUP) ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); out_unlock: if (same_inode) { btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); unlock_two_nondirectories(src_inode, dst_inode); } /* * If either the source or the destination file was opened with O_SYNC, * O_DSYNC or has the S_SYNC attribute, fsync both the destination and * source files/ranges, so that after a successful return (0) followed * by a power failure results in the reflinked data to be readable from * both files/ranges. */ if (ret == 0 && len > 0 && (file_sync_write(src_file) || file_sync_write(dst_file))) { ret = btrfs_sync_file(src_file, off, off + len - 1, 0); if (ret == 0) ret = btrfs_sync_file(dst_file, destoff, destoff + len - 1, 0); } return ret < 0 ? ret : len; }
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 1 1 7 7 7 7 7 7 7 7 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. */ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/iomap.h> #include <linux/ktime.h> #include "gfs2.h" #include "incore.h" #include "bmap.h" #include "glock.h" #include "inode.h" #include "meta_io.h" #include "quota.h" #include "rgrp.h" #include "log.h" #include "super.h" #include "trans.h" #include "dir.h" #include "util.h" #include "aops.h" #include "trace_gfs2.h" /* This doesn't need to be that large as max 64 bit pointers in a 4k * block is 512, so __u16 is fine for that. It saves stack space to * keep it small. */ struct metapath { struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; __u16 mp_list[GFS2_MAX_META_HEIGHT]; int mp_fheight; /* find_metapath height */ int mp_aheight; /* actual height (lookup height) */ }; static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length); /** * gfs2_unstuffer_folio - unstuff a stuffed inode into a block cached by a folio * @ip: the inode * @dibh: the dinode buffer * @block: the block number that was allocated * @folio: The folio. * * Returns: errno */ static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh, u64 block, struct folio *folio) { struct inode *inode = &ip->i_inode; if (!folio_test_uptodate(folio)) { void *kaddr = kmap_local_folio(folio, 0); u64 dsize = i_size_read(inode); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); memset(kaddr + dsize, 0, folio_size(folio) - dsize); kunmap_local(kaddr); folio_mark_uptodate(folio); } if (gfs2_is_jdata(ip)) { struct buffer_head *bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, BIT(inode->i_blkbits), BIT(BH_Uptodate)); if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, block); set_buffer_uptodate(bh); gfs2_trans_add_data(ip->i_gl, bh); } else { folio_mark_dirty(folio); gfs2_ordered_add_inode(ip); } return 0; } static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio) { struct buffer_head *bh, *dibh; struct gfs2_dinode *di; u64 block = 0; int isdir = gfs2_is_dir(ip); int error; error = gfs2_meta_inode_buffer(ip, &dibh); if (error) return error; if (i_size_read(&ip->i_inode)) { /* Get a free block, fill it with the stuffed data, and write it out to disk */ unsigned int n = 1; error = gfs2_alloc_blocks(ip, &block, &n, 0); if (error) goto out_brelse; if (isdir) { gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1); error = gfs2_dir_get_new_buffer(ip, block, &bh); if (error) goto out_brelse; gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header), dibh, sizeof(struct gfs2_dinode)); brelse(bh); } else { error = gfs2_unstuffer_folio(ip, dibh, block, folio); if (error) goto out_brelse; } } /* Set up the pointer to the new block */ gfs2_trans_add_meta(ip->i_gl, dibh); di = (struct gfs2_dinode *)dibh->b_data; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); if (i_size_read(&ip->i_inode)) { *(__be64 *)(di + 1) = cpu_to_be64(block); gfs2_add_inode_blocks(&ip->i_inode, 1); di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); } ip->i_height = 1; di->di_height = cpu_to_be16(1); out_brelse: brelse(dibh); return error; } /** * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big * @ip: The GFS2 inode to unstuff * * This routine unstuffs a dinode and returns it to a "normal" state such * that the height can be grown in the traditional way. * * Returns: errno */ int gfs2_unstuff_dinode(struct gfs2_inode *ip) { struct inode *inode = &ip->i_inode; struct folio *folio; int error; down_write(&ip->i_rw_mutex); folio = filemap_grab_folio(inode->i_mapping, 0); error = PTR_ERR(folio); if (IS_ERR(folio)) goto out; error = __gfs2_unstuff_inode(ip, folio); folio_unlock(folio); folio_put(folio); out: up_write(&ip->i_rw_mutex); return error; } /** * find_metapath - Find path through the metadata tree * @sdp: The superblock * @block: The disk block to look up * @mp: The metapath to return the result in * @height: The pre-calculated height of the metadata tree * * This routine returns a struct metapath structure that defines a path * through the metadata of inode "ip" to get to block "block". * * Example: * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a * filesystem with a blocksize of 4096. * * find_metapath() would return a struct metapath structure set to: * mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165. * * That means that in order to get to the block containing the byte at * offset 101342453, we would load the indirect block pointed to by pointer * 0 in the dinode. We would then load the indirect block pointed to by * pointer 48 in that indirect block. We would then load the data block * pointed to by pointer 165 in that indirect block. * * ---------------------------------------- * | Dinode | | * | | 4| * | |0 1 2 3 4 5 9| * | | 6| * ---------------------------------------- * | * | * V * ---------------------------------------- * | Indirect Block | * | 5| * | 4 4 4 4 4 5 5 1| * |0 5 6 7 8 9 0 1 2| * ---------------------------------------- * | * | * V * ---------------------------------------- * | Indirect Block | * | 1 1 1 1 1 5| * | 6 6 6 6 6 1| * |0 3 4 5 6 7 2| * ---------------------------------------- * | * | * V * ---------------------------------------- * | Data block containing offset | * | 101342453 | * | | * | | * ---------------------------------------- * */ static void find_metapath(const struct gfs2_sbd *sdp, u64 block, struct metapath *mp, unsigned int height) { unsigned int i; mp->mp_fheight = height; for (i = height; i--;) mp->mp_list[i] = do_div(block, sdp->sd_inptrs); } static inline unsigned int metapath_branch_start(const struct metapath *mp) { if (mp->mp_list[0] == 0) return 2; return 1; } /** * metaptr1 - Return the first possible metadata pointer in a metapath buffer * @height: The metadata height (0 = dinode) * @mp: The metapath */ static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp) { struct buffer_head *bh = mp->mp_bh[height]; if (height == 0) return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode))); return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header))); } /** * metapointer - Return pointer to start of metadata in a buffer * @height: The metadata height (0 = dinode) * @mp: The metapath * * Return a pointer to the block number of the next height of the metadata * tree given a buffer containing the pointer to the current height of the * metadata tree. */ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) { __be64 *p = metaptr1(height, mp); return p + mp->mp_list[height]; } static inline const __be64 *metaend(unsigned int height, const struct metapath *mp) { const struct buffer_head *bh = mp->mp_bh[height]; return (const __be64 *)(bh->b_data + bh->b_size); } static void clone_metapath(struct metapath *clone, struct metapath *mp) { unsigned int hgt; *clone = *mp; for (hgt = 0; hgt < mp->mp_aheight; hgt++) get_bh(clone->mp_bh[hgt]); } static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end) { const __be64 *t; for (t = start; t < end; t++) { struct buffer_head *rabh; if (!*t) continue; rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE); if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META | REQ_PRIO, rabh); continue; } unlock_buffer(rabh); } brelse(rabh); } } static inline struct buffer_head * metapath_dibh(struct metapath *mp) { return mp->mp_bh[0]; } static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, unsigned int x, unsigned int h) { for (; x < h; x++) { __be64 *ptr = metapointer(x, mp); u64 dblock = be64_to_cpu(*ptr); int ret; if (!dblock) break; ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]); if (ret) return ret; } mp->mp_aheight = x + 1; return 0; } /** * lookup_metapath - Walk the metadata tree to a specific point * @ip: The inode * @mp: The metapath * * Assumes that the inode's buffer has already been looked up and * hooked onto mp->mp_bh[0] and that the metapath has been initialised * by find_metapath(). * * If this function encounters part of the tree which has not been * allocated, it returns the current height of the tree at the point * at which it found the unallocated block. Blocks which are found are * added to the mp->mp_bh[] list. * * Returns: error */ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) { return __fillup_metapath(ip, mp, 0, ip->i_height - 1); } /** * fillup_metapath - fill up buffers for the metadata path to a specific height * @ip: The inode * @mp: The metapath * @h: The height to which it should be mapped * * Similar to lookup_metapath, but does lookups for a range of heights * * Returns: error or the number of buffers filled */ static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h) { unsigned int x = 0; int ret; if (h) { /* find the first buffer we need to look up. */ for (x = h - 1; x > 0; x--) { if (mp->mp_bh[x]) break; } } ret = __fillup_metapath(ip, mp, x, h); if (ret) return ret; return mp->mp_aheight - x - 1; } static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp) { sector_t factor = 1, block = 0; int hgt; for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) { if (hgt < mp->mp_aheight) block += mp->mp_list[hgt] * factor; factor *= sdp->sd_inptrs; } return block; } static void release_metapath(struct metapath *mp) { int i; for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) { if (mp->mp_bh[i] == NULL) break; brelse(mp->mp_bh[i]); mp->mp_bh[i] = NULL; } } /** * gfs2_extent_length - Returns length of an extent of blocks * @bh: The metadata block * @ptr: Current position in @bh * @eob: Set to 1 if we hit "end of block" * * Returns: The length of the extent (minimum of one block) */ static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, int *eob) { const __be64 *end = (__be64 *)(bh->b_data + bh->b_size); const __be64 *first = ptr; u64 d = be64_to_cpu(*ptr); *eob = 0; do { ptr++; if (ptr >= end) break; d++; } while(be64_to_cpu(*ptr) == d); if (ptr >= end) *eob = 1; return ptr - first; } enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE }; /* * gfs2_metadata_walker - walk an indirect block * @mp: Metapath to indirect block * @ptrs: Number of pointers to look at * * When returning WALK_FOLLOW, the walker must update @mp to point at the right * indirect block to follow. */ typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp, unsigned int ptrs); /* * gfs2_walk_metadata - walk a tree of indirect blocks * @inode: The inode * @mp: Starting point of walk * @max_len: Maximum number of blocks to walk * @walker: Called during the walk * * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or * past the end of metadata, and a negative error code otherwise. */ static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp, u64 max_len, gfs2_metadata_walker walker) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); u64 factor = 1; unsigned int hgt; int ret; /* * The walk starts in the lowest allocated indirect block, which may be * before the position indicated by @mp. Adjust @max_len accordingly * to avoid a short walk. */ for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) { max_len += mp->mp_list[hgt] * factor; mp->mp_list[hgt] = 0; factor *= sdp->sd_inptrs; } for (;;) { u16 start = mp->mp_list[hgt]; enum walker_status status; unsigned int ptrs; u64 len; /* Walk indirect block. */ ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start; len = ptrs * factor; if (len > max_len) ptrs = DIV_ROUND_UP_ULL(max_len, factor); status = walker(mp, ptrs); switch (status) { case WALK_STOP: return 1; case WALK_FOLLOW: BUG_ON(mp->mp_aheight == mp->mp_fheight); ptrs = mp->mp_list[hgt] - start; len = ptrs * factor; break; case WALK_CONTINUE: break; } if (len >= max_len) break; max_len -= len; if (status == WALK_FOLLOW) goto fill_up_metapath; lower_metapath: /* Decrease height of metapath. */ brelse(mp->mp_bh[hgt]); mp->mp_bh[hgt] = NULL; mp->mp_list[hgt] = 0; if (!hgt) break; hgt--; factor *= sdp->sd_inptrs; /* Advance in metadata tree. */ (mp->mp_list[hgt])++; if (hgt) { if (mp->mp_list[hgt] >= sdp->sd_inptrs) goto lower_metapath; } else { if (mp->mp_list[hgt] >= sdp->sd_diptrs) break; } fill_up_metapath: /* Increase height of metapath. */ ret = fillup_metapath(ip, mp, ip->i_height - 1); if (ret < 0) return ret; hgt += ret; for (; ret; ret--) do_div(factor, sdp->sd_inptrs); mp->mp_aheight = hgt + 1; } return 0; } static enum walker_status gfs2_hole_walker(struct metapath *mp, unsigned int ptrs) { const __be64 *start, *ptr, *end; unsigned int hgt; hgt = mp->mp_aheight - 1; start = metapointer(hgt, mp); end = start + ptrs; for (ptr = start; ptr < end; ptr++) { if (*ptr) { mp->mp_list[hgt] += ptr - start; if (mp->mp_aheight == mp->mp_fheight) return WALK_STOP; return WALK_FOLLOW; } } return WALK_CONTINUE; } /** * gfs2_hole_size - figure out the size of a hole * @inode: The inode * @lblock: The logical starting block number * @len: How far to look (in blocks) * @mp: The metapath at lblock * @iomap: The iomap to store the hole size in * * This function modifies @mp. * * Returns: errno on error */ static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len, struct metapath *mp, struct iomap *iomap) { struct metapath clone; u64 hole_size; int ret; clone_metapath(&clone, mp); ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker); if (ret < 0) goto out; if (ret == 1) hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock; else hole_size = len; iomap->length = hole_size << inode->i_blkbits; ret = 0; out: release_metapath(&clone); return ret; } static inline void gfs2_indirect_init(struct metapath *mp, struct gfs2_glock *gl, unsigned int i, unsigned offset, u64 bn) { __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + ((i > 1) ? sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode))); BUG_ON(i < 1); BUG_ON(mp->mp_bh[i] != NULL); mp->mp_bh[i] = gfs2_meta_new(gl, bn); gfs2_trans_add_meta(gl, mp->mp_bh[i]); gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); ptr += offset; *ptr = cpu_to_be64(bn); } enum alloc_state { ALLOC_DATA = 0, ALLOC_GROW_DEPTH = 1, ALLOC_GROW_HEIGHT = 2, /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ }; /** * __gfs2_iomap_alloc - Build a metadata tree of the requested height * @inode: The GFS2 inode * @iomap: The iomap structure * @mp: The metapath, with proper height information calculated * * In this routine we may have to alloc: * i) Indirect blocks to grow the metadata tree height * ii) Indirect blocks to fill in lower part of the metadata tree * iii) Data blocks * * This function is called after __gfs2_iomap_get, which works out the * total number of blocks which we need via gfs2_alloc_size. * * We then do the actual allocation asking for an extent at a time (if * enough contiguous free blocks are available, there will only be one * allocation request per call) and uses the state machine to initialise * the blocks in order. * * Right now, this function will allocate at most one indirect block * worth of data -- with a default block size of 4K, that's slightly * less than 2M. If this limitation is ever removed to allow huge * allocations, we would probably still want to limit the iomap size we * return to avoid stalling other tasks during huge writes; the next * iomap iteration would then find the blocks already allocated. * * Returns: errno on error */ static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, struct metapath *mp) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh = metapath_dibh(mp); u64 bn; unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; size_t dblks = iomap->length >> inode->i_blkbits; const unsigned end_of_metadata = mp->mp_fheight - 1; int ret; enum alloc_state state; __be64 *ptr; __be64 zero_bn = 0; BUG_ON(mp->mp_aheight < 1); BUG_ON(dibh == NULL); BUG_ON(dblks < 1); gfs2_trans_add_meta(ip->i_gl, dibh); down_write(&ip->i_rw_mutex); if (mp->mp_fheight == mp->mp_aheight) { /* Bottom indirect block exists */ state = ALLOC_DATA; } else { /* Need to allocate indirect blocks */ if (mp->mp_fheight == ip->i_height) { /* Writing into existing tree, extend tree down */ iblks = mp->mp_fheight - mp->mp_aheight; state = ALLOC_GROW_DEPTH; } else { /* Building up tree height */ state = ALLOC_GROW_HEIGHT; iblks = mp->mp_fheight - ip->i_height; branch_start = metapath_branch_start(mp); iblks += (mp->mp_fheight - branch_start); } } /* start of the second part of the function (state machine) */ blks = dblks + iblks; i = mp->mp_aheight; do { n = blks - alloced; ret = gfs2_alloc_blocks(ip, &bn, &n, 0); if (ret) goto out; alloced += n; if (state != ALLOC_DATA || gfs2_is_jdata(ip)) gfs2_trans_remove_revoke(sdp, bn, n); switch (state) { /* Growing height of tree */ case ALLOC_GROW_HEIGHT: if (i == 1) { ptr = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); zero_bn = *ptr; } for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0; i++, n--) gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++); if (i - 1 == mp->mp_fheight - ip->i_height) { i--; gfs2_buffer_copy_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header), dibh, sizeof(struct gfs2_dinode)); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + sizeof(__be64)); ptr = (__be64 *)(mp->mp_bh[i]->b_data + sizeof(struct gfs2_meta_header)); *ptr = zero_bn; state = ALLOC_GROW_DEPTH; for(i = branch_start; i < mp->mp_fheight; i++) { if (mp->mp_bh[i] == NULL) break; brelse(mp->mp_bh[i]); mp->mp_bh[i] = NULL; } i = branch_start; } if (n == 0) break; fallthrough; /* To branching from existing tree */ case ALLOC_GROW_DEPTH: if (i > 1 && i < mp->mp_fheight) gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); for (; i < mp->mp_fheight && n > 0; i++, n--) gfs2_indirect_init(mp, ip->i_gl, i, mp->mp_list[i-1], bn++); if (i == mp->mp_fheight) state = ALLOC_DATA; if (n == 0) break; fallthrough; /* To tree complete, adding data blocks */ case ALLOC_DATA: BUG_ON(n > dblks); BUG_ON(mp->mp_bh[end_of_metadata] == NULL); gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]); dblks = n; ptr = metapointer(end_of_metadata, mp); iomap->addr = bn << inode->i_blkbits; iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW; while (n-- > 0) *ptr++ = cpu_to_be64(bn++); break; } } while (iomap->addr == IOMAP_NULL_ADDR); iomap->type = IOMAP_MAPPED; iomap->length = (u64)dblks << inode->i_blkbits; ip->i_height = mp->mp_fheight; gfs2_add_inode_blocks(&ip->i_inode, alloced); gfs2_dinode_out(ip, dibh->b_data); out: up_write(&ip->i_rw_mutex); return ret; } #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE /** * gfs2_alloc_size - Compute the maximum allocation size * @inode: The inode * @mp: The metapath * @size: Requested size in blocks * * Compute the maximum size of the next allocation at @mp. * * Returns: size in blocks */ static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); const __be64 *first, *ptr, *end; /* * For writes to stuffed files, this function is called twice via * __gfs2_iomap_get, before and after unstuffing. The size we return the * first time needs to be large enough to get the reservation and * allocation sizes right. The size we return the second time must * be exact or else __gfs2_iomap_alloc won't do the right thing. */ if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) { unsigned int maxsize = mp->mp_fheight > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; maxsize -= mp->mp_list[mp->mp_fheight - 1]; if (size > maxsize) size = maxsize; return size; } first = metapointer(ip->i_height - 1, mp); end = metaend(ip->i_height - 1, mp); if (end - first > size) end = first + size; for (ptr = first; ptr < end; ptr++) { if (*ptr) break; } return ptr - first; } /** * __gfs2_iomap_get - Map blocks from an inode to disk blocks * @inode: The inode * @pos: Starting position in bytes * @length: Length to map, in bytes * @flags: iomap flags * @iomap: The iomap structure * @mp: The metapath * * Returns: errno */ static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap *iomap, struct metapath *mp) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); loff_t size = i_size_read(inode); __be64 *ptr; sector_t lblock; sector_t lblock_stop; int ret; int eob; u64 len; struct buffer_head *dibh = NULL, *bh; u8 height; if (!length) return -EINVAL; down_read(&ip->i_rw_mutex); ret = gfs2_meta_inode_buffer(ip, &dibh); if (ret) goto unlock; mp->mp_bh[0] = dibh; if (gfs2_is_stuffed(ip)) { if (flags & IOMAP_WRITE) { loff_t max_size = gfs2_max_stuffed_size(ip); if (pos + length > max_size) goto unstuff; iomap->length = max_size; } else { if (pos >= size) { if (flags & IOMAP_REPORT) { ret = -ENOENT; goto unlock; } else { iomap->offset = pos; iomap->length = length; goto hole_found; } } iomap->length = size; } iomap->addr = (ip->i_no_addr << inode->i_blkbits) + sizeof(struct gfs2_dinode); iomap->type = IOMAP_INLINE; iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode); goto out; } unstuff: lblock = pos >> inode->i_blkbits; iomap->offset = lblock << inode->i_blkbits; lblock_stop = (pos + length - 1) >> inode->i_blkbits; len = lblock_stop - lblock + 1; iomap->length = len << inode->i_blkbits; height = ip->i_height; while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height]) height++; find_metapath(sdp, lblock, mp, height); if (height > ip->i_height || gfs2_is_stuffed(ip)) goto do_alloc; ret = lookup_metapath(ip, mp); if (ret) goto unlock; if (mp->mp_aheight != ip->i_height) goto do_alloc; ptr = metapointer(ip->i_height - 1, mp); if (*ptr == 0) goto do_alloc; bh = mp->mp_bh[ip->i_height - 1]; len = gfs2_extent_length(bh, ptr, &eob); iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits; iomap->length = len << inode->i_blkbits; iomap->type = IOMAP_MAPPED; iomap->flags |= IOMAP_F_MERGED; if (eob) iomap->flags |= IOMAP_F_GFS2_BOUNDARY; out: iomap->bdev = inode->i_sb->s_bdev; unlock: up_read(&ip->i_rw_mutex); return ret; do_alloc: if (flags & IOMAP_REPORT) { if (pos >= size) ret = -ENOENT; else if (height == ip->i_height) ret = gfs2_hole_size(inode, lblock, len, mp, iomap); else iomap->length = size - iomap->offset; } else if (flags & IOMAP_WRITE) { u64 alloc_size; if (flags & IOMAP_DIRECT) goto out; /* (see gfs2_file_direct_write) */ len = gfs2_alloc_size(inode, mp, len); alloc_size = len << inode->i_blkbits; if (alloc_size < iomap->length) iomap->length = alloc_size; } else { if (pos < size && height == ip->i_height) ret = gfs2_hole_size(inode, lblock, len, mp, iomap); } hole_found: iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; goto out; } static struct folio * gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) { struct inode *inode = iter->inode; unsigned int blockmask = i_blocksize(inode) - 1; struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned int blocks; struct folio *folio; int status; blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits; status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); if (status) return ERR_PTR(status); folio = iomap_get_folio(iter, pos, len); if (IS_ERR(folio)) gfs2_trans_end(sdp); return folio; } static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos, unsigned copied, struct folio *folio) { struct gfs2_trans *tr = current->journal_info; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); if (!gfs2_is_stuffed(ip)) gfs2_trans_add_databufs(ip, folio, offset_in_folio(folio, pos), copied); folio_unlock(folio); folio_put(folio); if (tr->tr_num_buf_new) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); gfs2_trans_end(sdp); } static const struct iomap_folio_ops gfs2_iomap_folio_ops = { .get_folio = gfs2_iomap_get_folio, .put_folio = gfs2_iomap_put_folio, }; static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap *iomap, struct metapath *mp) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); bool unstuff; int ret; unstuff = gfs2_is_stuffed(ip) && pos + length > gfs2_max_stuffed_size(ip); if (unstuff || iomap->type == IOMAP_HOLE) { unsigned int data_blocks, ind_blocks; struct gfs2_alloc_parms ap = {}; unsigned int rblocks; struct gfs2_trans *tr; gfs2_write_calc_reserv(ip, iomap->length, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; ret = gfs2_quota_lock_check(ip, &ap); if (ret) return ret; ret = gfs2_inplace_reserve(ip, &ap); if (ret) goto out_qunlock; rblocks = RES_DINODE + ind_blocks; if (gfs2_is_jdata(ip)) rblocks += data_blocks; if (ind_blocks || data_blocks) rblocks += RES_STATFS + RES_QUOTA; if (inode == sdp->sd_rindex) rblocks += 2 * RES_STATFS; rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); ret = gfs2_trans_begin(sdp, rblocks, iomap->length >> inode->i_blkbits); if (ret) goto out_trans_fail; if (unstuff) { ret = gfs2_unstuff_dinode(ip); if (ret) goto out_trans_end; release_metapath(mp); ret = __gfs2_iomap_get(inode, iomap->offset, iomap->length, flags, iomap, mp); if (ret) goto out_trans_end; } if (iomap->type == IOMAP_HOLE) { ret = __gfs2_iomap_alloc(inode, iomap, mp); if (ret) { gfs2_trans_end(sdp); gfs2_inplace_release(ip); punch_hole(ip, iomap->offset, iomap->length); goto out_qunlock; } } tr = current->journal_info; if (tr->tr_num_buf_new) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); gfs2_trans_end(sdp); } if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip)) iomap->folio_ops = &gfs2_iomap_folio_ops; return 0; out_trans_end: gfs2_trans_end(sdp); out_trans_fail: gfs2_inplace_release(ip); out_qunlock: gfs2_quota_unlock(ip); return ret; } static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { struct gfs2_inode *ip = GFS2_I(inode); struct metapath mp = { .mp_aheight = 1, }; int ret; if (gfs2_is_jdata(ip)) iomap->flags |= IOMAP_F_BUFFER_HEAD; trace_gfs2_iomap_start(ip, pos, length, flags); ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); if (ret) goto out_unlock; switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) { case IOMAP_WRITE: if (flags & IOMAP_DIRECT) { /* * Silently fall back to buffered I/O for stuffed files * or if we've got a hole (see gfs2_file_direct_write). */ if (iomap->type != IOMAP_MAPPED) ret = -ENOTBLK; goto out_unlock; } break; case IOMAP_ZERO: if (iomap->type == IOMAP_HOLE) goto out_unlock; break; default: goto out_unlock; } ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp); out_unlock: release_metapath(&mp); trace_gfs2_iomap_end(ip, iomap, ret); return ret; } static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) { case IOMAP_WRITE: if (flags & IOMAP_DIRECT) return 0; break; case IOMAP_ZERO: if (iomap->type == IOMAP_HOLE) return 0; break; default: return 0; } if (!gfs2_is_stuffed(ip)) gfs2_ordered_add_inode(ip); if (inode == sdp->sd_rindex) adjust_fs_space(inode); gfs2_inplace_release(ip); if (ip->i_qadata && ip->i_qadata->qa_qd_num) gfs2_quota_unlock(ip); if (length != written && (iomap->flags & IOMAP_F_NEW)) { /* Deallocate blocks that were just allocated. */ loff_t hstart = round_up(pos + written, i_blocksize(inode)); loff_t hend = iomap->offset + iomap->length; if (hstart < hend) { truncate_pagecache_range(inode, hstart, hend - 1); punch_hole(ip, hstart, hend - hstart); } } if (unlikely(!written)) return 0; if (iomap->flags & IOMAP_F_SIZE_CHANGED) mark_inode_dirty(inode); set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); return 0; } const struct iomap_ops gfs2_iomap_ops = { .iomap_begin = gfs2_iomap_begin, .iomap_end = gfs2_iomap_end, }; /** * gfs2_block_map - Map one or more blocks of an inode to a disk block * @inode: The inode * @lblock: The logical block number * @bh_map: The bh to be mapped * @create: True if its ok to alloc blocks to satify the request * * The size of the requested mapping is defined in bh_map->b_size. * * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged * when @lblock is not mapped. Sets buffer_mapped(bh_map) and * bh_map->b_size to indicate the size of the mapping when @lblock and * successive blocks are mapped, up to the requested size. * * Sets buffer_boundary() if a read of metadata will be required * before the next block can be mapped. Sets buffer_new() if new * blocks were allocated. * * Returns: errno */ int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh_map, int create) { struct gfs2_inode *ip = GFS2_I(inode); loff_t pos = (loff_t)lblock << inode->i_blkbits; loff_t length = bh_map->b_size; struct iomap iomap = { }; int ret; clear_buffer_mapped(bh_map); clear_buffer_new(bh_map); clear_buffer_boundary(bh_map); trace_gfs2_bmap(ip, bh_map, lblock, create, 1); if (!create) ret = gfs2_iomap_get(inode, pos, length, &iomap); else ret = gfs2_iomap_alloc(inode, pos, length, &iomap); if (ret) goto out; if (iomap.length > bh_map->b_size) { iomap.length = bh_map->b_size; iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY; } if (iomap.addr != IOMAP_NULL_ADDR) map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits); bh_map->b_size = iomap.length; if (iomap.flags & IOMAP_F_GFS2_BOUNDARY) set_buffer_boundary(bh_map); if (iomap.flags & IOMAP_F_NEW) set_buffer_new(bh_map); out: trace_gfs2_bmap(ip, bh_map, lblock, create, ret); return ret; } int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock, unsigned int *extlen) { unsigned int blkbits = inode->i_blkbits; struct iomap iomap = { }; unsigned int len; int ret; ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits, &iomap); if (ret) return ret; if (iomap.type != IOMAP_MAPPED) return -EIO; *dblock = iomap.addr >> blkbits; len = iomap.length >> blkbits; if (len < *extlen) *extlen = len; return 0; } int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, unsigned int *extlen, bool *new) { unsigned int blkbits = inode->i_blkbits; struct iomap iomap = { }; unsigned int len; int ret; ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits, &iomap); if (ret) return ret; if (iomap.type != IOMAP_MAPPED) return -EIO; *dblock = iomap.addr >> blkbits; len = iomap.length >> blkbits; if (len < *extlen) *extlen = len; *new = iomap.flags & IOMAP_F_NEW; return 0; } /* * NOTE: Never call gfs2_block_zero_range with an open transaction because it * uses iomap write to perform its actions, which begin their own transactions * (iomap_begin, get_folio, etc.) */ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) { BUG_ON(current->journal_info); return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); } #define GFS2_JTRUNC_REVOKES 8192 /** * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files * @inode: The inode being truncated * @oldsize: The original (larger) size * @newsize: The new smaller size * * With jdata files, we have to journal a revoke for each block which is * truncated. As a result, we need to split this into separate transactions * if the number of pages being truncated gets too large. */ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize) { struct gfs2_sbd *sdp = GFS2_SB(inode); u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize; u64 chunk; int error; while (oldsize != newsize) { struct gfs2_trans *tr; unsigned int offs; chunk = oldsize - newsize; if (chunk > max_chunk) chunk = max_chunk; offs = oldsize & ~PAGE_MASK; if (offs && chunk > PAGE_SIZE) chunk = offs + ((chunk - offs) & PAGE_MASK); truncate_pagecache(inode, oldsize - chunk); oldsize -= chunk; tr = current->journal_info; if (!test_bit(TR_TOUCHED, &tr->tr_flags)) continue; gfs2_trans_end(sdp); error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); if (error) return error; } return 0; } static int trunc_start(struct inode *inode, u64 newsize) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh = NULL; int journaled = gfs2_is_jdata(ip); u64 oldsize = inode->i_size; int error; if (!gfs2_is_stuffed(ip)) { unsigned int blocksize = i_blocksize(inode); unsigned int offs = newsize & (blocksize - 1); if (offs) { error = gfs2_block_zero_range(inode, newsize, blocksize - offs); if (error) return error; } } if (journaled) error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES); else error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) return error; error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out; gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); else ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; i_size_write(inode, newsize); inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_dinode_out(ip, dibh->b_data); if (journaled) error = gfs2_journaled_truncate(inode, oldsize, newsize); else truncate_pagecache(inode, newsize); out: brelse(dibh); if (current->journal_info) gfs2_trans_end(sdp); return error; } int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, struct iomap *iomap) { struct metapath mp = { .mp_aheight = 1, }; int ret; ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp); release_metapath(&mp); return ret; } int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length, struct iomap *iomap) { struct metapath mp = { .mp_aheight = 1, }; int ret; ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp); if (!ret && iomap->type == IOMAP_HOLE) ret = __gfs2_iomap_alloc(inode, iomap, &mp); release_metapath(&mp); return ret; } /** * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein * @ip: inode * @rd_gh: holder of resource group glock * @bh: buffer head to sweep * @start: starting point in bh * @end: end point in bh * @meta: true if bh points to metadata (rather than data) * @btotal: place to keep count of total blocks freed * * We sweep a metadata buffer (provided by the metapath) for blocks we need to * free, and free them all. However, we do it one rgrp at a time. If this * block has references to multiple rgrps, we break it into individual * transactions. This allows other processes to use the rgrps while we're * focused on a single one, for better concurrency / performance. * At every transaction boundary, we rewrite the inode into the journal. * That way the bitmaps are kept consistent with the inode and we can recover * if we're interrupted by power-outages. * * Returns: 0, or return code if an error occurred. * *btotal has the total number of blocks freed */ static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh, struct buffer_head *bh, __be64 *start, __be64 *end, bool meta, u32 *btotal) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; struct gfs2_trans *tr; __be64 *p; int blks_outside_rgrp; u64 bn, bstart, isize_blks; s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */ int ret = 0; bool buf_in_tr = false; /* buffer was added to transaction */ more_rgrps: rgd = NULL; if (gfs2_holder_initialized(rd_gh)) { rgd = gfs2_glock2rgrp(rd_gh->gh_gl); gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(rd_gh->gh_gl)); } blks_outside_rgrp = 0; bstart = 0; blen = 0; for (p = start; p < end; p++) { if (!*p) continue; bn = be64_to_cpu(*p); if (rgd) { if (!rgrp_contains_block(rgd, bn)) { blks_outside_rgrp++; continue; } } else { rgd = gfs2_blk2rgrpd(sdp, bn, true); if (unlikely(!rgd)) { ret = -EIO; goto out; } ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE, rd_gh); if (ret) goto out; /* Must be done with the rgrp glock held: */ if (gfs2_rs_active(&ip->i_res) && rgd == ip->i_res.rs_rgd) gfs2_rs_deltree(&ip->i_res); } /* The size of our transactions will be unknown until we actually process all the metadata blocks that relate to the rgrp. So we estimate. We know it can't be more than the dinode's i_blocks and we don't want to exceed the journal flush threshold, sd_log_thresh2. */ if (current->journal_info == NULL) { unsigned int jblocks_rqsted, revokes; jblocks_rqsted = rgd->rd_length + RES_DINODE + RES_INDIRECT; isize_blks = gfs2_get_inode_blocks(&ip->i_inode); if (isize_blks > atomic_read(&sdp->sd_log_thresh2)) jblocks_rqsted += atomic_read(&sdp->sd_log_thresh2); else jblocks_rqsted += isize_blks; revokes = jblocks_rqsted; if (meta) revokes += end - start; else if (ip->i_depth) revokes += sdp->sd_inptrs; ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes); if (ret) goto out_unlock; down_write(&ip->i_rw_mutex); } /* check if we will exceed the transaction blocks requested */ tr = current->journal_info; if (tr->tr_num_buf_new + RES_STATFS + RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) { /* We set blks_outside_rgrp to ensure the loop will be repeated for the same rgrp, but with a new transaction. */ blks_outside_rgrp++; /* This next part is tricky. If the buffer was added to the transaction, we've already set some block pointers to 0, so we better follow through and free them, or we will introduce corruption (so break). This may be impossible, or at least rare, but I decided to cover the case regardless. If the buffer was not added to the transaction (this call), doing so would exceed our transaction size, so we need to end the transaction and start a new one (so goto). */ if (buf_in_tr) break; goto out_unlock; } gfs2_trans_add_meta(ip->i_gl, bh); buf_in_tr = true; *p = 0; if (bstart + blen == bn) { blen++; continue; } if (bstart) { __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta); (*btotal) += blen; gfs2_add_inode_blocks(&ip->i_inode, -blen); } bstart = bn; blen = 1; } if (bstart) { __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta); (*btotal) += blen; gfs2_add_inode_blocks(&ip->i_inode, -blen); } out_unlock: if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks outside the rgrp we just processed, do it all over again. */ if (current->journal_info) { struct buffer_head *dibh; ret = gfs2_meta_inode_buffer(ip, &dibh); if (ret) goto out; /* Every transaction boundary, we rewrite the dinode to keep its di_blocks current in case of failure. */ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); up_write(&ip->i_rw_mutex); gfs2_trans_end(sdp); buf_in_tr = false; } gfs2_glock_dq_uninit(rd_gh); cond_resched(); goto more_rgrps; } out: return ret; } static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h) { if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0]))) return false; return true; } /** * find_nonnull_ptr - find a non-null pointer given a metapath and height * @sdp: The superblock * @mp: starting metapath * @h: desired height to search * @end_list: See punch_hole(). * @end_aligned: See punch_hole(). * * Assumes the metapath is valid (with buffers) out to height h. * Returns: true if a non-null pointer was found in the metapath buffer * false if all remaining pointers are NULL in the buffer */ static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp, unsigned int h, __u16 *end_list, unsigned int end_aligned) { struct buffer_head *bh = mp->mp_bh[h]; __be64 *first, *ptr, *end; first = metaptr1(h, mp); ptr = first + mp->mp_list[h]; end = (__be64 *)(bh->b_data + bh->b_size); if (end_list && mp_eq_to_hgt(mp, end_list, h)) { bool keep_end = h < end_aligned; end = first + end_list[h] + keep_end; } while (ptr < end) { if (*ptr) { /* if we have a non-null pointer */ mp->mp_list[h] = ptr - first; h++; if (h < GFS2_MAX_META_HEIGHT) mp->mp_list[h] = 0; return true; } ptr++; } return false; } enum dealloc_states { DEALLOC_MP_FULL = 0, /* Strip a metapath with all buffers read in */ DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */ DEALLOC_FILL_MP = 2, /* Fill in the metapath to the given height. */ DEALLOC_DONE = 3, /* process complete */ }; static inline void metapointer_range(struct metapath *mp, int height, __u16 *start_list, unsigned int start_aligned, __u16 *end_list, unsigned int end_aligned, __be64 **start, __be64 **end) { struct buffer_head *bh = mp->mp_bh[height]; __be64 *first; first = metaptr1(height, mp); *start = first; if (mp_eq_to_hgt(mp, start_list, height)) { bool keep_start = height < start_aligned; *start = first + start_list[height] + keep_start; } *end = (__be64 *)(bh->b_data + bh->b_size); if (end_list && mp_eq_to_hgt(mp, end_list, height)) { bool keep_end = height < end_aligned; *end = first + end_list[height] + keep_end; } } static inline bool walk_done(struct gfs2_sbd *sdp, struct metapath *mp, int height, __u16 *end_list, unsigned int end_aligned) { __u16 end; if (end_list) { bool keep_end = height < end_aligned; if (!mp_eq_to_hgt(mp, end_list, height)) return false; end = end_list[height] + keep_end; } else end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs; return mp->mp_list[height] >= end; } /** * punch_hole - deallocate blocks in a file * @ip: inode to truncate * @offset: the start of the hole * @length: the size of the hole (or 0 for truncate) * * Punch a hole into a file or truncate a file at a given position. This * function operates in whole blocks (@offset and @length are rounded * accordingly); partially filled blocks must be cleared otherwise. * * This function works from the bottom up, and from the right to the left. In * other words, it strips off the highest layer (data) before stripping any of * the metadata. Doing it this way is best in case the operation is interrupted * by power failure, etc. The dinode is rewritten in every transaction to * guarantee integrity. */ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); u64 maxsize = sdp->sd_heightsize[ip->i_height]; struct metapath mp = {}; struct buffer_head *dibh, *bh; struct gfs2_holder rd_gh; unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; unsigned int bsize = 1 << bsize_shift; u64 lblock = (offset + bsize - 1) >> bsize_shift; __u16 start_list[GFS2_MAX_META_HEIGHT]; __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL; unsigned int start_aligned, end_aligned; unsigned int strip_h = ip->i_height - 1; u32 btotal = 0; int ret, state; int mp_h; /* metapath buffers are read in to this height */ u64 prev_bnr = 0; __be64 *start, *end; if (offset + bsize - 1 >= maxsize) { /* * The starting point lies beyond the allocated metadata; * there are no blocks to deallocate. */ return 0; } /* * The start position of the hole is defined by lblock, start_list, and * start_aligned. The end position of the hole is defined by lend, * end_list, and end_aligned. * * start_aligned and end_aligned define down to which height the start * and end positions are aligned to the metadata tree (i.e., the * position is a multiple of the metadata granularity at the height * above). This determines at which heights additional meta pointers * needs to be preserved for the remaining data. */ if (length) { u64 end_offset = offset + length; u64 lend; /* * Clip the end at the maximum file size for the given height: * that's how far the metadata goes; files bigger than that * will have additional layers of indirection. */ if (end_offset > maxsize) end_offset = maxsize; lend = end_offset >> bsize_shift; if (lblock >= lend) return 0; find_metapath(sdp, lend, &mp, ip->i_height); end_list = __end_list; memcpy(end_list, mp.mp_list, sizeof(mp.mp_list)); for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) { if (end_list[mp_h]) break; } end_aligned = mp_h; } find_metapath(sdp, lblock, &mp, ip->i_height); memcpy(start_list, mp.mp_list, sizeof(start_list)); for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) { if (start_list[mp_h]) break; } start_aligned = mp_h; ret = gfs2_meta_inode_buffer(ip, &dibh); if (ret) return ret; mp.mp_bh[0] = dibh; ret = lookup_metapath(ip, &mp); if (ret) goto out_metapath; /* issue read-ahead on metadata */ for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) { metapointer_range(&mp, mp_h, start_list, start_aligned, end_list, end_aligned, &start, &end); gfs2_metapath_ra(ip->i_gl, start, end); } if (mp.mp_aheight == ip->i_height) state = DEALLOC_MP_FULL; /* We have a complete metapath */ else state = DEALLOC_FILL_MP; /* deal with partial metapath */ ret = gfs2_rindex_update(sdp); if (ret) goto out_metapath; ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) goto out_metapath; gfs2_holder_mark_uninitialized(&rd_gh); mp_h = strip_h; while (state != DEALLOC_DONE) { switch (state) { /* Truncate a full metapath at the given strip height. * Note that strip_h == mp_h in order to be in this state. */ case DEALLOC_MP_FULL: bh = mp.mp_bh[mp_h]; gfs2_assert_withdraw(sdp, bh); if (gfs2_assert_withdraw(sdp, prev_bnr != bh->b_blocknr)) { fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u, " "s_h:%u, mp_h:%u\n", (unsigned long long)ip->i_no_addr, prev_bnr, ip->i_height, strip_h, mp_h); } prev_bnr = bh->b_blocknr; if (gfs2_metatype_check(sdp, bh, (mp_h ? GFS2_METATYPE_IN : GFS2_METATYPE_DI))) { ret = -EIO; goto out; } /* * Below, passing end_aligned as 0 gives us the * metapointer range excluding the end point: the end * point is the first metapath we must not deallocate! */ metapointer_range(&mp, mp_h, start_list, start_aligned, end_list, 0 /* end_aligned */, &start, &end); ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h], start, end, mp_h != ip->i_height - 1, &btotal); /* If we hit an error or just swept dinode buffer, just exit. */ if (ret || !mp_h) { state = DEALLOC_DONE; break; } state = DEALLOC_MP_LOWER; break; /* lower the metapath strip height */ case DEALLOC_MP_LOWER: /* We're done with the current buffer, so release it, unless it's the dinode buffer. Then back up to the previous pointer. */ if (mp_h) { brelse(mp.mp_bh[mp_h]); mp.mp_bh[mp_h] = NULL; } /* If we can't get any lower in height, we've stripped off all we can. Next step is to back up and start stripping the previous level of metadata. */ if (mp_h == 0) { strip_h--; memcpy(mp.mp_list, start_list, sizeof(start_list)); mp_h = strip_h; state = DEALLOC_FILL_MP; break; } mp.mp_list[mp_h] = 0; mp_h--; /* search one metadata height down */ mp.mp_list[mp_h]++; if (walk_done(sdp, &mp, mp_h, end_list, end_aligned)) break; /* Here we've found a part of the metapath that is not * allocated. We need to search at that height for the * next non-null pointer. */ if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) { state = DEALLOC_FILL_MP; mp_h++; } /* No more non-null pointers at this height. Back up to the previous height and try again. */ break; /* loop around in the same state */ /* Fill the metapath with buffers to the given height. */ case DEALLOC_FILL_MP: /* Fill the buffers out to the current height. */ ret = fillup_metapath(ip, &mp, mp_h); if (ret < 0) goto out; /* On the first pass, issue read-ahead on metadata. */ if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) { unsigned int height = mp.mp_aheight - 1; /* No read-ahead for data blocks. */ if (mp.mp_aheight - 1 == strip_h) height--; for (; height >= mp.mp_aheight - ret; height--) { metapointer_range(&mp, height, start_list, start_aligned, end_list, end_aligned, &start, &end); gfs2_metapath_ra(ip->i_gl, start, end); } } /* If buffers found for the entire strip height */ if (mp.mp_aheight - 1 == strip_h) { state = DEALLOC_MP_FULL; break; } if (mp.mp_aheight < ip->i_height) /* We have a partial height */ mp_h = mp.mp_aheight - 1; /* If we find a non-null block pointer, crawl a bit higher up in the metapath and try again, otherwise we need to look lower for a new starting point. */ if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) mp_h++; else state = DEALLOC_MP_LOWER; break; } } if (btotal) { if (current->journal_info == NULL) { ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (ret) goto out; down_write(&ip->i_rw_mutex); } gfs2_statfs_change(sdp, 0, +btotal, 0); gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, ip->i_inode.i_gid); inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); up_write(&ip->i_rw_mutex); gfs2_trans_end(sdp); } out: if (gfs2_holder_initialized(&rd_gh)) gfs2_glock_dq_uninit(&rd_gh); if (current->journal_info) { up_write(&ip->i_rw_mutex); gfs2_trans_end(sdp); cond_resched(); } gfs2_quota_unhold(ip); out_metapath: release_metapath(&mp); return ret; } static int trunc_end(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; int error; error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) return error; down_write(&ip->i_rw_mutex); error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out; if (!i_size_read(&ip->i_inode)) { ip->i_height = 0; ip->i_goal = ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); gfs2_ordered_del_inode(ip); } inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); out: up_write(&ip->i_rw_mutex); gfs2_trans_end(sdp); return error; } /** * do_shrink - make a file smaller * @inode: the inode * @newsize: the size to make the file * * Called with an exclusive lock on @inode. The @size must * be equal to or smaller than the current inode size. * * Returns: errno */ static int do_shrink(struct inode *inode, u64 newsize) { struct gfs2_inode *ip = GFS2_I(inode); int error; error = trunc_start(inode, newsize); if (error < 0) return error; if (gfs2_is_stuffed(ip)) return 0; error = punch_hole(ip, newsize, 0); if (error == 0) error = trunc_end(ip); return error; } /** * do_grow - Touch and update inode size * @inode: The inode * @size: The new size * * This function updates the timestamps on the inode and * may also increase the size of the inode. This function * must not be called with @size any smaller than the current * inode size. * * Although it is not strictly required to unstuff files here, * earlier versions of GFS2 have a bug in the stuffed file reading * code which will result in a buffer overrun if the size is larger * than the max stuffed file size. In order to prevent this from * occurring, such files are unstuffed, but in other cases we can * just update the inode size directly. * * Returns: 0 on success, or -ve on error */ static int do_grow(struct inode *inode, u64 size) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_alloc_parms ap = { .target = 1, }; struct buffer_head *dibh; int error; int unstuff = 0; if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) { error = gfs2_quota_lock_check(ip, &ap); if (error) return error; error = gfs2_inplace_reserve(ip, &ap); if (error) goto do_grow_qunlock; unstuff = 1; } error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT + (unstuff && gfs2_is_jdata(ip) ? RES_JDATA : 0) + (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ? 0 : RES_QUOTA), 0); if (error) goto do_grow_release; if (unstuff) { error = gfs2_unstuff_dinode(ip); if (error) goto do_end_trans; } error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto do_end_trans; truncate_setsize(inode, size); inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); do_end_trans: gfs2_trans_end(sdp); do_grow_release: if (unstuff) { gfs2_inplace_release(ip); do_grow_qunlock: gfs2_quota_unlock(ip); } return error; } /** * gfs2_setattr_size - make a file a given size * @inode: the inode * @newsize: the size to make the file * * The file size can grow, shrink, or stay the same size. This * is called holding i_rwsem and an exclusive glock on the inode * in question. * * Returns: errno */ int gfs2_setattr_size(struct inode *inode, u64 newsize) { struct gfs2_inode *ip = GFS2_I(inode); int ret; BUG_ON(!S_ISREG(inode->i_mode)); ret = inode_newsize_ok(inode, newsize); if (ret) return ret; inode_dio_wait(inode); ret = gfs2_qa_get(ip); if (ret) goto out; if (newsize >= inode->i_size) { ret = do_grow(inode, newsize); goto out; } ret = do_shrink(inode, newsize); out: gfs2_rs_delete(ip); gfs2_qa_put(ip); return ret; } int gfs2_truncatei_resume(struct gfs2_inode *ip) { int error; error = punch_hole(ip, i_size_read(&ip->i_inode), 0); if (!error) error = trunc_end(ip); return error; } int gfs2_file_dealloc(struct gfs2_inode *ip) { return punch_hole(ip, 0, 0); } /** * gfs2_free_journal_extents - Free cached journal bmap info * @jd: The journal * */ void gfs2_free_journal_extents(struct gfs2_jdesc *jd) { struct gfs2_journal_extent *jext; while(!list_empty(&jd->extent_list)) { jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list); list_del(&jext->list); kfree(jext); } } /** * gfs2_add_jextent - Add or merge a new extent to extent cache * @jd: The journal descriptor * @lblock: The logical block at start of new extent * @dblock: The physical block at start of new extent * @blocks: Size of extent in fs blocks * * Returns: 0 on success or -ENOMEM */ static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks) { struct gfs2_journal_extent *jext; if (!list_empty(&jd->extent_list)) { jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list); if ((jext->dblock + jext->blocks) == dblock) { jext->blocks += blocks; return 0; } } jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS); if (jext == NULL) return -ENOMEM; jext->dblock = dblock; jext->lblock = lblock; jext->blocks = blocks; list_add_tail(&jext->list, &jd->extent_list); jd->nr_extents++; return 0; } /** * gfs2_map_journal_extents - Cache journal bmap info * @sdp: The super block * @jd: The journal to map * * Create a reusable "extent" mapping from all logical * blocks to all physical blocks for the given journal. This will save * us time when writing journal blocks. Most journals will have only one * extent that maps all their logical blocks. That's because gfs2.mkfs * arranges the journal blocks sequentially to maximize performance. * So the extent would map the first block for the entire file length. * However, gfs2_jadd can happen while file activity is happening, so * those journals may not be sequential. Less likely is the case where * the users created their own journals by mounting the metafs and * laying it out. But it's still possible. These journals might have * several extents. * * Returns: 0 on success, or error on failure */ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) { u64 lblock = 0; u64 lblock_stop; struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct buffer_head bh; unsigned int shift = sdp->sd_sb.sb_bsize_shift; u64 size; int rc; ktime_t start, end; start = ktime_get(); lblock_stop = i_size_read(jd->jd_inode) >> shift; size = (lblock_stop - lblock) << shift; jd->nr_extents = 0; WARN_ON(!list_empty(&jd->extent_list)); do { bh.b_state = 0; bh.b_blocknr = 0; bh.b_size = size; rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0); if (rc || !buffer_mapped(&bh)) goto fail; rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift); if (rc) goto fail; size -= bh.b_size; lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); end = ktime_get(); fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid, jd->nr_extents, ktime_ms_delta(end, start)); return 0; fail: fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n", rc, jd->jd_jid, (unsigned long long)(i_size_read(jd->jd_inode) - size), jd->nr_extents); fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n", rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr, bh.b_state, (unsigned long long)bh.b_size); gfs2_free_journal_extents(jd); return rc; } /** * gfs2_write_alloc_required - figure out if a write will require an allocation * @ip: the file being written to * @offset: the offset to write to * @len: the number of bytes being written * * Returns: 1 if an alloc is required, 0 otherwise */ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, unsigned int len) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head bh; unsigned int shift; u64 lblock, lblock_stop, size; u64 end_of_file; if (!len) return 0; if (gfs2_is_stuffed(ip)) { if (offset + len > gfs2_max_stuffed_size(ip)) return 1; return 0; } shift = sdp->sd_sb.sb_bsize_shift; BUG_ON(gfs2_is_dir(ip)); end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; lblock = offset >> shift; lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex)) return 1; size = (lblock_stop - lblock) << shift; do { bh.b_state = 0; bh.b_size = size; gfs2_block_map(&ip->i_inode, lblock, &bh, 0); if (!buffer_mapped(&bh)) return 1; size -= bh.b_size; lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); return 0; } static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length) { struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *dibh; int error; if (offset >= inode->i_size) return 0; if (offset + length > inode->i_size) length = inode->i_size - offset; error = gfs2_meta_inode_buffer(ip, &dibh); if (error) return error; gfs2_trans_add_meta(ip->i_gl, dibh); memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0, length); brelse(dibh); return 0; } static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset, loff_t length) { struct gfs2_sbd *sdp = GFS2_SB(inode); loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize; int error; while (length) { struct gfs2_trans *tr; loff_t chunk; unsigned int offs; chunk = length; if (chunk > max_chunk) chunk = max_chunk; offs = offset & ~PAGE_MASK; if (offs && chunk > PAGE_SIZE) chunk = offs + ((chunk - offs) & PAGE_MASK); truncate_pagecache_range(inode, offset, chunk); offset += chunk; length -= chunk; tr = current->journal_info; if (!test_bit(TR_TOUCHED, &tr->tr_flags)) continue; gfs2_trans_end(sdp); error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); if (error) return error; } return 0; } int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned int blocksize = i_blocksize(inode); loff_t start, end; int error; if (!gfs2_is_stuffed(ip)) { unsigned int start_off, end_len; start_off = offset & (blocksize - 1); end_len = (offset + length) & (blocksize - 1); if (start_off) { unsigned int len = length; if (length > blocksize - start_off) len = blocksize - start_off; error = gfs2_block_zero_range(inode, offset, len); if (error) goto out; if (start_off + length < blocksize) end_len = 0; } if (end_len) { error = gfs2_block_zero_range(inode, offset + length - end_len, end_len); if (error) goto out; } } start = round_down(offset, blocksize); end = round_up(offset + length, blocksize) - 1; error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return error; if (gfs2_is_jdata(ip)) error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA, GFS2_JTRUNC_REVOKES); else error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) return error; if (gfs2_is_stuffed(ip)) { error = stuffed_zero_range(inode, offset, length); if (error) goto out; } if (gfs2_is_jdata(ip)) { BUG_ON(!current->journal_info); gfs2_journaled_truncate_range(inode, offset, length); } else truncate_pagecache_range(inode, offset, offset + length - 1); file_update_time(file); mark_inode_dirty(inode); if (current->journal_info) gfs2_trans_end(sdp); if (!gfs2_is_stuffed(ip)) error = punch_hole(ip, offset, length); out: if (current->journal_info) gfs2_trans_end(sdp); return error; } static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset, unsigned int len) { int ret; if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode)))) return -EIO; if (offset >= wpc->iomap.offset && offset < wpc->iomap.offset + wpc->iomap.length) return 0; memset(&wpc->iomap, 0, sizeof(wpc->iomap)); ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap); return ret; } const struct iomap_writeback_ops gfs2_writeback_ops = { .map_blocks = gfs2_map_blocks, };
69737 13028 11804 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H #define HAVE_JUMP_LABEL_BATCH #include <asm/asm.h> #include <asm/nops.h> #ifndef __ASSEMBLY__ #include <linux/stringify.h> #include <linux/types.h> #define JUMP_TABLE_ENTRY(key, label) \ ".pushsection __jump_table, \"aw\" \n\t" \ _ASM_ALIGN "\n\t" \ ".long 1b - . \n\t" \ ".long " label " - . \n\t" \ _ASM_PTR " " key " - . \n\t" \ ".popsection \n\t" /* This macro is also expanded on the Rust side. */ #ifdef CONFIG_HAVE_JUMP_LABEL_HACK #define ARCH_STATIC_BRANCH_ASM(key, label) \ "1: jmp " label " # objtool NOPs this \n\t" \ JUMP_TABLE_ENTRY(key " + 2", label) #else /* !CONFIG_HAVE_JUMP_LABEL_HACK */ #define ARCH_STATIC_BRANCH_ASM(key, label) \ "1: .byte " __stringify(BYTES_NOP5) "\n\t" \ JUMP_TABLE_ENTRY(key, label) #endif /* CONFIG_HAVE_JUMP_LABEL_HACK */ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { asm goto(ARCH_STATIC_BRANCH_ASM("%c0 + %c1", "%l[l_yes]") : : "i" (key), "i" (branch) : : l_yes); return false; l_yes: return true; } static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { asm goto("1:" "jmp %l[l_yes]\n\t" JUMP_TABLE_ENTRY("%c0 + %c1", "%l[l_yes]") : : "i" (key), "i" (branch) : : l_yes); return false; l_yes: return true; } extern int arch_jump_entry_size(struct jump_entry *entry); #endif /* __ASSEMBLY__ */ #endif
21 22 22 2 8 23 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * RNG operations. * * Copyright (c) 2008 Neil Horman <nhorman@tuxdriver.com> * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/rng.h> #include <linux/atomic.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/random.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <net/netlink.h> #include "internal.h" static DEFINE_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); static int crypto_default_rng_refcnt; int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { u8 *buf = NULL; int err; if (!seed && slen) { buf = kmalloc(slen, GFP_KERNEL); if (!buf) return -ENOMEM; err = get_random_bytes_wait(buf, slen); if (err) goto out; seed = buf; } err = crypto_rng_alg(tfm)->seed(tfm, seed, slen); out: kfree_sensitive(buf); return err; } EXPORT_SYMBOL_GPL(crypto_rng_reset); static int crypto_rng_init_tfm(struct crypto_tfm *tfm) { return 0; } static unsigned int seedsize(struct crypto_alg *alg) { struct rng_alg *ralg = container_of(alg, struct rng_alg, base); return ralg->seedsize; } static int __maybe_unused crypto_rng_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_rng rrng; memset(&rrng, 0, sizeof(rrng)); strscpy(rrng.type, "rng", sizeof(rrng.type)); rrng.seedsize = seedsize(alg); return nla_put(skb, CRYPTOCFGA_REPORT_RNG, sizeof(rrng), &rrng); } static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg) { seq_printf(m, "type : rng\n"); seq_printf(m, "seedsize : %u\n", seedsize(alg)); } static const struct crypto_type crypto_rng_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_rng_init_tfm, #ifdef CONFIG_PROC_FS .show = crypto_rng_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_rng_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_RNG, .tfmsize = offsetof(struct crypto_rng, base), }; struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_rng_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_rng); int crypto_get_default_rng(void) { struct crypto_rng *rng; int err; mutex_lock(&crypto_default_rng_lock); if (!crypto_default_rng) { rng = crypto_alloc_rng("stdrng", 0, 0); err = PTR_ERR(rng); if (IS_ERR(rng)) goto unlock; err = crypto_rng_reset(rng, NULL, crypto_rng_seedsize(rng)); if (err) { crypto_free_rng(rng); goto unlock; } crypto_default_rng = rng; } crypto_default_rng_refcnt++; err = 0; unlock: mutex_unlock(&crypto_default_rng_lock); return err; } EXPORT_SYMBOL_GPL(crypto_get_default_rng); void crypto_put_default_rng(void) { mutex_lock(&crypto_default_rng_lock); crypto_default_rng_refcnt--; mutex_unlock(&crypto_default_rng_lock); } EXPORT_SYMBOL_GPL(crypto_put_default_rng); #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE) int crypto_del_default_rng(void) { int err = -EBUSY; mutex_lock(&crypto_default_rng_lock); if (crypto_default_rng_refcnt) goto out; crypto_free_rng(crypto_default_rng); crypto_default_rng = NULL; err = 0; out: mutex_unlock(&crypto_default_rng_lock); return err; } EXPORT_SYMBOL_GPL(crypto_del_default_rng); #endif int crypto_register_rng(struct rng_alg *alg) { struct crypto_alg *base = &alg->base; if (alg->seedsize > PAGE_SIZE / 8) return -EINVAL; base->cra_type = &crypto_rng_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_RNG; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_rng); void crypto_unregister_rng(struct rng_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_rng); int crypto_register_rngs(struct rng_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_rng(algs + i); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_rng(algs + i); return ret; } EXPORT_SYMBOL_GPL(crypto_register_rngs); void crypto_unregister_rngs(struct rng_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_rng(algs + i); } EXPORT_SYMBOL_GPL(crypto_unregister_rngs); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Random Number Generator");
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 // SPDX-License-Identifier: LGPL-2.1-or-later /* * Linux driver for digital TV devices equipped with B2C2 FlexcopII(b)/III * flexcop.c - main module part * Copyright (C) 2004-9 Patrick Boettcher <patrick.boettcher@posteo.de> * based on skystar2-driver Copyright (C) 2003 Vadim Catana, skystar@moldova.cc * * Acknowledgements: * John Jurrius from BBTI, Inc. for extensive support * with code examples and data books * Bjarne Steinsbo, bjarne at steinsbo.com (some ideas for rewriting) * * Contributions to the skystar2-driver have been done by * Vincenzo Di Massa, hawk.it at tiscalinet.it (several DiSEqC fixes) * Roberto Ragusa, r.ragusa at libero.it (polishing, restyling the code) * Uwe Bugla, uwe.bugla at gmx.de (doing tests, restyling code, writing docu) * Niklas Peinecke, peinecke at gdv.uni-hannover.de (hardware pid/mac * filtering) */ #include "flexcop.h" #define DRIVER_NAME "B2C2 FlexcopII/II(b)/III digital TV receiver chip" #define DRIVER_AUTHOR "Patrick Boettcher <patrick.boettcher@posteo.de" #ifdef CONFIG_DVB_B2C2_FLEXCOP_DEBUG #define DEBSTATUS "" #else #define DEBSTATUS " (debugging is not enabled)" #endif int b2c2_flexcop_debug; EXPORT_SYMBOL_GPL(b2c2_flexcop_debug); module_param_named(debug, b2c2_flexcop_debug, int, 0644); MODULE_PARM_DESC(debug, "set debug level (1=info,2=tuner,4=i2c,8=ts,16=sram,32=reg,64=i2cdump (|-able))." DEBSTATUS); #undef DEBSTATUS DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); /* global zero for ibi values */ flexcop_ibi_value ibi_zero; static int flexcop_dvb_start_feed(struct dvb_demux_feed *dvbdmxfeed) { struct flexcop_device *fc = dvbdmxfeed->demux->priv; return flexcop_pid_feed_control(fc, dvbdmxfeed, 1); } static int flexcop_dvb_stop_feed(struct dvb_demux_feed *dvbdmxfeed) { struct flexcop_device *fc = dvbdmxfeed->demux->priv; return flexcop_pid_feed_control(fc, dvbdmxfeed, 0); } static int flexcop_dvb_init(struct flexcop_device *fc) { int ret = dvb_register_adapter(&fc->dvb_adapter, "FlexCop Digital TV device", fc->owner, fc->dev, adapter_nr); if (ret < 0) { err("error registering DVB adapter"); return ret; } fc->dvb_adapter.priv = fc; fc->demux.dmx.capabilities = (DMX_TS_FILTERING | DMX_SECTION_FILTERING | DMX_MEMORY_BASED_FILTERING); fc->demux.priv = fc; fc->demux.filternum = fc->demux.feednum = FC_MAX_FEED; fc->demux.start_feed = flexcop_dvb_start_feed; fc->demux.stop_feed = flexcop_dvb_stop_feed; fc->demux.write_to_decoder = NULL; ret = dvb_dmx_init(&fc->demux); if (ret < 0) { err("dvb_dmx failed: error %d", ret); goto err_dmx; } fc->hw_frontend.source = DMX_FRONTEND_0; fc->dmxdev.filternum = fc->demux.feednum; fc->dmxdev.demux = &fc->demux.dmx; fc->dmxdev.capabilities = 0; ret = dvb_dmxdev_init(&fc->dmxdev, &fc->dvb_adapter); if (ret < 0) { err("dvb_dmxdev_init failed: error %d", ret); goto err_dmx_dev; } ret = fc->demux.dmx.add_frontend(&fc->demux.dmx, &fc->hw_frontend); if (ret < 0) { err("adding hw_frontend to dmx failed: error %d", ret); goto err_dmx_add_hw_frontend; } fc->mem_frontend.source = DMX_MEMORY_FE; ret = fc->demux.dmx.add_frontend(&fc->demux.dmx, &fc->mem_frontend); if (ret < 0) { err("adding mem_frontend to dmx failed: error %d", ret); goto err_dmx_add_mem_frontend; } ret = fc->demux.dmx.connect_frontend(&fc->demux.dmx, &fc->hw_frontend); if (ret < 0) { err("connect frontend failed: error %d", ret); goto err_connect_frontend; } ret = dvb_net_init(&fc->dvb_adapter, &fc->dvbnet, &fc->demux.dmx); if (ret < 0) { err("dvb_net_init failed: error %d", ret); goto err_net; } fc->init_state |= FC_STATE_DVB_INIT; return 0; err_net: fc->demux.dmx.disconnect_frontend(&fc->demux.dmx); err_connect_frontend: fc->demux.dmx.remove_frontend(&fc->demux.dmx, &fc->mem_frontend); err_dmx_add_mem_frontend: fc->demux.dmx.remove_frontend(&fc->demux.dmx, &fc->hw_frontend); err_dmx_add_hw_frontend: dvb_dmxdev_release(&fc->dmxdev); err_dmx_dev: dvb_dmx_release(&fc->demux); err_dmx: dvb_unregister_adapter(&fc->dvb_adapter); return ret; } static void flexcop_dvb_exit(struct flexcop_device *fc) { if (fc->init_state & FC_STATE_DVB_INIT) { dvb_net_release(&fc->dvbnet); fc->demux.dmx.close(&fc->demux.dmx); fc->demux.dmx.remove_frontend(&fc->demux.dmx, &fc->mem_frontend); fc->demux.dmx.remove_frontend(&fc->demux.dmx, &fc->hw_frontend); dvb_dmxdev_release(&fc->dmxdev); dvb_dmx_release(&fc->demux); dvb_unregister_adapter(&fc->dvb_adapter); deb_info("deinitialized dvb stuff\n"); } fc->init_state &= ~FC_STATE_DVB_INIT; } /* these methods are necessary to achieve the long-term-goal of hiding the * struct flexcop_device from the bus-parts */ void flexcop_pass_dmx_data(struct flexcop_device *fc, u8 *buf, u32 len) { dvb_dmx_swfilter(&fc->demux, buf, len); } EXPORT_SYMBOL(flexcop_pass_dmx_data); void flexcop_pass_dmx_packets(struct flexcop_device *fc, u8 *buf, u32 no) { dvb_dmx_swfilter_packets(&fc->demux, buf, no); } EXPORT_SYMBOL(flexcop_pass_dmx_packets); static void flexcop_reset(struct flexcop_device *fc) { flexcop_ibi_value v210, v204; /* reset the flexcop itself */ fc->write_ibi_reg(fc,ctrl_208,ibi_zero); v210.raw = 0; v210.sw_reset_210.reset_block_000 = 1; v210.sw_reset_210.reset_block_100 = 1; v210.sw_reset_210.reset_block_200 = 1; v210.sw_reset_210.reset_block_300 = 1; v210.sw_reset_210.reset_block_400 = 1; v210.sw_reset_210.reset_block_500 = 1; v210.sw_reset_210.reset_block_600 = 1; v210.sw_reset_210.reset_block_700 = 1; v210.sw_reset_210.Block_reset_enable = 0xb2; v210.sw_reset_210.Special_controls = 0xc259; fc->write_ibi_reg(fc,sw_reset_210,v210); msleep(1); /* reset the periphical devices */ v204 = fc->read_ibi_reg(fc,misc_204); v204.misc_204.Per_reset_sig = 0; fc->write_ibi_reg(fc,misc_204,v204); msleep(1); v204.misc_204.Per_reset_sig = 1; fc->write_ibi_reg(fc,misc_204,v204); } void flexcop_reset_block_300(struct flexcop_device *fc) { flexcop_ibi_value v208_save = fc->read_ibi_reg(fc, ctrl_208), v210 = fc->read_ibi_reg(fc, sw_reset_210); deb_rdump("208: %08x, 210: %08x\n", v208_save.raw, v210.raw); fc->write_ibi_reg(fc,ctrl_208,ibi_zero); v210.sw_reset_210.reset_block_300 = 1; v210.sw_reset_210.Block_reset_enable = 0xb2; fc->write_ibi_reg(fc,sw_reset_210,v210); fc->write_ibi_reg(fc,ctrl_208,v208_save); } struct flexcop_device *flexcop_device_kmalloc(size_t bus_specific_len) { void *bus; struct flexcop_device *fc = kzalloc(sizeof(struct flexcop_device), GFP_KERNEL); if (!fc) { err("no memory"); return NULL; } bus = kzalloc(bus_specific_len, GFP_KERNEL); if (!bus) { err("no memory"); kfree(fc); return NULL; } fc->bus_specific = bus; return fc; } EXPORT_SYMBOL(flexcop_device_kmalloc); void flexcop_device_kfree(struct flexcop_device *fc) { kfree(fc->bus_specific); kfree(fc); } EXPORT_SYMBOL(flexcop_device_kfree); int flexcop_device_initialize(struct flexcop_device *fc) { int ret; ibi_zero.raw = 0; flexcop_reset(fc); flexcop_determine_revision(fc); flexcop_sram_init(fc); flexcop_hw_filter_init(fc); flexcop_smc_ctrl(fc, 0); ret = flexcop_dvb_init(fc); if (ret) goto error; /* i2c has to be done before doing EEProm stuff - * because the EEProm is accessed via i2c */ ret = flexcop_i2c_init(fc); if (ret) goto error; /* do the MAC address reading after initializing the dvb_adapter */ if (fc->get_mac_addr(fc, 0) == 0) { u8 *b = fc->dvb_adapter.proposed_mac; info("MAC address = %pM", b); flexcop_set_mac_filter(fc,b); flexcop_mac_filter_ctrl(fc,1); } else warn("reading of MAC address failed.\n"); ret = flexcop_frontend_init(fc); if (ret) goto error; flexcop_device_name(fc,"initialization of","complete"); return 0; error: flexcop_device_exit(fc); return ret; } EXPORT_SYMBOL(flexcop_device_initialize); void flexcop_device_exit(struct flexcop_device *fc) { flexcop_frontend_exit(fc); flexcop_i2c_exit(fc); flexcop_dvb_exit(fc); } EXPORT_SYMBOL(flexcop_device_exit); static int flexcop_module_init(void) { info(DRIVER_NAME " loaded successfully"); return 0; } static void flexcop_module_cleanup(void) { info(DRIVER_NAME " unloaded successfully"); } module_init(flexcop_module_init); module_exit(flexcop_module_cleanup); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_NAME); MODULE_LICENSE("GPL");
7 7 11 11 53 51 4 4 1 1 6 4 9 3 7 2 2 2 2 5 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 // SPDX-License-Identifier: GPL-2.0+ /* * Support for dynamic clock devices * * Copyright (C) 2010 OMICRON electronics GmbH */ #include <linux/device.h> #include <linux/export.h> #include <linux/file.h> #include <linux/posix-clock.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/uaccess.h> #include "posix-timers.h" /* * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. */ static struct posix_clock *get_posix_clock(struct file *fp) { struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = pccontext->clk; down_read(&clk->rwsem); if (!clk->zombie) return clk; up_read(&clk->rwsem); return NULL; } static void put_posix_clock(struct posix_clock *clk) { up_read(&clk->rwsem); } static ssize_t posix_clock_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) { struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); int err = -EINVAL; if (!clk) return -ENODEV; if (clk->ops.read) err = clk->ops.read(pccontext, fp->f_flags, buf, count); put_posix_clock(clk); return err; } static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) { struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); __poll_t result = 0; if (!clk) return EPOLLERR; if (clk->ops.poll) result = clk->ops.poll(pccontext, fp, wait); put_posix_clock(clk); return result; } static long posix_clock_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); int err = -ENOTTY; if (!clk) return -ENODEV; if (clk->ops.ioctl) err = clk->ops.ioctl(pccontext, cmd, arg); put_posix_clock(clk); return err; } #ifdef CONFIG_COMPAT static long posix_clock_compat_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); int err = -ENOTTY; if (!clk) return -ENODEV; if (clk->ops.ioctl) err = clk->ops.ioctl(pccontext, cmd, arg); put_posix_clock(clk); return err; } #endif static int posix_clock_open(struct inode *inode, struct file *fp) { int err; struct posix_clock *clk = container_of(inode->i_cdev, struct posix_clock, cdev); struct posix_clock_context *pccontext; down_read(&clk->rwsem); if (clk->zombie) { err = -ENODEV; goto out; } pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL); if (!pccontext) { err = -ENOMEM; goto out; } pccontext->clk = clk; if (clk->ops.open) { err = clk->ops.open(pccontext, fp->f_mode); if (err) { kfree(pccontext); goto out; } } fp->private_data = pccontext; get_device(clk->dev); err = 0; out: up_read(&clk->rwsem); return err; } static int posix_clock_release(struct inode *inode, struct file *fp) { struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk; int err = 0; if (!pccontext) return -ENODEV; clk = pccontext->clk; if (clk->ops.release) err = clk->ops.release(pccontext); put_device(clk->dev); kfree(pccontext); fp->private_data = NULL; return err; } static const struct file_operations posix_clock_file_operations = { .owner = THIS_MODULE, .read = posix_clock_read, .poll = posix_clock_poll, .unlocked_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, #ifdef CONFIG_COMPAT .compat_ioctl = posix_clock_compat_ioctl, #endif }; int posix_clock_register(struct posix_clock *clk, struct device *dev) { int err; init_rwsem(&clk->rwsem); cdev_init(&clk->cdev, &posix_clock_file_operations); err = cdev_device_add(&clk->cdev, dev); if (err) { pr_err("%s unable to add device %d:%d\n", dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt)); return err; } clk->cdev.owner = clk->ops.owner; clk->dev = dev; return 0; } EXPORT_SYMBOL_GPL(posix_clock_register); void posix_clock_unregister(struct posix_clock *clk) { cdev_device_del(&clk->cdev, clk->dev); down_write(&clk->rwsem); clk->zombie = true; up_write(&clk->rwsem); put_device(clk->dev); } EXPORT_SYMBOL_GPL(posix_clock_unregister); struct posix_clock_desc { struct file *fp; struct posix_clock *clk; }; static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) { struct file *fp = fget(clockid_to_fd(id)); int err = -EINVAL; if (!fp) return err; if (fp->f_op->open != posix_clock_open || !fp->private_data) goto out; cd->fp = fp; cd->clk = get_posix_clock(fp); err = cd->clk ? 0 : -ENODEV; out: if (err) fput(fp); return err; } static void put_clock_desc(struct posix_clock_desc *cd) { put_posix_clock(cd->clk); fput(cd->fp); } static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) { struct posix_clock_desc cd; int err; err = get_clock_desc(id, &cd); if (err) return err; if ((cd.fp->f_mode & FMODE_WRITE) == 0) { err = -EACCES; goto out; } if (cd.clk->ops.clock_adjtime) err = cd.clk->ops.clock_adjtime(cd.clk, tx); else err = -EOPNOTSUPP; out: put_clock_desc(&cd); return err; } static int pc_clock_gettime(clockid_t id, struct timespec64 *ts) { struct posix_clock_desc cd; int err; err = get_clock_desc(id, &cd); if (err) return err; if (cd.clk->ops.clock_gettime) err = cd.clk->ops.clock_gettime(cd.clk, ts); else err = -EOPNOTSUPP; put_clock_desc(&cd); return err; } static int pc_clock_getres(clockid_t id, struct timespec64 *ts) { struct posix_clock_desc cd; int err; err = get_clock_desc(id, &cd); if (err) return err; if (cd.clk->ops.clock_getres) err = cd.clk->ops.clock_getres(cd.clk, ts); else err = -EOPNOTSUPP; put_clock_desc(&cd); return err; } static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) { struct posix_clock_desc cd; int err; if (!timespec64_valid_strict(ts)) return -EINVAL; err = get_clock_desc(id, &cd); if (err) return err; if ((cd.fp->f_mode & FMODE_WRITE) == 0) { err = -EACCES; goto out; } if (cd.clk->ops.clock_settime) err = cd.clk->ops.clock_settime(cd.clk, ts); else err = -EOPNOTSUPP; out: put_clock_desc(&cd); return err; } const struct k_clock clock_posix_dynamic = { .clock_getres = pc_clock_getres, .clock_set = pc_clock_settime, .clock_get_timespec = pc_clock_gettime, .clock_adj = pc_clock_adjtime, };
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 // SPDX-License-Identifier: GPL-2.0-only /* * vDPA bus. * * Copyright (c) 2020, Red Hat. All rights reserved. * Author: Jason Wang <jasowang@redhat.com> * */ #include <linux/module.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/vdpa.h> #include <uapi/linux/vdpa.h> #include <net/genetlink.h> #include <linux/mod_devicetable.h> #include <linux/virtio_ids.h> static LIST_HEAD(mdev_head); /* A global mutex that protects vdpa management device and device level operations. */ static DECLARE_RWSEM(vdpa_dev_lock); static DEFINE_IDA(vdpa_index_ida); void vdpa_set_status(struct vdpa_device *vdev, u8 status) { down_write(&vdev->cf_lock); vdev->config->set_status(vdev, status); up_write(&vdev->cf_lock); } EXPORT_SYMBOL(vdpa_set_status); static struct genl_family vdpa_nl_family; static int vdpa_dev_probe(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); const struct vdpa_config_ops *ops = vdev->config; u32 max_num, min_num = 1; int ret = 0; d->dma_mask = &d->coherent_dma_mask; ret = dma_set_mask_and_coherent(d, DMA_BIT_MASK(64)); if (ret) return ret; max_num = ops->get_vq_num_max(vdev); if (ops->get_vq_num_min) min_num = ops->get_vq_num_min(vdev); if (max_num < min_num) return -EINVAL; if (drv && drv->probe) ret = drv->probe(vdev); return ret; } static void vdpa_dev_remove(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); if (drv && drv->remove) drv->remove(vdev); } static int vdpa_dev_match(struct device *dev, const struct device_driver *drv) { struct vdpa_device *vdev = dev_to_vdpa(dev); /* Check override first, and if set, only use the named driver */ if (vdev->driver_override) return strcmp(vdev->driver_override, drv->name) == 0; /* Currently devices must be supported by all vDPA bus drivers */ return 1; } static ssize_t driver_override_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct vdpa_device *vdev = dev_to_vdpa(dev); int ret; ret = driver_set_override(dev, &vdev->driver_override, buf, count); if (ret) return ret; return count; } static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { struct vdpa_device *vdev = dev_to_vdpa(dev); ssize_t len; device_lock(dev); len = sysfs_emit(buf, "%s\n", vdev->driver_override); device_unlock(dev); return len; } static DEVICE_ATTR_RW(driver_override); static struct attribute *vdpa_dev_attrs[] = { &dev_attr_driver_override.attr, NULL, }; static const struct attribute_group vdpa_dev_group = { .attrs = vdpa_dev_attrs, }; __ATTRIBUTE_GROUPS(vdpa_dev); static const struct bus_type vdpa_bus = { .name = "vdpa", .dev_groups = vdpa_dev_groups, .match = vdpa_dev_match, .probe = vdpa_dev_probe, .remove = vdpa_dev_remove, }; static void vdpa_release_dev(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); const struct vdpa_config_ops *ops = vdev->config; if (ops->free) ops->free(vdev); ida_free(&vdpa_index_ida, vdev->index); kfree(vdev->driver_override); kfree(vdev); } /** * __vdpa_alloc_device - allocate and initilaize a vDPA device * This allows driver to some prepartion after device is * initialized but before registered. * @parent: the parent device * @config: the bus operations that is supported by this device * @ngroups: number of groups supported by this device * @nas: number of address spaces supported by this device * @size: size of the parent structure that contains private data * @name: name of the vdpa device; optional. * @use_va: indicate whether virtual address must be used by this device * * Driver should use vdpa_alloc_device() wrapper macro instead of * using this directly. * * Return: Returns an error when parent/config/dma_dev is not set or fail to get * ida. */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va) { struct vdpa_device *vdev; int err = -EINVAL; if (!config) goto err; if (!!config->dma_map != !!config->dma_unmap) goto err; /* It should only work for the device that use on-chip IOMMU */ if (use_va && !(config->dma_map || config->set_map)) goto err; err = -ENOMEM; vdev = kzalloc(size, GFP_KERNEL); if (!vdev) goto err; err = ida_alloc(&vdpa_index_ida, GFP_KERNEL); if (err < 0) goto err_ida; vdev->dev.bus = &vdpa_bus; vdev->dev.parent = parent; vdev->dev.release = vdpa_release_dev; vdev->index = err; vdev->config = config; vdev->features_valid = false; vdev->use_va = use_va; vdev->ngroups = ngroups; vdev->nas = nas; if (name) err = dev_set_name(&vdev->dev, "%s", name); else err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index); if (err) goto err_name; init_rwsem(&vdev->cf_lock); device_initialize(&vdev->dev); return vdev; err_name: ida_free(&vdpa_index_ida, vdev->index); err_ida: kfree(vdev); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(__vdpa_alloc_device); static int vdpa_name_match(struct device *dev, const void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); return (strcmp(dev_name(&vdev->dev), data) == 0); } static int __vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { struct device *dev; vdev->nvqs = nvqs; lockdep_assert_held(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match); if (dev) { put_device(dev); return -EEXIST; } return device_add(&vdev->dev); } /** * _vdpa_register_device - register a vDPA device with vdpa lock held * Caller must have a succeed call of vdpa_alloc_device() before. * Caller must invoke this routine in the management device dev_add() * callback after setting up valid mgmtdev for this vdpa device. * @vdev: the vdpa device to be registered to vDPA bus * @nvqs: number of virtqueues supported by this device * * Return: Returns an error when fail to add device to vDPA bus */ int _vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { if (!vdev->mdev) return -EINVAL; return __vdpa_register_device(vdev, nvqs); } EXPORT_SYMBOL_GPL(_vdpa_register_device); /** * vdpa_register_device - register a vDPA device * Callers must have a succeed call of vdpa_alloc_device() before. * @vdev: the vdpa device to be registered to vDPA bus * @nvqs: number of virtqueues supported by this device * * Return: Returns an error when fail to add to vDPA bus */ int vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { int err; down_write(&vdpa_dev_lock); err = __vdpa_register_device(vdev, nvqs); up_write(&vdpa_dev_lock); return err; } EXPORT_SYMBOL_GPL(vdpa_register_device); /** * _vdpa_unregister_device - unregister a vDPA device * Caller must invoke this routine as part of management device dev_del() * callback. * @vdev: the vdpa device to be unregisted from vDPA bus */ void _vdpa_unregister_device(struct vdpa_device *vdev) { lockdep_assert_held(&vdpa_dev_lock); WARN_ON(!vdev->mdev); device_unregister(&vdev->dev); } EXPORT_SYMBOL_GPL(_vdpa_unregister_device); /** * vdpa_unregister_device - unregister a vDPA device * @vdev: the vdpa device to be unregisted from vDPA bus */ void vdpa_unregister_device(struct vdpa_device *vdev) { down_write(&vdpa_dev_lock); device_unregister(&vdev->dev); up_write(&vdpa_dev_lock); } EXPORT_SYMBOL_GPL(vdpa_unregister_device); /** * __vdpa_register_driver - register a vDPA device driver * @drv: the vdpa device driver to be registered * @owner: module owner of the driver * * Return: Returns an err when fail to do the registration */ int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner) { drv->driver.bus = &vdpa_bus; drv->driver.owner = owner; return driver_register(&drv->driver); } EXPORT_SYMBOL_GPL(__vdpa_register_driver); /** * vdpa_unregister_driver - unregister a vDPA device driver * @drv: the vdpa device driver to be unregistered */ void vdpa_unregister_driver(struct vdpa_driver *drv) { driver_unregister(&drv->driver); } EXPORT_SYMBOL_GPL(vdpa_unregister_driver); /** * vdpa_mgmtdev_register - register a vdpa management device * * @mdev: Pointer to vdpa management device * vdpa_mgmtdev_register() register a vdpa management device which supports * vdpa device management. * Return: Returns 0 on success or failure when required callback ops are not * initialized. */ int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev) { if (!mdev->device || !mdev->ops || !mdev->ops->dev_add || !mdev->ops->dev_del) return -EINVAL; INIT_LIST_HEAD(&mdev->list); down_write(&vdpa_dev_lock); list_add_tail(&mdev->list, &mdev_head); up_write(&vdpa_dev_lock); return 0; } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register); static int vdpa_match_remove(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_mgmt_dev *mdev = vdev->mdev; if (mdev == data) mdev->ops->dev_del(mdev, vdev); return 0; } void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev) { down_write(&vdpa_dev_lock); list_del(&mdev->list); /* Filter out all the entries belong to this management device and delete it. */ bus_for_each_dev(&vdpa_bus, NULL, mdev, vdpa_match_remove); up_write(&vdpa_dev_lock); } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister); static void vdpa_get_config_unlocked(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { const struct vdpa_config_ops *ops = vdev->config; /* * Config accesses aren't supposed to trigger before features are set. * If it does happen we assume a legacy guest. */ if (!vdev->features_valid) vdpa_set_features_unlocked(vdev, 0); ops->get_config(vdev, offset, buf, len); } /** * vdpa_get_config - Get one or more device configuration fields. * @vdev: vdpa device to operate on * @offset: starting byte offset of the field * @buf: buffer pointer to read to * @len: length of the configuration fields in bytes */ void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { down_read(&vdev->cf_lock); vdpa_get_config_unlocked(vdev, offset, buf, len); up_read(&vdev->cf_lock); } EXPORT_SYMBOL_GPL(vdpa_get_config); /** * vdpa_set_config - Set one or more device configuration fields. * @vdev: vdpa device to operate on * @offset: starting byte offset of the field * @buf: buffer pointer to read from * @length: length of the configuration fields in bytes */ void vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf, unsigned int length) { down_write(&vdev->cf_lock); vdev->config->set_config(vdev, offset, buf, length); up_write(&vdev->cf_lock); } EXPORT_SYMBOL_GPL(vdpa_set_config); static bool mgmtdev_handle_match(const struct vdpa_mgmt_dev *mdev, const char *busname, const char *devname) { /* Bus name is optional for simulated management device, so ignore the * device with bus if bus attribute is provided. */ if ((busname && !mdev->device->bus) || (!busname && mdev->device->bus)) return false; if (!busname && strcmp(dev_name(mdev->device), devname) == 0) return true; if (busname && (strcmp(mdev->device->bus->name, busname) == 0) && (strcmp(dev_name(mdev->device), devname) == 0)) return true; return false; } static struct vdpa_mgmt_dev *vdpa_mgmtdev_get_from_attr(struct nlattr **attrs) { struct vdpa_mgmt_dev *mdev; const char *busname = NULL; const char *devname; if (!attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]) return ERR_PTR(-EINVAL); devname = nla_data(attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]); if (attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]) busname = nla_data(attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]); list_for_each_entry(mdev, &mdev_head, list) { if (mgmtdev_handle_match(mdev, busname, devname)) return mdev; } return ERR_PTR(-ENODEV); } static int vdpa_nl_mgmtdev_handle_fill(struct sk_buff *msg, const struct vdpa_mgmt_dev *mdev) { if (mdev->device->bus && nla_put_string(msg, VDPA_ATTR_MGMTDEV_BUS_NAME, mdev->device->bus->name)) return -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, dev_name(mdev->device))) return -EMSGSIZE; return 0; } static u64 vdpa_mgmtdev_get_classes(const struct vdpa_mgmt_dev *mdev, unsigned int *nclasses) { u64 supported_classes = 0; unsigned int n = 0; for (int i = 0; mdev->id_table[i].device; i++) { if (mdev->id_table[i].device > 63) continue; supported_classes |= BIT_ULL(mdev->id_table[i].device); n++; } if (nclasses) *nclasses = n; return supported_classes; } static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *msg, u32 portid, u32 seq, int flags) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_MGMTDEV_NEW); if (!hdr) return -EMSGSIZE; err = vdpa_nl_mgmtdev_handle_fill(msg, mdev); if (err) goto msg_err; if (nla_put_u64_64bit(msg, VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, vdpa_mgmtdev_get_classes(mdev, NULL), VDPA_ATTR_UNSPEC)) { err = -EMSGSIZE; goto msg_err; } if (nla_put_u32(msg, VDPA_ATTR_DEV_MGMTDEV_MAX_VQS, mdev->max_supported_vqs)) { err = -EMSGSIZE; goto msg_err; } if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_SUPPORTED_FEATURES, mdev->supported_features, VDPA_ATTR_PAD)) { err = -EMSGSIZE; goto msg_err; } genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_mgmtdev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_mgmt_dev *mdev; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; down_read(&vdpa_dev_lock); mdev = vdpa_mgmtdev_get_from_attr(info->attrs); if (IS_ERR(mdev)) { up_read(&vdpa_dev_lock); NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified mgmt device"); err = PTR_ERR(mdev); goto out; } err = vdpa_mgmtdev_fill(mdev, msg, info->snd_portid, info->snd_seq, 0); up_read(&vdpa_dev_lock); if (err) goto out; err = genlmsg_reply(msg, info); return err; out: nlmsg_free(msg); return err; } static int vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_mgmt_dev *mdev; int start = cb->args[0]; int idx = 0; int err; down_read(&vdpa_dev_lock); list_for_each_entry(mdev, &mdev_head, list) { if (idx < start) { idx++; continue; } err = vdpa_mgmtdev_fill(mdev, msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) goto out; idx++; } out: up_read(&vdpa_dev_lock); cb->args[0] = idx; return msg->len; } #define VDPA_DEV_NET_ATTRS_MASK (BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) | \ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) | \ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) /* * Bitmask for all per-device features: feature bits VIRTIO_TRANSPORT_F_START * through VIRTIO_TRANSPORT_F_END are unset, i.e. 0xfffffc000fffffff for * all 64bit features. If the features are extended beyond 64 bits, or new * "holes" are reserved for other type of features than per-device, this * macro would have to be updated. */ #define VIRTIO_DEVICE_F_MASK (~0ULL << (VIRTIO_TRANSPORT_F_END + 1) | \ ((1ULL << VIRTIO_TRANSPORT_F_START) - 1)) static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_dev_set_config config = {}; struct nlattr **nl_attrs = info->attrs; struct vdpa_mgmt_dev *mdev; unsigned int ncls = 0; const u8 *macaddr; const char *name; u64 classes; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) { macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]); memcpy(config.net.mac, macaddr, sizeof(config.net.mac)); config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); } if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]) { config.net.mtu = nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]); config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU); } if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]) { config.net.max_vq_pairs = nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]); if (!config.net.max_vq_pairs) { NL_SET_ERR_MSG_MOD(info->extack, "At least one pair of VQs is required"); return -EINVAL; } config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP); } if (nl_attrs[VDPA_ATTR_DEV_FEATURES]) { u64 missing = 0x0ULL; config.device_features = nla_get_u64(nl_attrs[VDPA_ATTR_DEV_FEATURES]); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR] && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MAC))) missing |= BIT_ULL(VIRTIO_NET_F_MAC); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU] && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MTU))) missing |= BIT_ULL(VIRTIO_NET_F_MTU); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP] && config.net.max_vq_pairs > 1 && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MQ))) missing |= BIT_ULL(VIRTIO_NET_F_MQ); if (missing) { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Missing features 0x%llx for provided attributes", missing); return -EINVAL; } config.mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES); } /* Skip checking capability if user didn't prefer to configure any * device networking attributes. It is likely that user might have used * a device specific method to configure such attributes or using device * default attributes. */ if ((config.mask & VDPA_DEV_NET_ATTRS_MASK) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; down_write(&vdpa_dev_lock); mdev = vdpa_mgmtdev_get_from_attr(info->attrs); if (IS_ERR(mdev)) { NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified management device"); err = PTR_ERR(mdev); goto err; } if ((config.mask & mdev->config_attr_mask) != config.mask) { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Some provided attributes are not supported: 0x%llx", config.mask & ~mdev->config_attr_mask); err = -EOPNOTSUPP; goto err; } classes = vdpa_mgmtdev_get_classes(mdev, &ncls); if (config.mask & VDPA_DEV_NET_ATTRS_MASK && !(classes & BIT_ULL(VIRTIO_ID_NET))) { NL_SET_ERR_MSG_MOD(info->extack, "Network class attributes provided on unsupported management device"); err = -EINVAL; goto err; } if (!(config.mask & VDPA_DEV_NET_ATTRS_MASK) && config.mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES) && classes & BIT_ULL(VIRTIO_ID_NET) && ncls > 1 && config.device_features & VIRTIO_DEVICE_F_MASK) { NL_SET_ERR_MSG_MOD(info->extack, "Management device supports multi-class while device features specified are ambiguous"); err = -EINVAL; goto err; } err = mdev->ops->dev_add(mdev, name, &config); err: up_write(&vdpa_dev_lock); return err; } static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_mgmt_dev *mdev; struct vdpa_device *vdev; struct device *dev; const char *name; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); down_write(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "Only user created device can be deleted by user"); err = -EINVAL; goto mdev_err; } mdev = vdev->mdev; mdev->ops->dev_del(mdev, vdev); mdev_err: put_device(dev); dev_err: up_write(&vdpa_dev_lock); return err; } static int vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { u16 max_vq_size; u16 min_vq_size = 1; u32 device_id; u32 vendor_id; void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_NEW); if (!hdr) return -EMSGSIZE; err = vdpa_nl_mgmtdev_handle_fill(msg, vdev->mdev); if (err) goto msg_err; device_id = vdev->config->get_device_id(vdev); vendor_id = vdev->config->get_vendor_id(vdev); max_vq_size = vdev->config->get_vq_num_max(vdev); if (vdev->config->get_vq_num_min) min_vq_size = vdev->config->get_vq_num_min(vdev); err = -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_VENDOR_ID, vendor_id)) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_MAX_VQS, vdev->nvqs)) goto msg_err; if (nla_put_u16(msg, VDPA_ATTR_DEV_MAX_VQ_SIZE, max_vq_size)) goto msg_err; if (nla_put_u16(msg, VDPA_ATTR_DEV_MIN_VQ_SIZE, min_vq_size)) goto msg_err; genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { err = -EINVAL; goto mdev_err; } err = vdpa_dev_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); if (err) goto mdev_err; err = genlmsg_reply(msg, info); put_device(dev); up_read(&vdpa_dev_lock); return err; mdev_err: put_device(dev); err: up_read(&vdpa_dev_lock); nlmsg_free(msg); return err; } struct vdpa_dev_dump_info { struct sk_buff *msg; struct netlink_callback *cb; int start_idx; int idx; }; static int vdpa_dev_dump(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_dev_dump_info *info = data; int err; if (!vdev->mdev) return 0; if (info->idx < info->start_idx) { info->idx++; return 0; } err = vdpa_dev_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack); if (err) return err; info->idx++; return 0; } static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_dev_dump_info info; info.msg = msg; info.cb = cb; info.start_idx = cb->args[0]; info.idx = 0; down_read(&vdpa_dev_lock); bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_dump); up_read(&vdpa_dev_lock); cb->args[0] = info.idx; return msg->len; } static int vdpa_dev_net_mq_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_MQ)) == 0 && (features & BIT_ULL(VIRTIO_NET_F_RSS)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->max_virtqueue_pairs); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, val_u16); } static int vdpa_dev_net_mtu_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_MTU)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->mtu); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16); } static int vdpa_dev_net_mac_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { if ((features & BIT_ULL(VIRTIO_NET_F_MAC)) == 0) return 0; return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_MACADDR, sizeof(config->mac), config->mac); } static int vdpa_dev_net_status_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_STATUS)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->status); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16); } static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) { struct virtio_net_config config = {}; u64 features_device; vdev->config->get_config(vdev, 0, &config, sizeof(config)); features_device = vdev->config->get_device_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, VDPA_ATTR_PAD)) return -EMSGSIZE; if (vdpa_dev_net_mtu_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_net_mac_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_net_status_config_fill(msg, features_device, &config)) return -EMSGSIZE; return vdpa_dev_net_mq_config_fill(msg, features_device, &config); } static int vdpa_dev_blk_capacity_config_fill(struct sk_buff *msg, const struct virtio_blk_config *config) { u64 val_u64; val_u64 = __virtio64_to_cpu(true, config->capacity); return nla_put_u64_64bit(msg, VDPA_ATTR_DEV_BLK_CFG_CAPACITY, val_u64, VDPA_ATTR_PAD); } static int vdpa_dev_blk_seg_size_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_SIZE_MAX)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->size_max); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_SIZE_MAX, val_u32); } /* fill the block size*/ static int vdpa_dev_blk_block_size_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_BLK_SIZE)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->blk_size); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_BLK_SIZE, val_u32); } static int vdpa_dev_blk_seg_max_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_SEG_MAX)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->seg_max); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_SEG_MAX, val_u32); } static int vdpa_dev_blk_mq_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_BLK_F_MQ)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->num_queues); return nla_put_u16(msg, VDPA_ATTR_DEV_BLK_CFG_NUM_QUEUES, val_u16); } static int vdpa_dev_blk_topology_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u16 min_io_size; u32 opt_io_size; if ((features & BIT_ULL(VIRTIO_BLK_F_TOPOLOGY)) == 0) return 0; min_io_size = __virtio16_to_cpu(true, config->min_io_size); opt_io_size = __virtio32_to_cpu(true, config->opt_io_size); if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_PHY_BLK_EXP, config->physical_block_exp)) return -EMSGSIZE; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_ALIGN_OFFSET, config->alignment_offset)) return -EMSGSIZE; if (nla_put_u16(msg, VDPA_ATTR_DEV_BLK_CFG_MIN_IO_SIZE, min_io_size)) return -EMSGSIZE; if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_OPT_IO_SIZE, opt_io_size)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_discard_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_DISCARD)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->max_discard_sectors); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_DISCARD_SEC, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->max_discard_seg); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_DISCARD_SEG, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->discard_sector_alignment); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_DISCARD_SEC_ALIGN, val_u32)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_write_zeroes_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_WRITE_ZEROES)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->max_write_zeroes_sectors); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEC, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->max_write_zeroes_seg); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEG, val_u32)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_ro_config_fill(struct sk_buff *msg, u64 features) { u8 ro; ro = ((features & BIT_ULL(VIRTIO_BLK_F_RO)) == 0) ? 0 : 1; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_READ_ONLY, ro)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_flush_config_fill(struct sk_buff *msg, u64 features) { u8 flush; flush = ((features & BIT_ULL(VIRTIO_BLK_F_FLUSH)) == 0) ? 0 : 1; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_FLUSH, flush)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) { struct virtio_blk_config config = {}; u64 features_device; vdev->config->get_config(vdev, 0, &config, sizeof(config)); features_device = vdev->config->get_device_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, VDPA_ATTR_PAD)) return -EMSGSIZE; if (vdpa_dev_blk_capacity_config_fill(msg, &config)) return -EMSGSIZE; if (vdpa_dev_blk_seg_size_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_block_size_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_seg_max_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_mq_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_topology_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_discard_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_write_zeroes_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_ro_config_fill(msg, features_device)) return -EMSGSIZE; if (vdpa_dev_blk_flush_config_fill(msg, features_device)) return -EMSGSIZE; return 0; } static int vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { u64 features_driver; u8 status = 0; u32 device_id; void *hdr; int err; down_read(&vdev->cf_lock); hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_CONFIG_GET); if (!hdr) { err = -EMSGSIZE; goto out; } if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { err = -EMSGSIZE; goto msg_err; } device_id = vdev->config->get_device_id(vdev); if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { err = -EMSGSIZE; goto msg_err; } /* only read driver features after the feature negotiation is done */ status = vdev->config->get_status(vdev); if (status & VIRTIO_CONFIG_S_FEATURES_OK) { features_driver = vdev->config->get_driver_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features_driver, VDPA_ATTR_PAD)) { err = -EMSGSIZE; goto msg_err; } } switch (device_id) { case VIRTIO_ID_NET: err = vdpa_dev_net_config_fill(vdev, msg); break; case VIRTIO_ID_BLOCK: err = vdpa_dev_blk_config_fill(vdev, msg); break; default: err = -EOPNOTSUPP; break; } if (err) goto msg_err; up_read(&vdev->cf_lock); genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); out: up_read(&vdev->cf_lock); return err; } static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { struct virtio_net_config config = {}; u64 features; u8 status; int err; status = vdev->config->get_status(vdev); if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { NL_SET_ERR_MSG_MOD(info->extack, "feature negotiation not complete"); return -EAGAIN; } vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config)); features = vdev->config->get_driver_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features, VDPA_ATTR_PAD)) return -EMSGSIZE; err = vdpa_dev_net_mq_config_fill(msg, features, &config); if (err) return err; if (nla_put_u32(msg, VDPA_ATTR_DEV_QUEUE_INDEX, index)) return -EMSGSIZE; err = vdev->config->get_vendor_vq_stats(vdev, index, msg, info->extack); if (err) return err; return 0; } static int vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { int err; down_read(&vdev->cf_lock); if (!vdev->config->get_vendor_vq_stats) { err = -EOPNOTSUPP; goto out; } err = vdpa_fill_stats_rec(vdev, msg, info, index); out: up_read(&vdev->cf_lock); return err; } static int vdpa_dev_vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { u32 device_id; void *hdr; int err; u32 portid = info->snd_portid; u32 seq = info->snd_seq; u32 flags = 0; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_VSTATS_GET); if (!hdr) return -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { err = -EMSGSIZE; goto undo_msg; } device_id = vdev->config->get_device_id(vdev); if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { err = -EMSGSIZE; goto undo_msg; } switch (device_id) { case VIRTIO_ID_NET: if (index > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) { NL_SET_ERR_MSG_MOD(info->extack, "queue index exceeds max value"); err = -ERANGE; break; } err = vendor_stats_fill(vdev, msg, info, index); break; default: err = -EOPNOTSUPP; break; } genlmsg_end(msg, hdr); return err; undo_msg: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_dev_config_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } err = vdpa_dev_config_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); if (!err) err = genlmsg_reply(msg, info); mdev_err: put_device(dev); dev_err: up_read(&vdpa_dev_lock); if (err) nlmsg_free(msg); return err; } static int vdpa_dev_net_device_attr_set(struct vdpa_device *vdev, struct genl_info *info) { struct vdpa_dev_set_config set_config = {}; struct vdpa_mgmt_dev *mdev = vdev->mdev; struct nlattr **nl_attrs = info->attrs; const u8 *macaddr; int err = -EOPNOTSUPP; down_write(&vdev->cf_lock); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) { set_config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]); if (is_valid_ether_addr(macaddr)) { ether_addr_copy(set_config.net.mac, macaddr); if (mdev->ops->dev_set_attr) { err = mdev->ops->dev_set_attr(mdev, vdev, &set_config); } else { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Operation not supported by the device."); } } else { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Invalid MAC address"); } } up_write(&vdev->cf_lock); return err; } static int vdpa_nl_cmd_dev_attr_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct device *dev; const char *name; u64 classes; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); down_write(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } classes = vdpa_mgmtdev_get_classes(vdev->mdev, NULL); if (classes & BIT_ULL(VIRTIO_ID_NET)) { err = vdpa_dev_net_device_attr_set(vdev, info); } else { NL_SET_ERR_MSG_FMT_MOD(info->extack, "%s device not supported", name); } mdev_err: put_device(dev); dev_err: up_write(&vdpa_dev_lock); return err; } static int vdpa_dev_config_dump(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_dev_dump_info *info = data; int err; if (!vdev->mdev) return 0; if (info->idx < info->start_idx) { info->idx++; return 0; } err = vdpa_dev_config_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack); if (err) return err; info->idx++; return 0; } static int vdpa_nl_cmd_dev_config_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_dev_dump_info info; info.msg = msg; info.cb = cb; info.start_idx = cb->args[0]; info.idx = 0; down_read(&vdpa_dev_lock); bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_config_dump); up_read(&vdpa_dev_lock); cb->args[0] = info.idx; return msg->len; } static int vdpa_nl_cmd_dev_stats_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; u32 index; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; if (!info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; index = nla_get_u32(info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]); down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } err = vdpa_dev_vendor_stats_fill(vdev, msg, info, index); if (err) goto mdev_err; err = genlmsg_reply(msg, info); put_device(dev); up_read(&vdpa_dev_lock); return err; mdev_err: put_device(dev); dev_err: nlmsg_free(msg); up_read(&vdpa_dev_lock); return err; } static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = { [VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING }, [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING }, [VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING }, [VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR, [VDPA_ATTR_DEV_NET_CFG_MAX_VQP] = { .type = NLA_U16 }, /* virtio spec 1.1 section 5.1.4.1 for valid MTU range */ [VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68), [VDPA_ATTR_DEV_QUEUE_INDEX] = { .type = NLA_U32 }, [VDPA_ATTR_DEV_FEATURES] = { .type = NLA_U64 }, }; static const struct genl_ops vdpa_nl_ops[] = { { .cmd = VDPA_CMD_MGMTDEV_GET, .doit = vdpa_nl_cmd_mgmtdev_get_doit, .dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit, }, { .cmd = VDPA_CMD_DEV_NEW, .doit = vdpa_nl_cmd_dev_add_set_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_DEL, .doit = vdpa_nl_cmd_dev_del_set_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_GET, .doit = vdpa_nl_cmd_dev_get_doit, .dumpit = vdpa_nl_cmd_dev_get_dumpit, }, { .cmd = VDPA_CMD_DEV_CONFIG_GET, .doit = vdpa_nl_cmd_dev_config_get_doit, .dumpit = vdpa_nl_cmd_dev_config_get_dumpit, }, { .cmd = VDPA_CMD_DEV_VSTATS_GET, .doit = vdpa_nl_cmd_dev_stats_get_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_ATTR_SET, .doit = vdpa_nl_cmd_dev_attr_set_doit, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family vdpa_nl_family __ro_after_init = { .name = VDPA_GENL_NAME, .version = VDPA_GENL_VERSION, .maxattr = VDPA_ATTR_MAX, .policy = vdpa_nl_policy, .netnsok = false, .module = THIS_MODULE, .ops = vdpa_nl_ops, .n_ops = ARRAY_SIZE(vdpa_nl_ops), .resv_start_op = VDPA_CMD_DEV_VSTATS_GET + 1, }; static int vdpa_init(void) { int err; err = bus_register(&vdpa_bus); if (err) return err; err = genl_register_family(&vdpa_nl_family); if (err) goto err; return 0; err: bus_unregister(&vdpa_bus); return err; } static void __exit vdpa_exit(void) { genl_unregister_family(&vdpa_nl_family); bus_unregister(&vdpa_bus); ida_destroy(&vdpa_index_ida); } core_initcall(vdpa_init); module_exit(vdpa_exit); MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>"); MODULE_DESCRIPTION("vDPA bus"); MODULE_LICENSE("GPL v2");
60 61 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 // SPDX-License-Identifier: GPL-2.0-only #include <linux/types.h> #include <linux/netfilter.h> #include <net/tcp.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_seqadj.h> int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_conn_seqadj *seqadj; struct nf_ct_seqadj *this_way; if (off == 0) return 0; set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); seqadj = nfct_seqadj(ct); this_way = &seqadj->seq[dir]; this_way->offset_before = off; this_way->offset_after = off; return 0; } EXPORT_SYMBOL_GPL(nf_ct_seqadj_init); int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, __be32 seq, s32 off) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_ct_seqadj *this_way; if (off == 0) return 0; if (unlikely(!seqadj)) { WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n"); return 0; } set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); spin_lock_bh(&ct->lock); this_way = &seqadj->seq[dir]; if (this_way->offset_before == this_way->offset_after || before(this_way->correction_pos, ntohl(seq))) { this_way->correction_pos = ntohl(seq); this_way->offset_before = this_way->offset_after; this_way->offset_after += off; } spin_unlock_bh(&ct->lock); return 0; } EXPORT_SYMBOL_GPL(nf_ct_seqadj_set); void nf_ct_tcp_seqadj_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off) { const struct tcphdr *th; if (nf_ct_protonum(ct) != IPPROTO_TCP) return; th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); nf_ct_seqadj_set(ct, ctinfo, th->seq, off); } EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set); /* Adjust one found SACK option including checksum correction */ static void nf_ct_sack_block_adjust(struct sk_buff *skb, struct tcphdr *tcph, unsigned int sackoff, unsigned int sackend, struct nf_ct_seqadj *seq) { while (sackoff < sackend) { struct tcp_sack_block_wire *sack; __be32 new_start_seq, new_end_seq; sack = (void *)skb->data + sackoff; if (after(ntohl(sack->start_seq) - seq->offset_before, seq->correction_pos)) new_start_seq = htonl(ntohl(sack->start_seq) - seq->offset_after); else new_start_seq = htonl(ntohl(sack->start_seq) - seq->offset_before); if (after(ntohl(sack->end_seq) - seq->offset_before, seq->correction_pos)) new_end_seq = htonl(ntohl(sack->end_seq) - seq->offset_after); else new_end_seq = htonl(ntohl(sack->end_seq) - seq->offset_before); pr_debug("sack_adjust: start_seq: %u->%u, end_seq: %u->%u\n", ntohl(sack->start_seq), ntohl(new_start_seq), ntohl(sack->end_seq), ntohl(new_end_seq)); inet_proto_csum_replace4(&tcph->check, skb, sack->start_seq, new_start_seq, false); inet_proto_csum_replace4(&tcph->check, skb, sack->end_seq, new_end_seq, false); sack->start_seq = new_start_seq; sack->end_seq = new_end_seq; sackoff += sizeof(*sack); } } /* TCP SACK sequence number adjustment */ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { struct tcphdr *tcph = (void *)skb->data + protoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); unsigned int dir, optoff, optend; optoff = protoff + sizeof(struct tcphdr); optend = protoff + tcph->doff * 4; if (skb_ensure_writable(skb, optend)) return 0; tcph = (void *)skb->data + protoff; dir = CTINFO2DIR(ctinfo); while (optoff < optend) { /* Usually: option, length. */ unsigned char *op = skb->data + optoff; switch (op[0]) { case TCPOPT_EOL: return 1; case TCPOPT_NOP: optoff++; continue; default: /* no partial options */ if (optoff + 1 == optend || optoff + op[1] > optend || op[1] < 2) return 0; if (op[0] == TCPOPT_SACK && op[1] >= 2+TCPOLEN_SACK_PERBLOCK && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) nf_ct_sack_block_adjust(skb, tcph, optoff + 2, optoff+op[1], &seqadj->seq[!dir]); optoff += op[1]; } } return 1; } /* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct tcphdr *tcph; __be32 newseq, newack; s32 seqoff, ackoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way, *other_way; int res = 1; this_way = &seqadj->seq[dir]; other_way = &seqadj->seq[!dir]; if (skb_ensure_writable(skb, protoff + sizeof(*tcph))) return 0; tcph = (void *)skb->data + protoff; spin_lock_bh(&ct->lock); if (after(ntohl(tcph->seq), this_way->correction_pos)) seqoff = this_way->offset_after; else seqoff = this_way->offset_before; newseq = htonl(ntohl(tcph->seq) + seqoff); inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); pr_debug("Adjusting sequence number from %u->%u\n", ntohl(tcph->seq), ntohl(newseq)); tcph->seq = newseq; if (!tcph->ack) goto out; if (after(ntohl(tcph->ack_seq) - other_way->offset_before, other_way->correction_pos)) ackoff = other_way->offset_after; else ackoff = other_way->offset_before; newack = htonl(ntohl(tcph->ack_seq) - ackoff); inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, false); pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), ntohl(newack)); tcph->ack_seq = newack; res = nf_ct_sack_adjust(skb, protoff, ct, ctinfo); out: spin_unlock_bh(&ct->lock); return res; } EXPORT_SYMBOL_GPL(nf_ct_seq_adjust); s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way; if (!seqadj) return 0; this_way = &seqadj->seq[dir]; return after(seq, this_way->correction_pos) ? this_way->offset_after : this_way->offset_before; } EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
5 242 247 247 45 26 26 78 30 19 19 106 2 5 5 78 71 40 29 31 78 21 38 96 1640 1236 1 1 25 441 27 902 23 890 893 896 887 1 341 180 411 291 895 6 3 3 2 2 9 9 249 249 248 245 248 248 1 249 249 249 5 247 3 3 2 1 2 1 239 238 240 239 198 48 240 8 1 7 7 1 7 240 48 199 242 10 64 14 13 2 2 10 2 4 3 5 1 4 4 2 2 2 2 3 1 4 4 2 14 14 2 1 2 9 2 3 4 1 1 5 1 3 4 2 1 1 1 30 30 30 64 67 7 3 57 4 57 57 53 18 51 14 9 14 47 6 5 51 52 5 57 42 51 6 55 54 20 31 31 31 12 28 28 27 44 1 11 8 1 2 25 18 30 30 40 18 57 56 38 38 7 5 20 6 6 136 1 135 137 136 135 136 2 1 14 14 1 67 38 1748 1596 114 439 421 421 452 440 439 342 124 414 56 425 41 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 // SPDX-License-Identifier: GPL-2.0-or-later /* * Neighbour Discovery for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Mike Shaver <shaver@ingenia.com> */ /* * Changes: * * Alexey I. Froloff : RFC6106 (DNSSL) support * Pierre Ynard : export userland ND options * through netlink (RDNSS support) * Lars Fenneberg : fixed MTU setting on receipt * of an RA. * Janos Farkas : kmalloc failure checks * Alexey Kuznetsov : state machine reworked * and moved to net/core. * Pekka Savola : RFC2461 validation * YOSHIFUJI Hideaki @USAGI : Verify ND options properly */ #define pr_fmt(fmt) "ICMPv6: " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/sched.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/route.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/if_arp.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/jhash.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/icmp.h> #include <net/netlink.h> #include <linux/rtnetlink.h> #include <net/flow.h> #include <net/ip6_checksum.h> #include <net/inet_common.h> #include <linux/proc_fs.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey); static bool ndisc_allow_add(const struct net_device *dev, struct netlink_ext_ack *extack); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); static int pndisc_constructor(struct pneigh_entry *n); static void pndisc_destructor(struct pneigh_entry *n); static void pndisc_redo(struct sk_buff *skb); static int ndisc_is_multicast(const void *pkey); static const struct neigh_ops ndisc_generic_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops ndisc_hh_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops ndisc_direct_ops = { .family = AF_INET6, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table nd_tbl = { .family = AF_INET6, .key_len = sizeof(struct in6_addr), .protocol = cpu_to_be16(ETH_P_IPV6), .hash = ndisc_hash, .key_eq = ndisc_key_eq, .constructor = ndisc_constructor, .pconstructor = pndisc_constructor, .pdestructor = pndisc_destructor, .proxy_redo = pndisc_redo, .is_multicast = ndisc_is_multicast, .allow_add = ndisc_allow_add, .id = "ndisc_cache", .parms = { .tbl = &nd_tbl, .reachable_time = ND_REACHABLE_TIME, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER, [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL_GPL(nd_tbl); void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, int data_len, int pad) { int space = __ndisc_opt_addr_space(data_len, pad); u8 *opt = skb_put(skb, space); opt[0] = type; opt[1] = space>>3; memset(opt + 2, 0, pad); opt += pad; space -= pad; memcpy(opt+2, data, data_len); data_len += 2; opt += data_len; space -= data_len; if (space > 0) memset(opt, 0, space); } EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option); static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, u8 icmp6_type) { __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, ndisc_addr_option_pad(skb->dev->type)); ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type); } static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb, void *ha, const u8 *ops_data) { ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT); ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data); } static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { int type; if (!cur || !end || cur >= end) return NULL; type = cur->nd_opt_type; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while (cur < end && cur->nd_opt_type != type); return cur <= end && cur->nd_opt_type == type ? cur : NULL; } static inline int ndisc_is_useropt(const struct net_device *dev, struct nd_opt_hdr *opt) { return opt->nd_opt_type == ND_OPT_PREFIX_INFO || opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || opt->nd_opt_type == ND_OPT_6CO || opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || opt->nd_opt_type == ND_OPT_PREF64; } static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { if (!cur || !end || cur >= end) return NULL; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while (cur < end && !ndisc_is_useropt(dev, cur)); return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL; } struct ndisc_options *ndisc_parse_options(const struct net_device *dev, u8 *opt, int opt_len, struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; if (!nd_opt || opt_len < 0 || !ndopts) return NULL; memset(ndopts, 0, sizeof(*ndopts)); while (opt_len) { bool unknown = false; int l; if (opt_len < sizeof(struct nd_opt_hdr)) return NULL; l = nd_opt->nd_opt_len << 3; if (opt_len < l || l == 0) return NULL; if (ndisc_ops_parse_options(dev, nd_opt, ndopts)) goto next_opt; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: case ND_OPT_MTU: case ND_OPT_NONCE: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { ND_PRINTK(2, warn, "%s: duplicated ND6 option found: type=%d\n", __func__, nd_opt->nd_opt_type); } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFO: ndopts->nd_opts_pi_end = nd_opt; if (!ndopts->nd_opt_array[nd_opt->nd_opt_type]) ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; break; #ifdef CONFIG_IPV6_ROUTE_INFO case ND_OPT_ROUTE_INFO: ndopts->nd_opts_ri_end = nd_opt; if (!ndopts->nd_opts_ri) ndopts->nd_opts_ri = nd_opt; break; #endif default: unknown = true; } if (ndisc_is_useropt(dev, nd_opt)) { ndopts->nd_useropts_end = nd_opt; if (!ndopts->nd_useropts) ndopts->nd_useropts = nd_opt; } else if (unknown) { /* * Unknown options must be silently ignored, * to accommodate future extension to the * protocol. */ ND_PRINTK(2, notice, "%s: ignored unsupported option; type=%d, len=%d\n", __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len); } next_opt: opt_len -= l; nd_opt = ((void *)nd_opt) + l; } return ndopts; } int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_IEEE802: /* Not sure. Check it later. --ANK */ case ARPHRD_FDDI: ipv6_eth_mc_map(addr, buf); return 0; case ARPHRD_ARCNET: ipv6_arcnet_mc_map(addr, buf); return 0; case ARPHRD_INFINIBAND: ipv6_ib_mc_map(addr, dev->broadcast, buf); return 0; case ARPHRD_IPGRE: return ipv6_ipgre_mc_map(addr, dev->broadcast, buf); default: if (dir) { memcpy(buf, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } EXPORT_SYMBOL(ndisc_mc_map); static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return ndisc_hashfn(pkey, dev, hash_rnd); } static bool ndisc_key_eq(const struct neighbour *n, const void *pkey) { return neigh_key_eq128(n, pkey); } static int ndisc_constructor(struct neighbour *neigh) { struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key; struct net_device *dev = neigh->dev; struct inet6_dev *in6_dev; struct neigh_parms *parms; bool is_multicast = ipv6_addr_is_multicast(addr); in6_dev = in6_dev_get(dev); if (!in6_dev) { return -EINVAL; } parms = in6_dev->nd_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST; if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &ndisc_direct_ops; neigh->output = neigh_direct_output; } else { if (is_multicast) { neigh->nud_state = NUD_NOARP; ndisc_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); if (dev->flags&IFF_LOOPBACK) neigh->type = RTN_LOCAL; } else if (dev->flags&IFF_POINTOPOINT) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &ndisc_hh_ops; else neigh->ops = &ndisc_generic_ops; if (neigh->nud_state&NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } in6_dev_put(in6_dev); return 0; } static int pndisc_constructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; if (!dev || !__in6_dev_get(dev)) return -EINVAL; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_inc(dev, &maddr); return 0; } static void pndisc_destructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; if (!dev || !__in6_dev_get(dev)) return; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_dec(dev, &maddr); } /* called with rtnl held */ static bool ndisc_allow_add(const struct net_device *dev, struct netlink_ext_ack *extack) { struct inet6_dev *idev = __in6_dev_get(dev); if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on this device"); return false; } return true; } static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, int len) { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); if (!skb) { ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", __func__); return NULL; } skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; skb_reserve(skb, hlen + sizeof(struct ipv6hdr)); skb_reset_transport_header(skb); /* Manually assign socket ownership as we avoid calling * sock_alloc_send_pskb() to bypass wmem buffer limits */ skb_set_owner_w(skb, sk); return skb; } static void ip6_nd_hdr(struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int hop_limit, int len) { struct ipv6hdr *hdr; struct inet6_dev *idev; unsigned tclass; rcu_read_lock(); idev = __in6_dev_get(skb->dev); tclass = idev ? READ_ONCE(idev->cnf.ndisc_tclass) : 0; rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); ip6_flow_hdr(hdr, tclass, 0); hdr->payload_len = htons(len); hdr->nexthdr = IPPROTO_ICMPV6; hdr->hop_limit = hop_limit; hdr->saddr = *saddr; hdr->daddr = *daddr; } void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(skb->dev); struct sock *sk = net->ipv6.ndisc_sk; struct inet6_dev *idev; int err; struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; if (!dst) { struct flowi6 fl6; int oif = skb->dev->ifindex; icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { kfree_skb(skb); return; } skb_dst_set(skb, dst); } icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, csum_partial(icmp6h, skb->len, 0)); ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); rcu_read_lock(); idev = __in6_dev_get(dst->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, dst->dev, dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } rcu_read_unlock(); } EXPORT_SYMBOL(ndisc_send_skb); void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt) { struct sk_buff *skb; struct in6_addr tmpaddr; struct inet6_ifaddr *ifp; const struct in6_addr *src_addr; struct nd_msg *msg; int optlen = 0; /* for anycast or proxy, solicited_addr != src_addr */ ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1); if (ifp) { src_addr = solicited_addr; if (ifp->flags & IFA_F_OPTIMISTIC) override = false; inc_opt |= READ_ONCE(ifp->idev->cnf.force_tllao); in6_ifa_put(ifp); } else { if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs, &tmpaddr)) return; src_addr = &tmpaddr; } if (!dev->addr_len) inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_ADVERTISEMENT); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); *msg = (struct nd_msg) { .icmph = { .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, .icmp6_router = router, .icmp6_solicited = solicited, .icmp6_override = override, }, .target = *solicited_addr, }; if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, dev->dev_addr, NDISC_NEIGHBOUR_ADVERTISEMENT); ndisc_send_skb(skb, daddr, src_addr); } static void ndisc_send_unsol_na(struct net_device *dev) { struct inet6_dev *idev; struct inet6_ifaddr *ifa; idev = in6_dev_get(dev); if (!idev) return; read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { /* skip tentative addresses until dad completes */ if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_OPTIMISTIC)) continue; ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr, /*router=*/ !!idev->cnf.forwarding, /*solicited=*/ false, /*override=*/ true, /*inc_opt=*/ true); } read_unlock_bh(&idev->lock); in6_dev_put(idev); } struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *saddr, u64 nonce) { int inc_opt = dev->addr_len; struct sk_buff *skb; struct nd_msg *msg; int optlen = 0; if (!saddr) return NULL; if (ipv6_addr_any(saddr)) inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_SOLICITATION); if (nonce != 0) optlen += 8; skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return NULL; msg = skb_put(skb, sizeof(*msg)); *msg = (struct nd_msg) { .icmph = { .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, }, .target = *solicit, }; if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, NDISC_NEIGHBOUR_SOLICITATION); if (nonce != 0) { u8 *opt = skb_put(skb, 8); opt[0] = ND_OPT_NONCE; opt[1] = 8 >> 3; memcpy(opt + 2, &nonce, 6); } return skb; } EXPORT_SYMBOL(ndisc_ns_create); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr, u64 nonce) { struct in6_addr addr_buf; struct sk_buff *skb; if (!saddr) { if (ipv6_get_lladdr(dev, &addr_buf, (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC))) return; saddr = &addr_buf; } skb = ndisc_ns_create(dev, solicit, saddr, nonce); if (skb) ndisc_send_skb(skb, daddr, saddr); } void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr) { struct sk_buff *skb; struct rs_msg *msg; int send_sllao = dev->addr_len; int optlen = 0; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* * According to section 2.2 of RFC 4429, we must not * send router solicitations with a sllao from * optimistic addresses, but we may send the solicitation * if we don't include the sllao. So here we check * if our address is optimistic, and if so, we * suppress the inclusion of the sllao. */ if (send_sllao) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr, dev, 1); if (ifp) { if (ifp->flags & IFA_F_OPTIMISTIC) { send_sllao = 0; } in6_ifa_put(ifp); } else { send_sllao = 0; } } #endif if (send_sllao) optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); *msg = (struct rs_msg) { .icmph = { .icmp6_type = NDISC_ROUTER_SOLICITATION, }, }; if (send_sllao) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, NDISC_ROUTER_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) { /* * "The sender MUST return an ICMP * destination unreachable" */ dst_link_failure(skb); kfree_skb(skb); } /* Called with locked neigh: either read or both */ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) { struct in6_addr *saddr = NULL; struct in6_addr mcaddr; struct net_device *dev = neigh->dev; struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; int probes = atomic_read(&neigh->probes); if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, false, 1, IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) saddr = &ipv6_hdr(skb)->saddr; probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) { ND_PRINTK(1, dbg, "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } ndisc_send_ns(dev, target, target, saddr, 0); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); ndisc_send_ns(dev, target, &mcaddr, saddr, 0); } } static int pndisc_is_router(const void *pkey, struct net_device *dev) { struct pneigh_entry *n; int ret = -1; read_lock_bh(&nd_tbl.lock); n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); if (n) ret = !!(n->flags & NTF_ROUTER); read_unlock_bh(&nd_tbl.lock); return ret; } void ndisc_update(const struct net_device *dev, struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, struct ndisc_options *ndopts) { neigh_update(neigh, lladdr, new, flags, 0); /* report ndisc ops about neighbour update */ ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); } static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; struct inet6_ifaddr *ifp; struct inet6_dev *idev = NULL; struct neighbour *neigh; int dad = ipv6_addr_any(saddr); int is_router = -1; SKB_DR(reason); u64 nonce = 0; bool inc; if (skb->len < sizeof(struct nd_msg)) return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NS: multicast target address\n"); return reason; } /* * RFC2461 7.1.1: * DAD has to be destined for solicited node multicast address. */ if (dad && !ipv6_addr_is_solict_mult(daddr)) { ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); return reason; } if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); if (!lladdr) { ND_PRINTK(2, warn, "NS: invalid link-layer address length\n"); return reason; } /* RFC2461 7.1.1: * If the IP source address is the unspecified address, * there MUST NOT be source link-layer address option * in the message. */ if (dad) { ND_PRINTK(2, warn, "NS: bad DAD packet (link-layer address option)\n"); return reason; } } if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1) memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6); inc = ipv6_addr_is_multicast(daddr); ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); if (ifp) { have_ifp: if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { if (dad) { if (nonce != 0 && ifp->dad_nonce == nonce) { u8 *np = (u8 *)&nonce; /* Matching nonce if looped back */ ND_PRINTK(2, notice, "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n", ifp->idev->dev->name, &ifp->addr, np); goto out; } /* * We are colliding with another node * who is doing DAD * so fail our DAD process */ addrconf_dad_failure(skb, ifp); return reason; } else { /* * This is not a dad solicitation. * If we are an optimistic node, * we should respond. * Otherwise, we should ignore it. */ if (!(ifp->flags & IFA_F_OPTIMISTIC)) goto out; } } idev = ifp->idev; } else { struct net *net = dev_net(dev); /* perhaps an address on the master device */ if (netif_is_l3_slave(dev)) { struct net_device *mdev; mdev = netdev_master_upper_dev_get_rcu(dev); if (mdev) { ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1); if (ifp) goto have_ifp; } } idev = in6_dev_get(dev); if (!idev) { /* XXX: count this drop? */ return reason; } if (ipv6_chk_acast_addr(net, dev, &msg->target) || (READ_ONCE(idev->cnf.forwarding) && (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) || READ_ONCE(idev->cnf.proxy_ndp)) && (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && skb->pkt_type != PACKET_HOST && inc && NEIGH_VAR(idev->nd_parms, PROXY_DELAY) != 0) { /* * for anycast or proxy, * sender should delay its response * by a random time between 0 and * MAX_ANYCAST_DELAY_TIME seconds. * (RFC2461) -- yoshfuji */ struct sk_buff *n = skb_clone(skb, GFP_ATOMIC); if (n) pneigh_enqueue(&nd_tbl, idev->nd_parms, n); goto out; } } else { SKB_DR_SET(reason, IPV6_NDISC_NS_OTHERHOST); goto out; } } if (is_router < 0) is_router = READ_ONCE(idev->cnf.forwarding); if (dad) { ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target, !!is_router, false, (ifp != NULL), true); goto out; } if (inc) NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast); else NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast); /* * update / create cache entry * for the source address */ neigh = __neigh_lookup(&nd_tbl, saddr, dev, !inc || lladdr || !dev->addr_len); if (neigh) ndisc_update(dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE, NDISC_NEIGHBOUR_SOLICITATION, &ndopts); if (neigh || !dev->header_ops) { ndisc_send_na(dev, saddr, &msg->target, !!is_router, true, (ifp != NULL && inc), inc); if (neigh) neigh_release(neigh); reason = SKB_CONSUMED; } out: if (ifp) in6_ifa_put(ifp); else in6_dev_put(idev); return reason; } static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr) { struct inet6_dev *idev = __in6_dev_get(dev); switch (READ_ONCE(idev->cnf.accept_untracked_na)) { case 0: /* Don't accept untracked na (absent in neighbor cache) */ return 0; case 1: /* Create new entries from na if currently untracked */ return 1; case 2: /* Create new entries from untracked na only if saddr is in the * same subnet as an address configured on the interface that * received the na */ return !!ipv6_chk_prefix(saddr, dev); default: return 0; } } static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; SKB_DR(reason); u8 new_state; if (skb->len < sizeof(struct nd_msg)) return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NA: target address is multicast\n"); return reason; } if (ipv6_addr_is_multicast(daddr) && msg->icmph.icmp6_solicited) { ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); return reason; } /* For some 802.11 wireless deployments (and possibly other networks), * there will be a NA proxy and unsolicitd packets are attacks * and thus should not be accepted. * drop_unsolicited_na takes precedence over accept_untracked_na */ if (!msg->icmph.icmp6_solicited && idev && READ_ONCE(idev->cnf.drop_unsolicited_na)) return reason; if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); if (!lladdr) { ND_PRINTK(2, warn, "NA: invalid link-layer address length\n"); return reason; } } ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); if (ifp) { if (skb->pkt_type != PACKET_LOOPBACK && (ifp->flags & IFA_F_TENTATIVE)) { addrconf_dad_failure(skb, ifp); return reason; } /* What should we make now? The advertisement is invalid, but ndisc specs say nothing about it. It could be misconfiguration, or an smart proxy agent tries to help us :-) We should not print the error if NA has been received from loopback - it is just our own unsolicited advertisement. */ if (skb->pkt_type != PACKET_LOOPBACK) ND_PRINTK(1, warn, "NA: %pM advertised our address %pI6c on %s!\n", eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name); in6_ifa_put(ifp); return reason; } neigh = neigh_lookup(&nd_tbl, &msg->target, dev); /* RFC 9131 updates original Neighbour Discovery RFC 4861. * NAs with Target LL Address option without a corresponding * entry in the neighbour cache can now create a STALE neighbour * cache entry on routers. * * entry accept fwding solicited behaviour * ------- ------ ------ --------- ---------------------- * present X X 0 Set state to STALE * present X X 1 Set state to REACHABLE * absent 0 X X Do nothing * absent 1 0 X Do nothing * absent 1 1 X Add a new STALE entry * * Note that we don't do a (daddr == all-routers-mcast) check. */ new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE; if (!neigh && lladdr && idev && READ_ONCE(idev->cnf.forwarding)) { if (accept_untracked_na(dev, saddr)) { neigh = neigh_create(&nd_tbl, &msg->target, dev); new_state = NUD_STALE; } } if (neigh && !IS_ERR(neigh)) { u8 old_flags = neigh->flags; struct net *net = dev_net(dev); if (READ_ONCE(neigh->nud_state) & NUD_FAILED) goto out; /* * Don't update the neighbor cache entry on a proxy NA from * ourselves because either the proxied node is off link or it * has already sent a NA to us. */ if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && READ_ONCE(net->ipv6.devconf_all->forwarding) && READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { /* XXX: idev->cnf.proxy_ndp */ goto out; } ndisc_update(dev, neigh, lladdr, new_state, NEIGH_UPDATE_F_WEAK_OVERRIDE| (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0), NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts); if ((old_flags & ~neigh->flags) & NTF_ROUTER) { /* * Change: router to host */ rt6_clean_tohost(dev_net(dev), saddr); } reason = SKB_CONSUMED; out: neigh_release(neigh); } return reason; } static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb) { struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb); unsigned long ndoptlen = skb->len - sizeof(*rs_msg); struct neighbour *neigh; struct inet6_dev *idev; const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; struct ndisc_options ndopts; u8 *lladdr = NULL; SKB_DR(reason); if (skb->len < sizeof(*rs_msg)) return SKB_DROP_REASON_PKT_TOO_SMALL; idev = __in6_dev_get(skb->dev); if (!idev) { ND_PRINTK(1, err, "RS: can't find in6 device\n"); return reason; } /* Don't accept RS if we're not in router mode */ if (!READ_ONCE(idev->cnf.forwarding)) goto out; /* * Don't update NCE if src = ::; * this implies that the source node has no ip address assigned yet. */ if (ipv6_addr_any(saddr)) goto out; /* Parse ND options */ if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) goto out; } neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1); if (neigh) { ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE_ISROUTER, NDISC_ROUTER_SOLICITATION, &ndopts); neigh_release(neigh); reason = SKB_CONSUMED; } out: return reason; } static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) { struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra); struct sk_buff *skb; struct nlmsghdr *nlh; struct nduseroptmsg *ndmsg; struct net *net = dev_net(ra->dev); int err; int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg) + (opt->nd_opt_len << 3)); size_t msg_size = base_size + nla_total_size(sizeof(struct in6_addr)); skb = nlmsg_new(msg_size, GFP_ATOMIC); if (!skb) { err = -ENOBUFS; goto errout; } nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0); if (!nlh) { goto nla_put_failure; } ndmsg = nlmsg_data(nlh); ndmsg->nduseropt_family = AF_INET6; ndmsg->nduseropt_ifindex = ra->dev->ifindex; ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type; ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code; ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3; memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3); if (nla_put_in6_addr(skb, NDUSEROPT_SRCADDR, &ipv6_hdr(ra)->saddr)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); err = -EMSGSIZE; errout: rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); } static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) { struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); bool send_ifinfo_notify = false; struct neighbour *neigh = NULL; struct ndisc_options ndopts; struct fib6_info *rt = NULL; struct inet6_dev *in6_dev; struct fib6_table *table; u32 defrtr_usr_metric; unsigned int pref = 0; __u32 old_if_flags; struct net *net; SKB_DR(reason); int lifetime; int optlen; __u8 *opt = (__u8 *)(ra_msg + 1); optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - sizeof(struct ra_msg); ND_PRINTK(2, info, "RA: %s, dev: %s\n", __func__, skb->dev->name); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "RA: source address is not link-local\n"); return reason; } if (optlen < 0) return SKB_DROP_REASON_PKT_TOO_SMALL; #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { ND_PRINTK(2, warn, "RA: from host or unauthorized router\n"); return reason; } #endif in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) { ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", skb->dev->name); return reason; } if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, "RA: %s, did not accept ra for dev: %s\n", __func__, skb->dev->name); goto skip_linkparms; } #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { ND_PRINTK(2, info, "RA: %s, nodetype is NODEFAULT, dev: %s\n", __func__, skb->dev->name); goto skip_linkparms; } #endif if (in6_dev->if_flags & IF_RS_SENT) { /* * flag that an RA was received after an RS was sent * out on this interface. */ in6_dev->if_flags |= IF_RA_RCVD; } /* * Remember the managed/otherconf flags from most recently * received RA message (RFC 2462) -- yoshfuji */ old_if_flags = in6_dev->if_flags; in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED | IF_RA_OTHERCONF)) | (ra_msg->icmph.icmp6_addrconf_managed ? IF_RA_MANAGED : 0) | (ra_msg->icmph.icmp6_addrconf_other ? IF_RA_OTHERCONF : 0); if (old_if_flags != in6_dev->if_flags) send_ifinfo_notify = true; if (!READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) { ND_PRINTK(2, info, "RA: %s, defrtr is false for dev: %s\n", __func__, skb->dev->name); goto skip_defrtr; } lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); if (lifetime != 0 && lifetime < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) { ND_PRINTK(2, info, "RA: router lifetime (%ds) is too short: %s\n", lifetime, skb->dev->name); goto skip_defrtr; } /* Do not accept RA with source-addr found on local machine unless * accept_ra_from_local is set to true. */ net = dev_net(in6_dev->dev); if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) && ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", skb->dev->name); goto skip_defrtr; } #ifdef CONFIG_IPV6_ROUTER_PREF pref = ra_msg->icmph.icmp6_router_pref; /* 10b is handled as if it were 00b (medium) */ if (pref == ICMPV6_ROUTER_PREF_INVALID || !READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref)) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif /* routes added from RAs do not use nexthop objects */ rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); fib6_info_release(rt); return reason; } } /* Set default route metric as specified by user */ defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric; /* delete the route if lifetime is 0 or if metric needs change */ if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) { ip6_del_rt(net, rt, false); rt = NULL; } ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", rt, lifetime, defrtr_usr_metric, skb->dev->name); if (!rt && lifetime) { ND_PRINTK(3, info, "RA: adding default router\n"); if (neigh) neigh_release(neigh); rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev, pref, defrtr_usr_metric, lifetime); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", __func__); return reason; } neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); fib6_info_release(rt); return reason; } neigh->flags |= NTF_ROUTER; } else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) { struct nl_info nlinfo = { .nl_net = net, }; rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE); } if (rt) { table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); fib6_set_expires(rt, jiffies + (HZ * lifetime)); fib6_add_gc_list(rt); spin_unlock_bh(&table->tb6_lock); } if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) < 256 && ra_msg->icmph.icmp6_hop_limit) { if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) <= ra_msg->icmph.icmp6_hop_limit) { WRITE_ONCE(in6_dev->cnf.hop_limit, ra_msg->icmph.icmp6_hop_limit); fib6_metric_set(rt, RTAX_HOPLIMIT, ra_msg->icmph.icmp6_hop_limit); } else { ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); } } skip_defrtr: /* * Update Reachable Time and Retrans Timer */ if (in6_dev->nd_parms) { unsigned long rtime = ntohl(ra_msg->retrans_timer); if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) { rtime = (rtime*HZ)/1000; if (rtime < HZ/100) rtime = HZ/100; NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; } rtime = ntohl(ra_msg->reachable_time); if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) { rtime = (rtime*HZ)/1000; if (rtime < HZ/10) rtime = HZ/10; if (rtime != NEIGH_VAR(in6_dev->nd_parms, BASE_REACHABLE_TIME)) { NEIGH_VAR_SET(in6_dev->nd_parms, BASE_REACHABLE_TIME, rtime); NEIGH_VAR_SET(in6_dev->nd_parms, GC_STALETIME, 3 * rtime); in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; } } } skip_linkparms: /* * Process options. */ if (!neigh) neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr, skb->dev, 1); if (neigh) { u8 *lladdr = NULL; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) { ND_PRINTK(2, warn, "RA: invalid link-layer address length\n"); goto out; } } ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| NEIGH_UPDATE_F_ISROUTER, NDISC_ROUTER_ADVERTISEMENT, &ndopts); reason = SKB_CONSUMED; } if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, "RA: %s, accept_ra is false for dev: %s\n", __func__, skb->dev->name); goto out; } #ifdef CONFIG_IPV6_ROUTE_INFO if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: router info ignored.\n", skb->dev->name); goto skip_routeinfo; } if (READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref) && ndopts.nd_opts_ri) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_ri; p; p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) { struct route_info *ri = (struct route_info *)p; #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT && ri->prefix_len == 0) continue; #endif if (ri->prefix_len == 0 && !READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) continue; if (ri->lifetime != 0 && ntohl(ri->lifetime) < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) continue; if (ri->prefix_len < READ_ONCE(in6_dev->cnf.accept_ra_rt_info_min_plen)) continue; if (ri->prefix_len > READ_ONCE(in6_dev->cnf.accept_ra_rt_info_max_plen)) continue; rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, &ipv6_hdr(skb)->saddr); } } skip_routeinfo: #endif #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific ndopts from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { ND_PRINTK(2, info, "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", __func__, skb->dev->name); goto out; } #endif if (READ_ONCE(in6_dev->cnf.accept_ra_pinfo) && ndopts.nd_opts_pi) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_pi; p; p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) { addrconf_prefix_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, ndopts.nd_opts_src_lladdr != NULL); } } if (ndopts.nd_opts_mtu && READ_ONCE(in6_dev->cnf.accept_ra_mtu)) { __be32 n; u32 mtu; memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); mtu = ntohl(n); if (in6_dev->ra_mtu != mtu) { in6_dev->ra_mtu = mtu; send_ifinfo_notify = true; } if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) { WRITE_ONCE(in6_dev->cnf.mtu6, mtu); fib6_metric_set(rt, RTAX_MTU, mtu); rt6_mtu_change(skb->dev, mtu); } } if (ndopts.nd_useropts) { struct nd_opt_hdr *p; for (p = ndopts.nd_useropts; p; p = ndisc_next_useropt(skb->dev, p, ndopts.nd_useropts_end)) { ndisc_ra_useropt(skb, p); } } if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: /* Send a notify if RA changed managed/otherconf flags or * timer settings or ra_mtu value */ if (send_ifinfo_notify) inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); fib6_info_release(rt); if (neigh) neigh_release(neigh); return reason; } static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb) { struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb); u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct rd_msg, opt)); struct ndisc_options ndopts; SKB_DR(reason); u8 *hdr; #ifdef CONFIG_IPV6_NDISC_NODETYPE switch (skb->ndisc_nodetype) { case NDISC_NODETYPE_HOST: case NDISC_NODETYPE_NODEFAULT: ND_PRINTK(2, warn, "Redirect: from host or unauthorized router\n"); return reason; } #endif if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "Redirect: source address is not link-local\n"); return reason; } if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ndopts.nd_opts_rh) { ip6_redirect_no_header(skb, dev_net(skb->dev), skb->dev->ifindex); return reason; } hdr = (u8 *)ndopts.nd_opts_rh; hdr += 8; if (!pskb_pull(skb, hdr - skb_transport_header(skb))) return SKB_DROP_REASON_PKT_TOO_SMALL; return icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); } static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, struct sk_buff *orig_skb, int rd_len) { u8 *opt = skb_put(skb, rd_len); memset(opt, 0, 8); *(opt++) = ND_OPT_REDIRECT_HDR; *(opt++) = (rd_len >> 3); opt += 6; skb_copy_bits(orig_skb, skb_network_offset(orig_skb), opt, rd_len - 8); } void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); struct sock *sk = net->ipv6.ndisc_sk; int optlen = 0; struct inet_peer *peer; struct sk_buff *buff; struct rd_msg *msg; struct in6_addr saddr_buf; struct rt6_info *rt; struct dst_entry *dst; struct flowi6 fl6; int rd_len; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; bool ret; if (netif_is_l3_master(skb->dev)) { dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); if (!dev) return; } if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", dev->name); return; } if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "Redirect: target address is not link-local unicast\n"); return; } icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); return; } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) return; rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK(2, warn, "Redirect: destination is not a neighbour\n"); goto release; } peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); ret = inet_peer_xrlim_allow(peer, 1*HZ); if (peer) inet_putpeer(peer); if (!ret) goto release; if (dev->addr_len) { struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target); if (!neigh) { ND_PRINTK(2, warn, "Redirect: no neigh for target address\n"); goto release; } read_lock_bh(&neigh->lock); if (neigh->nud_state & NUD_VALID) { memcpy(ha_buf, neigh->ha, dev->addr_len); read_unlock_bh(&neigh->lock); ha = ha_buf; optlen += ndisc_redirect_opt_addr_space(dev, neigh, ops_data_buf, &ops_data); } else read_unlock_bh(&neigh->lock); neigh_release(neigh); } rd_len = min_t(unsigned int, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen, skb->len + 8); rd_len &= ~0x7; optlen += rd_len; buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!buff) goto release; msg = skb_put(buff, sizeof(*msg)); *msg = (struct rd_msg) { .icmph = { .icmp6_type = NDISC_REDIRECT, }, .target = *target, .dest = ipv6_hdr(skb)->daddr, }; /* * include target_address option */ if (ha) ndisc_fill_redirect_addr_option(buff, ha, ops_data); /* * build redirect option and copy skb over to the new packet. */ if (rd_len) ndisc_fill_redirect_hdr_option(buff, skb, rd_len); skb_dst_set(buff, dst); ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf); return; release: dst_release(dst); } static void pndisc_redo(struct sk_buff *skb) { enum skb_drop_reason reason = ndisc_recv_ns(skb); kfree_skb_reason(skb, reason); } static int ndisc_is_multicast(const void *pkey) { return ipv6_addr_is_multicast((struct in6_addr *)pkey); } static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev) return true; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && READ_ONCE(idev->cnf.suppress_frag_ndisc)) { net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n"); return true; } return false; } enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; SKB_DR(reason); if (ndisc_suppress_frag_ndisc(skb)) return SKB_DROP_REASON_IPV6_NDISC_FRAG; if (skb_linearize(skb)) return SKB_DROP_REASON_NOMEM; msg = (struct nd_msg *)skb_transport_header(skb); __skb_push(skb, skb->data - skb_transport_header(skb)); if (ipv6_hdr(skb)->hop_limit != 255) { ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", ipv6_hdr(skb)->hop_limit); return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT; } if (msg->icmph.icmp6_code != 0) { ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code); return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE; } switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); reason = ndisc_recv_ns(skb); break; case NDISC_NEIGHBOUR_ADVERTISEMENT: reason = ndisc_recv_na(skb); break; case NDISC_ROUTER_SOLICITATION: reason = ndisc_recv_rs(skb); break; case NDISC_ROUTER_ADVERTISEMENT: reason = ndisc_router_discovery(skb); break; case NDISC_REDIRECT: reason = ndisc_redirect_rcv(skb); break; } return reason; } static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct net *net = dev_net(dev); struct inet6_dev *idev; bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&nd_tbl, dev); fib6_run_gc(0, net, false); fallthrough; case NETDEV_UP: idev = in6_dev_get(dev); if (!idev) break; if (READ_ONCE(idev->cnf.ndisc_notify) || READ_ONCE(net->ipv6.devconf_all->ndisc_notify)) ndisc_send_unsol_na(dev); in6_dev_put(idev); break; case NETDEV_CHANGE: idev = in6_dev_get(dev); if (!idev) evict_nocarrier = true; else { evict_nocarrier = READ_ONCE(idev->cnf.ndisc_evict_nocarrier) && READ_ONCE(net->ipv6.devconf_all->ndisc_evict_nocarrier); in6_dev_put(idev); } change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&nd_tbl, dev); if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&nd_tbl, dev); break; case NETDEV_DOWN: neigh_ifdown(&nd_tbl, dev); fib6_run_gc(0, net, false); break; case NETDEV_NOTIFY_PEERS: ndisc_send_unsol_na(dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block ndisc_netdev_notifier = { .notifier_call = ndisc_netdev_event, .priority = ADDRCONF_NOTIFY_PRIORITY - 5, }; #ifdef CONFIG_SYSCTL static void ndisc_warn_deprecated_sysctl(const struct ctl_table *ctl, const char *func, const char *dev_name) { static char warncomm[TASK_COMM_LEN]; static int warned; if (strcmp(warncomm, current->comm) && warned < 5) { strscpy(warncomm, current->comm); pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n", warncomm, func, dev_name, ctl->procname, dev_name, ctl->procname); warned++; } } int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; int ret; if ((strcmp(ctl->procname, "retrans_time") == 0) || (strcmp(ctl->procname, "base_reachable_time") == 0)) ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default"); if (strcmp(ctl->procname, "retrans_time") == 0) ret = neigh_proc_dointvec(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) || (strcmp(ctl->procname, "base_reachable_time_ms") == 0)) ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); else ret = -1; if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)) idev->nd_parms->reachable_time = neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)); WRITE_ONCE(idev->tstamp, jiffies); inet6_ifinfo_notify(RTM_NEWLINK, idev); in6_dev_put(idev); } return ret; } #endif static int __net_init ndisc_net_init(struct net *net) { struct ipv6_pinfo *np; struct sock *sk; int err; err = inet_ctl_sock_create(&sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { ND_PRINTK(0, err, "NDISC: Failed to initialize the control socket (err %d)\n", err); return err; } net->ipv6.ndisc_sk = sk; np = inet6_sk(sk); np->hop_limit = 255; /* Do not loopback ndisc messages */ inet6_clear_bit(MC6_LOOP, sk); return 0; } static void __net_exit ndisc_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.ndisc_sk); } static struct pernet_operations ndisc_net_ops = { .init = ndisc_net_init, .exit = ndisc_net_exit, }; int __init ndisc_init(void) { int err; err = register_pernet_subsys(&ndisc_net_ops); if (err) return err; /* * Initialize the neighbour table */ neigh_table_init(NEIGH_ND_TABLE, &nd_tbl); #ifdef CONFIG_SYSCTL err = neigh_sysctl_register(NULL, &nd_tbl.parms, ndisc_ifinfo_sysctl_change); if (err) goto out_unregister_pernet; out: #endif return err; #ifdef CONFIG_SYSCTL out_unregister_pernet: unregister_pernet_subsys(&ndisc_net_ops); goto out; #endif } int __init ndisc_late_init(void) { return register_netdevice_notifier(&ndisc_netdev_notifier); } void ndisc_late_cleanup(void) { unregister_netdevice_notifier(&ndisc_netdev_notifier); } void ndisc_cleanup(void) { #ifdef CONFIG_SYSCTL neigh_sysctl_unregister(&nd_tbl.parms); #endif neigh_table_clear(NEIGH_ND_TABLE, &nd_tbl); unregister_pernet_subsys(&ndisc_net_ops); }
7325 31 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 /* * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes. * * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_PERF_EVENT_H #define _LINUX_PERF_EVENT_H #include <uapi/linux/perf_event.h> #include <uapi/linux/bpf_perf_event.h> /* * Kernel-internal data types and definitions: */ #ifdef CONFIG_PERF_EVENTS # include <asm/perf_event.h> # include <asm/local64.h> #endif #define PERF_GUEST_ACTIVE 0x01 #define PERF_GUEST_USER 0x02 struct perf_guest_info_callbacks { unsigned int (*state)(void); unsigned long (*get_ip)(void); unsigned int (*handle_intel_pt_intr)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT #include <linux/rhashtable-types.h> #include <asm/hw_breakpoint.h> #endif #include <linux/list.h> #include <linux/mutex.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/fs.h> #include <linux/pid_namespace.h> #include <linux/workqueue.h> #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/irq_work.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/atomic.h> #include <linux/sysfs.h> #include <linux/perf_regs.h> #include <linux/cgroup.h> #include <linux/refcount.h> #include <linux/security.h> #include <linux/static_call.h> #include <linux/lockdep.h> #include <asm/local.h> struct perf_callchain_entry { __u64 nr; __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; u32 nr; short contexts; bool contexts_maxed; }; typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long off, unsigned long len); struct perf_raw_frag { union { struct perf_raw_frag *next; unsigned long pad; }; perf_copy_f copy; void *data; u32 size; } __packed; struct perf_raw_record { struct perf_raw_frag frag; u32 size; }; static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) { return frag->pad < sizeof(u64); } /* * branch stack layout: * nr: number of taken branches stored in entries[] * hw_idx: The low level index of raw branch records * for the most recent branch. * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. * The entries[] is an abstraction of raw branch records, * which may not be stored in age order in HW, e.g. Intel LBR. * The hw_idx is to expose the low level index of raw * branch record for the most recent branch aka entries[0]. * The hw_idx index is between -1 (unknown) and max depth, * which can be retrieved in /sys/devices/cpu/caps/branches. * For the architectures whose raw branch records are * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { __u64 nr; __u64 hw_idx; struct perf_branch_entry entries[]; }; struct task_struct; /* * extra PMU register associated with an event */ struct hw_perf_event_extra { u64 config; /* register value */ unsigned int reg; /* register address or index */ int alloc; /* extra register already allocated */ int idx; /* index in shared_regs->regs[] */ }; /** * hw_perf_event::flag values * * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ #define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); /** * struct hw_perf_event - performance event hardware details: */ struct hw_perf_event { #ifdef CONFIG_PERF_EVENTS union { struct { /* hardware */ u64 config; u64 last_tag; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; int idx; int last_cpu; int flags; struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; }; struct { /* aux / Intel-PT */ u64 aux_config; /* * For AUX area events, aux_paused cannot be a state * flag because it can be updated asynchronously to * state. */ unsigned int aux_paused; }; struct { /* software */ struct hrtimer hrtimer; }; struct { /* tracepoint */ /* for tp_event->class */ struct list_head tp_list; }; struct { /* amd_power */ u64 pwr_acc; u64 ptsc; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* * Crufty hack to avoid the chicken and egg * problem hw_breakpoint has with context * creation and event initalization. */ struct arch_hw_breakpoint info; struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ u8 iommu_bank; u8 iommu_cntr; u16 padding; u64 conf; u64 conf1; }; }; /* * If the event is a per task event, this will point to the task in * question. See the comment in perf_event_alloc(). */ struct task_struct *target; /* * PMU would store hardware filter configuration * here. */ void *addr_filters; /* Last sync'ed generation of filters */ unsigned long addr_filters_gen; /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ #define PERF_HES_STOPPED 0x01 /* the counter is stopped */ #define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */ #define PERF_HES_ARCH 0x04 int state; /* * The last observed hardware counter value, updated with a * local64_cmpxchg() such that pmu::read() can be called nested. */ local64_t prev_count; /* * The period to start the next sample with. */ u64 sample_period; union { struct { /* Sampling */ /* * The period we started this sample with. */ u64 last_period; /* * However much is left of the current period; * note that this is a full 64bit value and * allows for generation of periods longer * than hardware might allow. */ local64_t period_left; }; struct { /* Topdown events counting for context switch */ u64 saved_metric; u64 saved_slots; }; }; /* * State for throttling the event, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 interrupts_seq; u64 interrupts; /* * State for freq target events, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 freq_time_stamp; u64 freq_count_stamp; #endif }; struct perf_event; struct perf_event_pmu_context; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ #define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ #define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ /** * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x0001 #define PERF_PMU_CAP_NO_NMI 0x0002 #define PERF_PMU_CAP_AUX_NO_SG 0x0004 #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 #define PERF_PMU_CAP_EXCLUSIVE 0x0010 #define PERF_PMU_CAP_ITRACE 0x0020 #define PERF_PMU_CAP_NO_EXCLUDE 0x0040 #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 /** * pmu::scope */ enum perf_pmu_scope { PERF_PMU_SCOPE_NONE = 0, PERF_PMU_SCOPE_CORE, PERF_PMU_SCOPE_DIE, PERF_PMU_SCOPE_CLUSTER, PERF_PMU_SCOPE_PKG, PERF_PMU_SCOPE_SYS_WIDE, PERF_PMU_MAX_SCOPE, }; struct perf_output_handle; #define PMU_NULL_DEV ((void *)(~0UL)) /** * struct pmu - generic performance monitoring unit */ struct pmu { struct list_head entry; struct module *module; struct device *dev; struct device *parent; const struct attribute_group **attr_groups; const struct attribute_group **attr_update; const char *name; int type; /* * various common per-pmu feature flags */ int capabilities; /* * PMU scope */ unsigned int scope; int __percpu *pmu_disable_count; struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; /* number of address filters this PMU can do */ unsigned int nr_addr_filters; /* * Fully disable/enable this PMU, can be used to protect from the PMI * as well as for lazy/batch writing of the MSRs. */ void (*pmu_enable) (struct pmu *pmu); /* optional */ void (*pmu_disable) (struct pmu *pmu); /* optional */ /* * Try and initialize the event for this PMU. * * Returns: * -ENOENT -- @event is not for this PMU * * -ENODEV -- @event is for this PMU but PMU not present * -EBUSY -- @event is for this PMU but PMU temporarily unavailable * -EINVAL -- @event is for this PMU but @event is not valid * -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported * -EACCES -- @event is for this PMU, @event is valid, but no privileges * * 0 -- @event is for this PMU and valid * * Other error return values are allowed. */ int (*event_init) (struct perf_event *event); /* * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). There are * matching hw_perf_event::state flags. */ #define PERF_EF_START 0x01 /* start the counter when adding */ #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ #define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */ #define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */ /* * Adds/Removes a counter to/from the PMU, can be done inside a * transaction, see the ->*_txn() methods. * * The add/del callbacks will reserve all hardware resources required * to service the event, this includes any counter constraint * scheduling etc. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on. * * ->add() called without PERF_EF_START should result in the same state * as ->add() followed by ->stop(). * * ->del() must always PERF_EF_UPDATE stop an event. If it calls * ->stop() that must deal with already being stopped without * PERF_EF_UPDATE. */ int (*add) (struct perf_event *event, int flags); void (*del) (struct perf_event *event, int flags); /* * Starts/Stops a counter present on the PMU. * * The PMI handler should stop the counter when perf_event_overflow() * returns !0. ->start() will be used to continue. * * Also used to change the sample period. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on -- will be called from NMI context with the PMU generates * NMIs. * * ->stop() with PERF_EF_UPDATE will read the counter and update * period/count values like ->read() would. * * ->start() with PERF_EF_RELOAD will reprogram the counter * value, must be preceded by a ->stop() with PERF_EF_UPDATE. * * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with * PERF_EF_RESUME. * * ->start() with PERF_EF_RESUME will start as simply as possible but * only if the counter is not otherwise stopped. Will not overlap * another ->start() with PERF_EF_RESUME nor ->stop() with * PERF_EF_PAUSE. * * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other * ->stop()/->start() invocations, just not itself. */ void (*start) (struct perf_event *event, int flags); void (*stop) (struct perf_event *event, int flags); /* * Updates the counter value of the event. * * For sampling capable PMUs this will also update the software period * hw_perf_event::period_left field. */ void (*read) (struct perf_event *event); /* * Group events scheduling is treated as a transaction, add * group events as a whole and perform one schedulability test. * If the test fails, roll back the whole group * * Start the transaction, after this ->add() doesn't need to * do schedulability tests. * * Optional. */ void (*start_txn) (struct pmu *pmu, unsigned int txn_flags); /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. * * Optional. */ int (*commit_txn) (struct pmu *pmu); /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. * * Optional. */ void (*cancel_txn) (struct pmu *pmu); /* * Will return the value for perf_event_mmap_page::index for this event, * if no implementation is provided it will default to 0 (see * perf_event_idx_default). */ int (*event_idx) (struct perf_event *event); /*optional */ /* * context-switches callback */ void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, bool sched_in); /* * Kmem cache of PMU specific data */ struct kmem_cache *task_ctx_cache; /* * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) * can be synchronized using this function. See Intel LBR callstack support * implementation and Perf core context switch handling callbacks for usage * examples. */ void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc, struct perf_event_pmu_context *next_epc); /* optional */ /* * Set up pmu-private data structures for an AUX area */ void *(*setup_aux) (struct perf_event *event, void **pages, int nr_pages, bool overwrite); /* optional */ /* * Free pmu-private AUX data structures */ void (*free_aux) (void *aux); /* optional */ /* * Take a snapshot of the AUX buffer without touching the event * state, so that preempting ->start()/->stop() callbacks does * not interfere with their logic. Called in PMI context. * * Returns the size of AUX data copied to the output handle. * * Optional. */ long (*snapshot_aux) (struct perf_event *event, struct perf_output_handle *handle, unsigned long size); /* * Validate address range filters: make sure the HW supports the * requested configuration and number of filters; return 0 if the * supplied filters are valid, -errno otherwise. * * Runs in the context of the ioctl()ing process and is not serialized * with the rest of the PMU callbacks. */ int (*addr_filters_validate) (struct list_head *filters); /* optional */ /* * Synchronize address range filter configuration: * translate hw-agnostic filters into hardware configuration in * event::hw::addr_filters. * * Runs as a part of filter sync sequence that is done in ->start() * callback by calling perf_event_addr_filters_sync(). * * May (and should) traverse event::addr_filters::list, for which its * caller provides necessary serialization. */ void (*addr_filters_sync) (struct perf_event *event); /* optional */ /* * Check if event can be used for aux_output purposes for * events of this PMU. * * Runs from perf_event_open(). Should return 0 for "no match" * or non-zero for "match". */ int (*aux_output_match) (struct perf_event *event); /* optional */ /* * Skip programming this PMU on the given CPU. Typically needed for * big.LITTLE things. */ bool (*filter) (struct pmu *pmu, int cpu); /* optional */ /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. */ int (*check_period) (struct perf_event *event, u64 value); /* optional */ }; enum perf_addr_filter_action_t { PERF_ADDR_FILTER_ACTION_STOP = 0, PERF_ADDR_FILTER_ACTION_START, PERF_ADDR_FILTER_ACTION_FILTER, }; /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage * @path: object file's path for file-based filters * @offset: filter range offset * @size: filter range size (size==0 means single address trigger) * @action: filter/start/stop * * This is a hardware-agnostic filter configuration as specified by the user. */ struct perf_addr_filter { struct list_head entry; struct path path; unsigned long offset; unsigned long size; enum perf_addr_filter_action_t action; }; /** * struct perf_addr_filters_head - container for address range filters * @list: list of filters for this event * @lock: spinlock that serializes accesses to the @list and event's * (and its children's) filter generations. * @nr_file_filters: number of file-based filters * * A child event will use parent's @list (and therefore @lock), so they are * bundled together; see perf_event_addr_filters(). */ struct perf_addr_filters_head { struct list_head list; raw_spinlock_t lock; unsigned int nr_file_filters; }; struct perf_addr_filter_range { unsigned long start; unsigned long size; }; /** * enum perf_event_state - the states of an event: */ enum perf_event_state { PERF_EVENT_STATE_DEAD = -4, PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, }; struct file; struct perf_sample_data; typedef void (*perf_overflow_handler_t)(struct perf_event *, struct perf_sample_data *, struct pt_regs *regs); /* * Event capabilities. For event_caps and groups caps. * * PERF_EV_CAP_SOFTWARE: Is a software event. * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read * from any CPU in the package where it is active. * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and * cannot be a group leader. If an event with this flag is detached from the * group it is scheduled out and moved into an unrecoverable ERROR state. * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the * PMU scope where it is active. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define PERF_EV_CAP_SIBLING BIT(2) #define PERF_EV_CAP_READ_SCOPE BIT(3) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) struct swevent_hlist { struct hlist_head heads[SWEVENT_HLIST_SIZE]; struct rcu_head rcu_head; }; #define PERF_ATTACH_CONTEXT 0x01 #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 #define PERF_ATTACH_SCHED_CB 0x20 #define PERF_ATTACH_CHILD 0x40 struct bpf_prog; struct perf_cgroup; struct perf_buffer; struct pmu_event_list { raw_spinlock_t lock; struct list_head list; }; /* * event->sibling_list is modified whole holding both ctx->lock and ctx->mutex * as such iteration must hold either lock. However, since ctx->lock is an IRQ * safe lock, and is only held by the CPU doing the modification, having IRQs * disabled is sufficient since it will hold-off the IPIs. */ #ifdef CONFIG_PROVE_LOCKING #define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else #define lockdep_assert_event_ctx(event) #endif #define for_each_sibling_event(sibling, event) \ lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) /** * struct perf_event - performance event kernel representation: */ struct perf_event { #ifdef CONFIG_PERF_EVENTS /* * entry onto perf_event_context::event_list; * modifications require ctx->lock * RCU safe iterations. */ struct list_head event_entry; /* * Locked for modification by both ctx->mutex and ctx->lock; holding * either sufficies for read. */ struct list_head sibling_list; struct list_head active_list; /* * Node on the pinned or flexible tree located at the event context; */ struct rb_node group_node; u64 group_index; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the * group in tact which avoids us using the other two entries. */ struct list_head migrate_entry; struct hlist_node hlist_entry; struct list_head active_entry; int nr_siblings; /* Not serialized. Only written during event initialization. */ int event_caps; /* The cumulative AND of all event_caps for events in this group. */ int group_caps; unsigned int group_generation; struct perf_event *group_leader; /* * event->pmu will always point to pmu in which this event belongs. * Whereas event->pmu_ctx->pmu may point to other pmu when group of * different pmu events is created. */ struct pmu *pmu; void *pmu_private; enum perf_event_state state; unsigned int attach_state; local64_t count; atomic64_t child_count; /* * These are the total time in nanoseconds that the event * has been enabled (i.e. eligible to run, and the task has * been scheduled in, if this is a per-task event) * and running (scheduled onto the CPU), respectively. */ u64 total_time_enabled; u64 total_time_running; u64 tstamp; struct perf_event_attr attr; u16 header_size; u16 id_header_size; u16 read_size; struct hw_perf_event hw; struct perf_event_context *ctx; /* * event->pmu_ctx points to perf_event_pmu_context in which the event * is added. This pmu_ctx can be of other pmu for sw event when that * sw event is part of a group which also contains non-sw events. */ struct perf_event_pmu_context *pmu_ctx; atomic_long_t refcount; /* * These accumulate total time (in nanoseconds) that children * events have been enabled and running, respectively. */ atomic64_t child_total_time_enabled; atomic64_t child_total_time_running; /* * Protect attach/detach and child_list: */ struct mutex child_mutex; struct list_head child_list; struct perf_event *parent; int oncpu; int cpu; struct list_head owner_entry; struct task_struct *owner; /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; unsigned long rcu_batches; int rcu_pending; /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; /* delayed work for NMIs and such */ unsigned int pending_wakeup; unsigned int pending_kill; unsigned int pending_disable; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; struct rcuwait pending_work_wait; atomic_t event_limit; /* address range filters */ struct perf_addr_filters_head addr_filters; /* vma address array for file-based filders */ struct perf_addr_filter_range *addr_filter_ranges; unsigned long addr_filters_gen; /* for aux_output events */ struct perf_event *aux_event; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; struct pid_namespace *ns; u64 id; atomic64_t lost_samples; u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; struct bpf_prog *prog; u64 bpf_cookie; #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; struct event_filter *filter; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; #endif #endif #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; /* cgroup event is attach to */ #endif #ifdef CONFIG_SECURITY void *security; #endif struct list_head sb_list; /* * Certain events gets forwarded to another pmu internally by over- * writing kernel copy of event->attr.type without user being aware * of it. event->orig_type contains original 'type' requested by * user. */ __u32 orig_type; #endif /* CONFIG_PERF_EVENTS */ }; /* * ,-----------------------[1:n]------------------------. * V V * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event * | | * `--[n:1]-> pmu <-[1:n]--' * * * struct perf_event_pmu_context lifetime is refcount based and RCU freed * (similar to perf_event_context). Locking is as if it were a member of * perf_event_context; specifically: * * modification, both: ctx->mutex && ctx->lock * reading, either: ctx->mutex || ctx->lock * * There is one exception to this; namely put_pmu_ctx() isn't always called * with ctx->mutex held; this means that as long as we can guarantee the epc * has events the above rules hold. * * Specificially, sys_perf_event_open()'s group_leader case depends on * ctx->mutex pinning the configuration. Since we hold a reference on * group_leader (through the filedesc) it can't go away, therefore it's * associated pmu_ctx must exist and cannot change due to ctx->mutex. * * perf_event holds a refcount on perf_event_context * perf_event holds a refcount on perf_event_pmu_context */ struct perf_event_pmu_context { struct pmu *pmu; struct perf_event_context *ctx; struct list_head pmu_ctx_entry; struct list_head pinned_active; struct list_head flexible_active; /* Used to avoid freeing per-cpu perf_event_pmu_context */ unsigned int embedded : 1; unsigned int nr_events; unsigned int nr_cgroups; unsigned int nr_freq; atomic_t refcount; /* event <-> epc */ struct rcu_head rcu_head; void *task_ctx_data; /* pmu specific data */ /* * Set when one or more (plausibly active) event can't be scheduled * due to pmu overcommit or pmu constraints, except tolerant to * events not necessary to be active due to scheduling constraints, * such as cgroups. */ int rotate_necessary; }; static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) { return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active); } struct perf_event_groups { struct rb_root tree; u64 index; }; /** * struct perf_event_context - event context structure * * Used as a container for task events and CPU events as well: */ struct perf_event_context { /* * Protect the states of the events in the list, * nr_active, and the list: */ raw_spinlock_t lock; /* * Protect the list of events. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ struct mutex mutex; struct list_head pmu_ctx_list; struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; int nr_events; int nr_user; int is_active; int nr_task_data; int nr_stat; int nr_freq; int rotate_disable; refcount_t refcount; /* event <-> ctx */ struct task_struct *task; /* * Context clock, runs when context enabled. */ u64 time; u64 timestamp; u64 timeoffset; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. */ struct perf_event_context *parent_ctx; u64 parent_gen; u64 generation; int pin_count; #ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ #endif struct rcu_head rcu_head; /* * The count of events for which using the switch-out fast path * should be avoided. * * Sum (event->pending_work + events with * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. */ local_t nr_no_switch_fast; }; struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; struct list_head sched_cb_entry; int sched_cb_usage; int active_oncpu; int exclusive; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; unsigned int hrtimer_active; }; /** * struct perf_event_cpu_context - per cpu event context structure */ struct perf_cpu_context { struct perf_event_context ctx; struct perf_event_context *task_ctx; int online; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; #endif /* * Per-CPU storage for iterators used in visit_groups_merge. The default * storage is of size 2 to hold the CPU and any CPU event iterators. */ int heap_size; struct perf_event **heap; struct perf_event *heap_default[2]; }; struct perf_output_handle { struct perf_event *event; struct perf_buffer *rb; unsigned long wakeup; unsigned long size; u64 aux_flags; union { void *addr; unsigned long head; }; int page; }; struct bpf_perf_event_data_kern { bpf_user_pt_regs_t *regs; struct perf_sample_data *data; struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF /* * perf_cgroup_info keeps track of time_enabled for a cgroup. * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { u64 time; u64 timestamp; u64 timeoffset; int active; }; struct perf_cgroup { struct cgroup_subsys_state css; struct perf_cgroup_info __percpu *info; }; /* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { return container_of(task_css_check(task, perf_event_cgrp_id, ctx ? lockdep_is_held(&ctx->lock) : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ #ifdef CONFIG_PERF_EVENTS extern struct perf_event_context *perf_cpu_task_ctx(void); extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size); extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next); extern int perf_event_init_task(struct task_struct *child, u64 clone_flags); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); extern void perf_sched_cb_dec(struct pmu *pmu); extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern void perf_pmu_resched(struct pmu *pmu); extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t callback, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); static inline bool branch_sample_no_flags(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; } static inline bool branch_sample_no_cycles(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; } static inline bool branch_sample_type(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; } static inline bool branch_sample_hw_index(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; } static inline bool branch_sample_priv(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; } static inline bool branch_sample_counters(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; } static inline bool branch_sample_call_stack(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } struct perf_sample_data { /* * Fields set by perf_sample_data_init() unconditionally, * group so as to minimize the cachelines touched. */ u64 sample_flags; u64 period; u64 dyn_size; /* * Fields commonly set by __perf_event_header__init_id(), * group so as to minimize the cachelines touched. */ u64 type; struct { u32 pid; u32 tid; } tid_entry; u64 time; u64 id; struct { u32 cpu; u32 reserved; } cpu_entry; /* * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ u64 ip; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 *br_stack_cntr; union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; struct perf_regs regs_user; struct perf_regs regs_intr; u64 stack_user_size; u64 stream_id; u64 cgroup; u64 addr; u64 phys_addr; u64 data_page_size; u64 code_page_size; u64 aux_size; } ____cacheline_aligned; /* default value for data source */ #define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ PERF_MEM_S(LVL, NA) |\ PERF_MEM_S(SNOOP, NA) |\ PERF_MEM_S(LOCK, NA) |\ PERF_MEM_S(TLB, NA) |\ PERF_MEM_S(LVLNUM, NA)) static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; data->dyn_size = 0; if (addr) { data->addr = addr; data->sample_flags |= PERF_SAMPLE_ADDR; } } static inline void perf_sample_save_callchain(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { int size = 1; data->callchain = perf_callchain(event, regs); size += data->callchain->nr; data->dyn_size += size * sizeof(u64); data->sample_flags |= PERF_SAMPLE_CALLCHAIN; } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, struct perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; do { sum += frag->size; if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); size = round_up(sum + sizeof(u32), sizeof(u64)); raw->size = size - sizeof(u32); frag->pad = raw->size - sum; data->raw = raw; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_RAW; } static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, u64 *brs_cntr) { int size = sizeof(u64); /* nr */ if (branch_sample_hw_index(event)) size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); /* * The extension space for counters is appended after the * struct perf_branch_stack. It is used to store the occurrences * of events of each branch. */ if (brs_cntr) size += brs->nr * sizeof(u64); data->br_stack = brs; data->br_stack_cntr = brs_cntr; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } static inline u32 perf_sample_data_size(struct perf_sample_data *data, struct perf_event *event) { u32 size = sizeof(struct perf_event_header); size += event->header_size + event->id_header_size; size += data->dyn_size; return size; } /* * Clear all bitfields in the perf_branch_entry. * The to and from fields are not cleared because they are * systematically modified by caller. */ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) { br->mispred = 0; br->predicted = 0; br->in_tx = 0; br->abort = 0; br->cycles = 0; br->type = 0; br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern void perf_prepare_header(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); static inline bool is_default_overflow_handler(struct perf_event *event) { perf_overflow_handler_t overflow_handler = event->overflow_handler; if (likely(overflow_handler == perf_event_output_forward)) return true; if (unlikely(overflow_handler == perf_event_output_backward)) return true; return false; } extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample); extern void perf_log_lost_samples(struct perf_event *event, u64 lost); static inline bool event_has_any_exclude_flag(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; return attr->exclude_idle || attr->exclude_user || attr->exclude_kernel || attr->exclude_hv || attr->exclude_guest || attr->exclude_host; } static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; } /* * Return 1 for a software event, 0 for a hardware event */ static inline int is_software_event(struct perf_event *event) { return event->event_caps & PERF_EV_CAP_SOFTWARE; } /* * Return 1 for event in sw context, 0 for event in hw context */ static inline int in_software_context(struct perf_event *event) { return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context; } static inline int is_exclusive_pmu(struct pmu *pmu) { return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE; } extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } #endif /* * When generating a perf sample in-line, instead of from an interrupt / * exception, we lack a pt_regs. This is typically used from software events * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints. * * We typically don't need a full set, but (for x86) do require: * - ip for PERF_SAMPLE_IP * - cs for user_mode() tests * - sp for PERF_SAMPLE_CALLCHAIN * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs()) * * NOTE: assumes @regs is otherwise already 0 filled; this is important for * things like PERF_SAMPLE_REGS_INTR. */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } static __always_inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { if (static_key_false(&perf_swevent_enabled[event_id])) __perf_sw_event(event_id, nr, regs, addr); } DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); /* * 'Special' version for the scheduler, it hard assumes no recursion, * which is guaranteed by us not actually scheduling inside other swevents * because those disable preemption. */ static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); perf_fetch_caller_regs(regs); ___perf_sw_event(event_id, nr, regs, addr); } extern struct static_key_false perf_sched_events; static __always_inline bool __perf_sw_enabled(int swevt) { return static_key_false(&perf_swevent_enabled[swevt]); } static inline void perf_event_task_migrate(struct task_struct *task) { if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS)) task->sched_migrated = 1; } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_in(prev, task); if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) && task->sched_migrated) { __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); task->sched_migrated = 0; } } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES)) __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); #ifdef CONFIG_CGROUP_PERF if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) && perf_cgroup_from_task(prev, NULL) != perf_cgroup_from_task(next, NULL)) __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0); #endif if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_out(prev, next); } extern void perf_event_mmap(struct vm_area_struct *vma); extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym); extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); #ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); static inline unsigned int perf_guest_state(void) { return static_call(__perf_guest_state)(); } static inline unsigned long perf_guest_get_ip(void) { return static_call(__perf_guest_get_ip)(); } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return static_call(__perf_guest_handle_intel_pt_intr)(); } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); #else static inline unsigned int perf_guest_state(void) { return 0; } static inline unsigned long perf_guest_get_ip(void) { return 0; } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } #endif /* CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); extern void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); extern void put_callchain_entry(int rctx); extern int sysctl_perf_event_max_stack; extern int sysctl_perf_event_max_contexts_per_stack; static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->contexts; return 0; } else { ctx->contexts_maxed = true; return -1; /* no more room, stop walking the stack */ } } static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->nr; return 0; } else { return -1; /* no more room, stop walking the stack */ } } extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 /* Finer grained perf_event_open(2) access control. */ #define PERF_SECURITY_CPU 1 #define PERF_SECURITY_KERNEL 2 #define PERF_SECURITY_TRACEPOINT 3 static inline int perf_is_paranoid(void) { return sysctl_perf_event_paranoid > -1; } int perf_allow_kernel(struct perf_event_attr *attr); static inline int perf_allow_cpu(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_CPU); } static inline int perf_allow_tracepoint(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs); extern unsigned long perf_instruction_pointer(struct perf_event *event, struct pt_regs *regs); #ifndef perf_arch_misc_flags # define perf_arch_misc_flags(regs) \ (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) # define perf_arch_instruction_pointer(regs) instruction_pointer(regs) #endif #ifndef perf_arch_bpf_user_pt_regs # define perf_arch_bpf_user_pt_regs(regs) regs #endif #ifndef perf_arch_guest_misc_flags static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) { unsigned long guest_state = perf_guest_state(); if (!(guest_state & PERF_GUEST_ACTIVE)) return 0; if (guest_state & PERF_GUEST_USER) return PERF_RECORD_MISC_GUEST_USER; else return PERF_RECORD_MISC_GUEST_KERNEL; } # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #endif static inline bool has_branch_stack(struct perf_event *event) { return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; } static inline bool has_aux(struct perf_event *event) { return event->pmu->setup_aux; } static inline bool has_aux_action(struct perf_event *event) { return event->attr.aux_sample_size || event->attr.aux_pause || event->attr.aux_resume; } static inline bool is_write_backward(struct perf_event *event) { return !!event->attr.write_backward; } static inline bool has_addr_filter(struct perf_event *event) { return event->pmu->nr_addr_filters; } /* * An inherited event uses parent's filters */ static inline struct perf_addr_filters_head * perf_event_addr_filters(struct perf_event *event) { struct perf_addr_filters_head *ifh = &event->addr_filters; if (event->parent) ifh = &event->parent->addr_filters; return ifh; } static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) { /* Only the parent has fasync state */ if (event->parent) event = event->parent; return &event->fasync; } extern void perf_event_addr_filters_sync(struct perf_event *event); extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, struct perf_output_handle *handle, unsigned long from, unsigned long to); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } static inline void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { } static inline int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { return -EINVAL; } static inline void * perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void perf_event_task_migrate(struct task_struct *task) { } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { } static inline int perf_event_init_task(struct task_struct *child, u64 clone_flags) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event *perf_get_event(struct file *file) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); } static inline int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { return -EINVAL; } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } static inline int perf_event_refresh(struct perf_event *event, int refresh) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { } static inline void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } static inline int perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else static inline void perf_restore_debug_store(void) { } #endif #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) struct perf_pmu_events_attr { struct device_attribute attr; u64 id; const char *event_str; }; struct perf_pmu_events_ht_attr { struct device_attribute attr; u64 id; const char *event_str_ht; const char *event_str_noht; }; struct perf_pmu_events_hybrid_attr { struct device_attribute attr; u64 id; const char *event_str; u64 pmu_type; }; struct perf_pmu_format_hybrid_attr { struct device_attribute attr; u64 pmu_type; }; ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, \ }; #define PMU_EVENT_ATTR_STRING(_name, _var, _str) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ .id = 0, \ .event_str = _str, \ }; #define PMU_EVENT_ATTR_ID(_name, _show, _id) \ (&((struct perf_pmu_events_attr[]) { \ { .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, } \ })[0].attr.attr) #define PMU_FORMAT_ATTR_SHOW(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ #define PMU_FORMAT_ATTR(_name, _format) \ PMU_FORMAT_ATTR_SHOW(_name, _format) \ \ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) /* Performance counter hotplug functions */ #ifdef CONFIG_PERF_EVENTS int perf_event_init_cpu(unsigned int cpu); int perf_event_exit_cpu(unsigned int cpu); #else #define perf_event_init_cpu NULL #define perf_event_exit_cpu NULL #endif extern void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); /* * Snapshot branch stack on software events. * * Branch stack can be very useful in understanding software events. For * example, when a long function, e.g. sys_perf_event_open, returns an * errno, it is not obvious why the function failed. Branch stack could * provide very helpful information in this type of scenarios. * * On software event, it is necessary to stop the hardware branch recorder * fast. Otherwise, the hardware register/buffer will be flushed with * entries of the triggering event. Therefore, static call is used to * stop the hardware recorder. */ /* * cnt is the number of entries allocated for entries. * Return number of entries copied to . */ typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries, unsigned int cnt); DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); #ifndef PERF_NEEDS_LOPWR_CB static inline void perf_lopwr_cb(bool mode) { } #endif #endif /* _LINUX_PERF_EVENT_H */
549 78 545 21 4 3 3 547 488 5 4 480 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 // SPDX-License-Identifier: GPL-2.0-or-later /* Key permission checking * * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/security.h> #include "internal.h" /** * key_task_permission - Check a key can be used * @key_ref: The key to check. * @cred: The credentials to use. * @need_perm: The permission required. * * Check to see whether permission is granted to use a key in the desired way, * but permit the security modules to override. * * The caller must hold either a ref on cred or must hold the RCU readlock. * * Returns 0 if successful, -EACCES if access is denied based on the * permissions bits or the LSM check. */ int key_task_permission(const key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { struct key *key; key_perm_t kperm, mask; int ret; switch (need_perm) { default: WARN_ON(1); return -EACCES; case KEY_NEED_UNLINK: case KEY_SYSADMIN_OVERRIDE: case KEY_AUTHTOKEN_OVERRIDE: case KEY_DEFER_PERM_CHECK: goto lsm; case KEY_NEED_VIEW: mask = KEY_OTH_VIEW; break; case KEY_NEED_READ: mask = KEY_OTH_READ; break; case KEY_NEED_WRITE: mask = KEY_OTH_WRITE; break; case KEY_NEED_SEARCH: mask = KEY_OTH_SEARCH; break; case KEY_NEED_LINK: mask = KEY_OTH_LINK; break; case KEY_NEED_SETATTR: mask = KEY_OTH_SETATTR; break; } key = key_ref_to_ptr(key_ref); /* use the second 8-bits of permissions for keys the caller owns */ if (uid_eq(key->uid, cred->fsuid)) { kperm = key->perm >> 16; goto use_these_perms; } /* use the third 8-bits of permissions for keys the caller has a group * membership in common with */ if (gid_valid(key->gid) && key->perm & KEY_GRP_ALL) { if (gid_eq(key->gid, cred->fsgid)) { kperm = key->perm >> 8; goto use_these_perms; } ret = groups_search(cred->group_info, key->gid); if (ret) { kperm = key->perm >> 8; goto use_these_perms; } } /* otherwise use the least-significant 8-bits */ kperm = key->perm; use_these_perms: /* use the top 8-bits of permissions for keys the caller possesses * - possessor permissions are additive with other permissions */ if (is_key_possessed(key_ref)) kperm |= key->perm >> 24; if ((kperm & mask) != mask) return -EACCES; /* let LSM be the final arbiter */ lsm: return security_key_permission(key_ref, cred, need_perm); } EXPORT_SYMBOL(key_task_permission); /** * key_validate - Validate a key. * @key: The key to be validated. * * Check that a key is valid, returning 0 if the key is okay, -ENOKEY if the * key is invalidated, -EKEYREVOKED if the key's type has been removed or if * the key has been revoked or -EKEYEXPIRED if the key has expired. */ int key_validate(const struct key *key) { unsigned long flags = READ_ONCE(key->flags); time64_t expiry = READ_ONCE(key->expiry); if (flags & (1 << KEY_FLAG_INVALIDATED)) return -ENOKEY; /* check it's still accessible */ if (flags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD))) return -EKEYREVOKED; /* check it hasn't expired */ if (expiry) { if (ktime_get_real_seconds() >= expiry) return -EKEYEXPIRED; } return 0; } EXPORT_SYMBOL(key_validate);
6 1 13 3 3 1 2 25 13 15 4 4 12 6 35 1 10 19 35 51 15 36 2 20 20 1 26 23 4 1 2 5 3 5 4 50 51 4 45 6 48 44 1 29 30 47 1 42 6 2 44 6 42 2 42 13 9 22 25 29 2 5 38 5 35 51 2 1 1 47 41 3 39 55 55 17 55 51 44 2 44 55 6 3 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2018-2019 HUAWEI, Inc. * https://www.huawei.com/ */ #include "internal.h" #include <linux/unaligned.h> #include <trace/events/erofs.h> struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; unsigned long lcn; /* compression extent information gathered */ u8 type, headtype; u16 clusterofs; u16 delta[2]; erofs_blk_t pblk, compressedblks; erofs_off_t nextpackoff; bool partialref; }; static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, unsigned long lcn) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); const erofs_off_t pos = Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize) + lcn * sizeof(struct z_erofs_lcluster_index); struct z_erofs_lcluster_index *di; unsigned int advise; di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP); if (IS_ERR(di)) return PTR_ERR(di); m->lcn = lcn; m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); advise = le16_to_cpu(di->di_advise); m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) { if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { DBG_BUGON(1); return -EFSCORRUPTED; } m->compressedblks = m->delta[0] & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); } else { m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF); m->clusterofs = le16_to_cpu(di->di_clusterofs); if (m->clusterofs >= 1 << vi->z_logical_clusterbits) { DBG_BUGON(1); return -EFSCORRUPTED; } m->pblk = le32_to_cpu(di->di_u.blkaddr); } return 0; } static unsigned int decode_compactedbits(unsigned int lobits, u8 *in, unsigned int pos, u8 *type) { const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); const unsigned int lo = v & ((1 << lobits) - 1); *type = (v >> lobits) & 3; return lo; } static int get_compacted_la_distance(unsigned int lobits, unsigned int encodebits, unsigned int vcnt, u8 *in, int i) { unsigned int lo, d1 = 0; u8 type; DBG_BUGON(i >= vcnt); do { lo = decode_compactedbits(lobits, in, encodebits * i, &type); if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) return d1; ++d1; } while (++i < vcnt); /* vcnt - 1 (Z_EROFS_LCLUSTER_TYPE_NONHEAD) item */ if (!(lo & Z_EROFS_LI_D0_CBLKCNT)) d1 += lo - 1; return d1; } static int unpack_compacted_index(struct z_erofs_maprecorder *m, unsigned int amortizedshift, erofs_off_t pos, bool lookahead) { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; bool big_pcluster; u8 *in, type; int i; if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; else if (1 << amortizedshift == 2 && lclusterbits <= 12) vcnt = 16; else return -EOPNOTSUPP; in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP); if (IS_ERR(in)) return PTR_ERR(in); /* it doesn't equal to round_up(..) */ m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; bytes = pos & ((vcnt << amortizedshift) - 1); in -= bytes; i = bytes >> amortizedshift; lo = decode_compactedbits(lobits, in, encodebits * i, &type); m->type = type; if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; /* figure out lookahead_distance: delta[1] if needed */ if (lookahead) m->delta[1] = get_compacted_la_distance(lobits, encodebits, vcnt, in, i); if (lo & Z_EROFS_LI_D0_CBLKCNT) { if (!big_pcluster) { DBG_BUGON(1); return -EFSCORRUPTED; } m->compressedblks = lo & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; return 0; } else if (i + 1 != (int)vcnt) { m->delta[0] = lo; return 0; } /* * since the last lcluster in the pack is special, * of which lo saves delta[1] rather than delta[0]. * Hence, get delta[0] by the previous lcluster indirectly. */ lo = decode_compactedbits(lobits, in, encodebits * (i - 1), &type); if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) lo = 0; else if (lo & Z_EROFS_LI_D0_CBLKCNT) lo = 1; m->delta[0] = lo + 1; return 0; } m->clusterofs = lo; m->delta[0] = 0; /* figout out blkaddr (pblk) for HEAD lclusters */ if (!big_pcluster) { nblk = 1; while (i > 0) { --i; lo = decode_compactedbits(lobits, in, encodebits * i, &type); if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) i -= lo; if (i >= 0) ++nblk; } } else { nblk = 0; while (i > 0) { --i; lo = decode_compactedbits(lobits, in, encodebits * i, &type); if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { if (lo & Z_EROFS_LI_D0_CBLKCNT) { --i; nblk += lo & ~Z_EROFS_LI_D0_CBLKCNT; continue; } /* bigpcluster shouldn't have plain d0 == 1 */ if (lo <= 1) { DBG_BUGON(1); return -EFSCORRUPTED; } i -= lo - 2; continue; } ++nblk; } } in += (vcnt << amortizedshift) - sizeof(__le32); m->pblk = le32_to_cpu(*(__le32 *)in) + nblk; return 0; } static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, unsigned long lcn, bool lookahead) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); unsigned int totalidx = erofs_iblks(inode); unsigned int compacted_4b_initial, compacted_2b; unsigned int amortizedshift; erofs_off_t pos; if (lcn >= totalidx || vi->z_logical_clusterbits > 14) return -EINVAL; m->lcn = lcn; /* used to align to 32-byte (compacted_2b) alignment */ compacted_4b_initial = (32 - ebase % 32) / 4; if (compacted_4b_initial == 32 / 4) compacted_4b_initial = 0; if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && compacted_4b_initial < totalidx) compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); else compacted_2b = 0; pos = ebase; if (lcn < compacted_4b_initial) { amortizedshift = 2; goto out; } pos += compacted_4b_initial * 4; lcn -= compacted_4b_initial; if (lcn < compacted_2b) { amortizedshift = 1; goto out; } pos += compacted_2b * 2; lcn -= compacted_2b; amortizedshift = 2; out: pos += lcn * (1 << amortizedshift); return unpack_compacted_index(m, amortizedshift, pos, lookahead); } static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m, unsigned int lcn, bool lookahead) { switch (EROFS_I(m->inode)->datalayout) { case EROFS_INODE_COMPRESSED_FULL: return z_erofs_load_full_lcluster(m, lcn); case EROFS_INODE_COMPRESSED_COMPACT: return z_erofs_load_compact_lcluster(m, lcn, lookahead); default: return -EINVAL; } } static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, unsigned int lookback_distance) { struct super_block *sb = m->inode->i_sb; struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; while (m->lcn >= lookback_distance) { unsigned long lcn = m->lcn - lookback_distance; int err; err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; switch (m->type) { case Z_EROFS_LCLUSTER_TYPE_NONHEAD: lookback_distance = m->delta[0]; if (!lookback_distance) goto err_bogus; continue; case Z_EROFS_LCLUSTER_TYPE_PLAIN: case Z_EROFS_LCLUSTER_TYPE_HEAD1: case Z_EROFS_LCLUSTER_TYPE_HEAD2: m->headtype = m->type; m->map->m_la = (lcn << lclusterbits) | m->clusterofs; return 0; default: erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", m->type, lcn, vi->nid); DBG_BUGON(1); return -EOPNOTSUPP; } } err_bogus: erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu", lookback_distance, m->lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, unsigned int initial_lcn) { struct super_block *sb = m->inode->i_sb; struct erofs_inode *const vi = EROFS_I(m->inode); struct erofs_map_blocks *const map = m->map; const unsigned int lclusterbits = vi->z_logical_clusterbits; unsigned long lcn; int err; DBG_BUGON(m->type != Z_EROFS_LCLUSTER_TYPE_PLAIN && m->type != Z_EROFS_LCLUSTER_TYPE_HEAD1 && m->type != Z_EROFS_LCLUSTER_TYPE_HEAD2); DBG_BUGON(m->type != m->headtype); if (m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1) && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { map->m_plen = 1ULL << lclusterbits; return 0; } lcn = m->lcn + 1; if (m->compressedblks) goto out; err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; /* * If the 1st NONHEAD lcluster has already been handled initially w/o * valid compressedblks, which means at least it mustn't be CBLKCNT, or * an internal implemenatation error is detected. * * The following code can also handle it properly anyway, but let's * BUG_ON in the debugging mode only for developers to notice that. */ DBG_BUGON(lcn == initial_lcn && m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); switch (m->type) { case Z_EROFS_LCLUSTER_TYPE_PLAIN: case Z_EROFS_LCLUSTER_TYPE_HEAD1: case Z_EROFS_LCLUSTER_TYPE_HEAD2: /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. */ m->compressedblks = 1 << (lclusterbits - sb->s_blocksize_bits); break; case Z_EROFS_LCLUSTER_TYPE_NONHEAD: if (m->delta[0] != 1) goto err_bonus_cblkcnt; if (m->compressedblks) break; fallthrough; default: erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } out: map->m_plen = erofs_pos(sb, m->compressedblks); return 0; err_bonus_cblkcnt: erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) { struct inode *inode = m->inode; struct erofs_inode *vi = EROFS_I(inode); struct erofs_map_blocks *map = m->map; unsigned int lclusterbits = vi->z_logical_clusterbits; u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits; int err; while (1) { /* handle the last EOF pcluster (no next HEAD lcluster) */ if ((lcn << lclusterbits) >= inode->i_size) { map->m_llen = inode->i_size - map->m_la; return 0; } err = z_erofs_load_lcluster_from_disk(m, lcn, true); if (err) return err; if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { /* work around invalid d1 generated by pre-1.0 mkfs */ if (unlikely(!m->delta[1])) { m->delta[1] = 1; DBG_BUGON(1); } } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN || m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 || m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) { if (lcn != headlcn) break; /* ends at the next HEAD lcluster */ m->delta[1] = 1; } else { erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu", m->type, lcn, vi->nid); DBG_BUGON(1); return -EOPNOTSUPP; } lcn += m->delta[1]; } map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la; return 0; } static int z_erofs_do_map_blocks(struct inode *inode, struct erofs_map_blocks *map, int flags) { struct erofs_inode *const vi = EROFS_I(inode); bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; struct z_erofs_maprecorder m = { .inode = inode, .map = map, }; int err = 0; unsigned int lclusterbits, endoff, afmt; unsigned long initial_lcn; unsigned long long ofs, end; lclusterbits = vi->z_logical_clusterbits; ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la; initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); err = z_erofs_load_lcluster_from_disk(&m, initial_lcn, false); if (err) goto unmap_out; if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL)) vi->z_idataoff = m.nextpackoff; map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; switch (m.type) { case Z_EROFS_LCLUSTER_TYPE_PLAIN: case Z_EROFS_LCLUSTER_TYPE_HEAD1: case Z_EROFS_LCLUSTER_TYPE_HEAD2: if (endoff >= m.clusterofs) { m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; /* * For ztailpacking files, in order to inline data more * effectively, special EOF lclusters are now supported * which can have three parts at most. */ if (ztailpacking && end > inode->i_size) end = inode->i_size; break; } /* m.lcn should be >= 1 if endoff < m.clusterofs */ if (!m.lcn) { erofs_err(inode->i_sb, "invalid logical cluster 0 at nid %llu", vi->nid); err = -EFSCORRUPTED; goto unmap_out; } end = (m.lcn << lclusterbits) | m.clusterofs; map->m_flags |= EROFS_MAP_FULL_MAPPED; m.delta[0] = 1; fallthrough; case Z_EROFS_LCLUSTER_TYPE_NONHEAD: /* get the corresponding first chunk */ err = z_erofs_extent_lookback(&m, m.delta[0]); if (err) goto unmap_out; break; default: erofs_err(inode->i_sb, "unknown type %u @ offset %llu of nid %llu", m.type, ofs, vi->nid); err = -EOPNOTSUPP; goto unmap_out; } if (m.partialref) map->m_flags |= EROFS_MAP_PARTIAL_REF; map->m_llen = end - map->m_la; if (flags & EROFS_GET_BLOCKS_FINDTAIL) { vi->z_tailextent_headlcn = m.lcn; /* for non-compact indexes, fragmentoff is 64 bits */ if (fragment && vi->datalayout == EROFS_INODE_COMPRESSED_FULL) vi->z_fragmentoff |= (u64)m.pblk << 32; } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_META; map->m_pa = vi->z_idataoff; map->m_plen = vi->z_idata_size; } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_FRAGMENT; } else { map->m_pa = erofs_pos(inode->i_sb, m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto unmap_out; } if (m.headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN) { if (map->m_llen > map->m_plen) { DBG_BUGON(1); err = -EFSCORRUPTED; goto unmap_out; } afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ? Z_EROFS_COMPRESSION_INTERLACED : Z_EROFS_COMPRESSION_SHIFTED; } else { afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", afmt, vi->nid); err = -EFSCORRUPTED; goto unmap_out; } } map->m_algorithmformat = afmt; if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA || map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE || map->m_algorithmformat == Z_EROFS_COMPRESSION_ZSTD) && map->m_llen >= i_blocksize(inode))) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; } unmap_out: erofs_unmap_metabuf(&m.map->buf); return err; } static int z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; int err, headnr; erofs_off_t pos; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct z_erofs_map_header *h; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { /* * paired with smp_mb() at the end of the function to ensure * fields will only be observed after the bit is set. */ smp_mb(); return 0; } if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) return -ERESTARTSYS; err = 0; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); if (IS_ERR(h)) { err = PTR_ERR(h); goto out_unlock; } /* * if the highest bit of the 8-byte map header is set, the whole file * is stored in the packed inode. The rest bits keeps z_fragmentoff. */ if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); vi->z_tailextent_headlcn = 0; goto done; } vi->z_advise = le16_to_cpu(h->h_advise); vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; headnr = 0; if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", headnr + 1, vi->z_algorithmtype[headnr], vi->nid); err = -EOPNOTSUPP; goto out_put_metabuf; } vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7); if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", vi->nid); err = -EFSCORRUPTED; goto out_put_metabuf; } if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", vi->nid); err = -EFSCORRUPTED; goto out_put_metabuf; } if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { struct erofs_map_blocks map = { .buf = __EROFS_BUF_INITIALIZER }; vi->z_idata_size = le16_to_cpu(h->h_idata_size); err = z_erofs_do_map_blocks(inode, &map, EROFS_GET_BLOCKS_FINDTAIL); erofs_put_metabuf(&map.buf); if (!map.m_plen || erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) { erofs_err(sb, "invalid tail-packing pclustersize %llu", map.m_plen); err = -EFSCORRUPTED; } if (err < 0) goto out_put_metabuf; } if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { struct erofs_map_blocks map = { .buf = __EROFS_BUF_INITIALIZER }; vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); err = z_erofs_do_map_blocks(inode, &map, EROFS_GET_BLOCKS_FINDTAIL); erofs_put_metabuf(&map.buf); if (err < 0) goto out_put_metabuf; } done: /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); out_put_metabuf: erofs_put_metabuf(&buf); out_unlock: clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); return err; } int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { struct erofs_inode *const vi = EROFS_I(inode); int err = 0; trace_erofs_map_blocks_enter(inode, map, flags); if (map->m_la >= inode->i_size) { /* post-EOF unmapped extent */ map->m_llen = map->m_la + 1 - inode->i_size; map->m_la = inode->i_size; map->m_flags = 0; } else { err = z_erofs_fill_inode_lazy(inode); if (!err) { if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && !vi->z_tailextent_headlcn) { map->m_la = 0; map->m_llen = inode->i_size; map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; } else { err = z_erofs_do_map_blocks(inode, map, flags); } } if (!err && (map->m_flags & EROFS_MAP_ENCODED) && unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) err = -EOPNOTSUPP; if (err) map->m_llen = 0; } trace_erofs_map_blocks_exit(inode, map, flags, err); return err; } static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct erofs_map_blocks map = { .m_la = offset }; ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP); erofs_put_metabuf(&map.buf); if (ret < 0) return ret; iomap->bdev = inode->i_sb->s_bdev; iomap->offset = map.m_la; iomap->length = map.m_llen; if (map.m_flags & EROFS_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ? IOMAP_NULL_ADDR : map.m_pa; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; /* * No strict rule on how to describe extents for post EOF, yet * we need to do like below. Otherwise, iomap itself will get * into an endless loop on post EOF. * * Calculate the effective offset by subtracting extent start * (map.m_la) from the requested offset, and add it to length. * (NB: offset >= map.m_la always) */ if (iomap->offset >= inode->i_size) iomap->length = length + offset - map.m_la; } iomap->flags = 0; return 0; } const struct iomap_ops z_erofs_iomap_report_ops = { .iomap_begin = z_erofs_iomap_begin_report, };
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 // SPDX-License-Identifier: GPL-2.0-only #include <linux/types.h> #include <linux/string.h> #include <linux/init.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/dmi.h> #include <linux/efi.h> #include <linux/memblock.h> #include <linux/random.h> #include <asm/dmi.h> #include <linux/unaligned.h> #ifndef SMBIOS_ENTRY_POINT_SCAN_START #define SMBIOS_ENTRY_POINT_SCAN_START 0xF0000 #endif struct kobject *dmi_kobj; EXPORT_SYMBOL_GPL(dmi_kobj); /* * DMI stands for "Desktop Management Interface". It is part * of and an antecedent to, SMBIOS, which stands for System * Management BIOS. See further: https://www.dmtf.org/standards */ static const char dmi_empty_string[] = ""; static u32 dmi_ver __initdata; static u32 dmi_len; static u16 dmi_num; static u8 smbios_entry_point[32]; static int smbios_entry_point_size; /* DMI system identification string used during boot */ static char dmi_ids_string[128] __initdata; static struct dmi_memdev_info { const char *device; const char *bank; u64 size; /* bytes */ u16 handle; u8 type; /* DDR2, DDR3, DDR4 etc */ } *dmi_memdev; static int dmi_memdev_nr; static int dmi_memdev_populated_nr __initdata; static const char * __init dmi_string_nosave(const struct dmi_header *dm, u8 s) { const u8 *bp = ((u8 *) dm) + dm->length; const u8 *nsp; if (s) { while (--s > 0 && *bp) bp += strlen(bp) + 1; /* Strings containing only spaces are considered empty */ nsp = bp; while (*nsp == ' ') nsp++; if (*nsp != '\0') return bp; } return dmi_empty_string; } static const char * __init dmi_string(const struct dmi_header *dm, u8 s) { const char *bp = dmi_string_nosave(dm, s); char *str; size_t len; if (bp == dmi_empty_string) return dmi_empty_string; len = strlen(bp) + 1; str = dmi_alloc(len); if (str != NULL) strcpy(str, bp); return str; } /* * We have to be cautious here. We have seen BIOSes with DMI pointers * pointing to completely the wrong place for example */ static void dmi_decode_table(u8 *buf, void (*decode)(const struct dmi_header *, void *), void *private_data) { u8 *data = buf; int i = 0; /* * Stop when we have seen all the items the table claimed to have * (SMBIOS < 3.0 only) OR we reach an end-of-table marker (SMBIOS * >= 3.0 only) OR we run off the end of the table (should never * happen but sometimes does on bogus implementations.) */ while ((!dmi_num || i < dmi_num) && (data - buf + sizeof(struct dmi_header)) <= dmi_len) { const struct dmi_header *dm = (const struct dmi_header *)data; /* * If a short entry is found (less than 4 bytes), not only it * is invalid, but we cannot reliably locate the next entry. */ if (dm->length < sizeof(struct dmi_header)) { pr_warn(FW_BUG "Corrupted DMI table, offset %zd (only %d entries processed)\n", data - buf, i); break; } /* * We want to know the total length (formatted area and * strings) before decoding to make sure we won't run off the * table in dmi_decode or dmi_string */ data += dm->length; while ((data - buf < dmi_len - 1) && (data[0] || data[1])) data++; if (data - buf < dmi_len - 1) decode(dm, private_data); data += 2; i++; /* * 7.45 End-of-Table (Type 127) [SMBIOS reference spec v3.0.0] * For tables behind a 64-bit entry point, we have no item * count and no exact table length, so stop on end-of-table * marker. For tables behind a 32-bit entry point, we have * seen OEM structures behind the end-of-table marker on * some systems, so don't trust it. */ if (!dmi_num && dm->type == DMI_ENTRY_END_OF_TABLE) break; } /* Trim DMI table length if needed */ if (dmi_len > data - buf) dmi_len = data - buf; } static phys_addr_t dmi_base; static int __init dmi_walk_early(void (*decode)(const struct dmi_header *, void *)) { u8 *buf; u32 orig_dmi_len = dmi_len; buf = dmi_early_remap(dmi_base, orig_dmi_len); if (buf == NULL) return -ENOMEM; dmi_decode_table(buf, decode, NULL); add_device_randomness(buf, dmi_len); dmi_early_unmap(buf, orig_dmi_len); return 0; } static int __init dmi_checksum(const u8 *buf, u8 len) { u8 sum = 0; int a; for (a = 0; a < len; a++) sum += buf[a]; return sum == 0; } static const char *dmi_ident[DMI_STRING_MAX]; static LIST_HEAD(dmi_devices); int dmi_available; EXPORT_SYMBOL_GPL(dmi_available); /* * Save a DMI string */ static void __init dmi_save_ident(const struct dmi_header *dm, int slot, int string) { const char *d = (const char *) dm; const char *p; if (dmi_ident[slot] || dm->length <= string) return; p = dmi_string(dm, d[string]); if (p == NULL) return; dmi_ident[slot] = p; } static void __init dmi_save_release(const struct dmi_header *dm, int slot, int index) { const u8 *minor, *major; char *s; /* If the table doesn't have the field, let's return */ if (dmi_ident[slot] || dm->length < index) return; minor = (u8 *) dm + index; major = (u8 *) dm + index - 1; /* As per the spec, if the system doesn't support this field, * the value is FF */ if (*major == 0xFF && *minor == 0xFF) return; s = dmi_alloc(8); if (!s) return; sprintf(s, "%u.%u", *major, *minor); dmi_ident[slot] = s; } static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int index) { const u8 *d; char *s; int is_ff = 1, is_00 = 1, i; if (dmi_ident[slot] || dm->length < index + 16) return; d = (u8 *) dm + index; for (i = 0; i < 16 && (is_ff || is_00); i++) { if (d[i] != 0x00) is_00 = 0; if (d[i] != 0xFF) is_ff = 0; } if (is_ff || is_00) return; s = dmi_alloc(16*2+4+1); if (!s) return; /* * As of version 2.6 of the SMBIOS specification, the first 3 fields of * the UUID are supposed to be little-endian encoded. The specification * says that this is the defacto standard. */ if (dmi_ver >= 0x020600) sprintf(s, "%pUl", d); else sprintf(s, "%pUb", d); dmi_ident[slot] = s; } static void __init dmi_save_type(const struct dmi_header *dm, int slot, int index) { const u8 *d; char *s; if (dmi_ident[slot] || dm->length <= index) return; s = dmi_alloc(4); if (!s) return; d = (u8 *) dm + index; sprintf(s, "%u", *d & 0x7F); dmi_ident[slot] = s; } static void __init dmi_save_one_device(int type, const char *name) { struct dmi_device *dev; /* No duplicate device */ if (dmi_find_device(type, name, NULL)) return; dev = dmi_alloc(sizeof(*dev) + strlen(name) + 1); if (!dev) return; dev->type = type; strcpy((char *)(dev + 1), name); dev->name = (char *)(dev + 1); dev->device_data = NULL; list_add(&dev->list, &dmi_devices); } static void __init dmi_save_devices(const struct dmi_header *dm) { int i, count = (dm->length - sizeof(struct dmi_header)) / 2; for (i = 0; i < count; i++) { const char *d = (char *)(dm + 1) + (i * 2); /* Skip disabled device */ if ((*d & 0x80) == 0) continue; dmi_save_one_device(*d & 0x7f, dmi_string_nosave(dm, *(d + 1))); } } static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm) { int i, count; struct dmi_device *dev; if (dm->length < 0x05) return; count = *(u8 *)(dm + 1); for (i = 1; i <= count; i++) { const char *devname = dmi_string(dm, i); if (devname == dmi_empty_string) continue; dev = dmi_alloc(sizeof(*dev)); if (!dev) break; dev->type = DMI_DEV_TYPE_OEM_STRING; dev->name = devname; dev->device_data = NULL; list_add(&dev->list, &dmi_devices); } } static void __init dmi_save_ipmi_device(const struct dmi_header *dm) { struct dmi_device *dev; void *data; data = dmi_alloc(dm->length); if (data == NULL) return; memcpy(data, dm, dm->length); dev = dmi_alloc(sizeof(*dev)); if (!dev) return; dev->type = DMI_DEV_TYPE_IPMI; dev->name = "IPMI controller"; dev->device_data = data; list_add_tail(&dev->list, &dmi_devices); } static void __init dmi_save_dev_pciaddr(int instance, int segment, int bus, int devfn, const char *name, int type) { struct dmi_dev_onboard *dev; /* Ignore invalid values */ if (type == DMI_DEV_TYPE_DEV_SLOT && segment == 0xFFFF && bus == 0xFF && devfn == 0xFF) return; dev = dmi_alloc(sizeof(*dev) + strlen(name) + 1); if (!dev) return; dev->instance = instance; dev->segment = segment; dev->bus = bus; dev->devfn = devfn; strcpy((char *)&dev[1], name); dev->dev.type = type; dev->dev.name = (char *)&dev[1]; dev->dev.device_data = dev; list_add(&dev->dev.list, &dmi_devices); } static void __init dmi_save_extended_devices(const struct dmi_header *dm) { const char *name; const u8 *d = (u8 *)dm; if (dm->length < 0x0B) return; /* Skip disabled device */ if ((d[0x5] & 0x80) == 0) return; name = dmi_string_nosave(dm, d[0x4]); dmi_save_dev_pciaddr(d[0x6], *(u16 *)(d + 0x7), d[0x9], d[0xA], name, DMI_DEV_TYPE_DEV_ONBOARD); dmi_save_one_device(d[0x5] & 0x7f, name); } static void __init dmi_save_system_slot(const struct dmi_header *dm) { const u8 *d = (u8 *)dm; /* Need SMBIOS 2.6+ structure */ if (dm->length < 0x11) return; dmi_save_dev_pciaddr(*(u16 *)(d + 0x9), *(u16 *)(d + 0xD), d[0xF], d[0x10], dmi_string_nosave(dm, d[0x4]), DMI_DEV_TYPE_DEV_SLOT); } static void __init count_mem_devices(const struct dmi_header *dm, void *v) { if (dm->type != DMI_ENTRY_MEM_DEVICE) return; dmi_memdev_nr++; } static void __init save_mem_devices(const struct dmi_header *dm, void *v) { const char *d = (const char *)dm; static int nr; u64 bytes; u16 size; if (dm->type != DMI_ENTRY_MEM_DEVICE || dm->length < 0x13) return; if (nr >= dmi_memdev_nr) { pr_warn(FW_BUG "Too many DIMM entries in SMBIOS table\n"); return; } dmi_memdev[nr].handle = get_unaligned(&dm->handle); dmi_memdev[nr].device = dmi_string(dm, d[0x10]); dmi_memdev[nr].bank = dmi_string(dm, d[0x11]); dmi_memdev[nr].type = d[0x12]; size = get_unaligned((u16 *)&d[0xC]); if (size == 0) bytes = 0; else if (size == 0xffff) bytes = ~0ull; else if (size & 0x8000) bytes = (u64)(size & 0x7fff) << 10; else if (size != 0x7fff || dm->length < 0x20) bytes = (u64)size << 20; else bytes = (u64)get_unaligned((u32 *)&d[0x1C]) << 20; if (bytes) dmi_memdev_populated_nr++; dmi_memdev[nr].size = bytes; nr++; } static void __init dmi_memdev_walk(void) { if (dmi_walk_early(count_mem_devices) == 0 && dmi_memdev_nr) { dmi_memdev = dmi_alloc(sizeof(*dmi_memdev) * dmi_memdev_nr); if (dmi_memdev) dmi_walk_early(save_mem_devices); } } /* * Process a DMI table entry. Right now all we care about are the BIOS * and machine entries. For 2.5 we should pull the smbus controller info * out of here. */ static void __init dmi_decode(const struct dmi_header *dm, void *dummy) { switch (dm->type) { case 0: /* BIOS Information */ dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); dmi_save_ident(dm, DMI_BIOS_VERSION, 5); dmi_save_ident(dm, DMI_BIOS_DATE, 8); dmi_save_release(dm, DMI_BIOS_RELEASE, 21); dmi_save_release(dm, DMI_EC_FIRMWARE_RELEASE, 23); break; case 1: /* System Information */ dmi_save_ident(dm, DMI_SYS_VENDOR, 4); dmi_save_ident(dm, DMI_PRODUCT_NAME, 5); dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6); dmi_save_ident(dm, DMI_PRODUCT_SERIAL, 7); dmi_save_uuid(dm, DMI_PRODUCT_UUID, 8); dmi_save_ident(dm, DMI_PRODUCT_SKU, 25); dmi_save_ident(dm, DMI_PRODUCT_FAMILY, 26); break; case 2: /* Base Board Information */ dmi_save_ident(dm, DMI_BOARD_VENDOR, 4); dmi_save_ident(dm, DMI_BOARD_NAME, 5); dmi_save_ident(dm, DMI_BOARD_VERSION, 6); dmi_save_ident(dm, DMI_BOARD_SERIAL, 7); dmi_save_ident(dm, DMI_BOARD_ASSET_TAG, 8); break; case 3: /* Chassis Information */ dmi_save_ident(dm, DMI_CHASSIS_VENDOR, 4); dmi_save_type(dm, DMI_CHASSIS_TYPE, 5); dmi_save_ident(dm, DMI_CHASSIS_VERSION, 6); dmi_save_ident(dm, DMI_CHASSIS_SERIAL, 7); dmi_save_ident(dm, DMI_CHASSIS_ASSET_TAG, 8); break; case 9: /* System Slots */ dmi_save_system_slot(dm); break; case 10: /* Onboard Devices Information */ dmi_save_devices(dm); break; case 11: /* OEM Strings */ dmi_save_oem_strings_devices(dm); break; case 38: /* IPMI Device Information */ dmi_save_ipmi_device(dm); break; case 41: /* Onboard Devices Extended Information */ dmi_save_extended_devices(dm); } } static int __init print_filtered(char *buf, size_t len, const char *info) { int c = 0; const char *p; if (!info) return c; for (p = info; *p; p++) if (isprint(*p)) c += scnprintf(buf + c, len - c, "%c", *p); else c += scnprintf(buf + c, len - c, "\\x%02x", *p & 0xff); return c; } static void __init dmi_format_ids(char *buf, size_t len) { int c = 0; const char *board; /* Board Name is optional */ c += print_filtered(buf + c, len - c, dmi_get_system_info(DMI_SYS_VENDOR)); c += scnprintf(buf + c, len - c, " "); c += print_filtered(buf + c, len - c, dmi_get_system_info(DMI_PRODUCT_NAME)); board = dmi_get_system_info(DMI_BOARD_NAME); if (board) { c += scnprintf(buf + c, len - c, "/"); c += print_filtered(buf + c, len - c, board); } c += scnprintf(buf + c, len - c, ", BIOS "); c += print_filtered(buf + c, len - c, dmi_get_system_info(DMI_BIOS_VERSION)); c += scnprintf(buf + c, len - c, " "); c += print_filtered(buf + c, len - c, dmi_get_system_info(DMI_BIOS_DATE)); } /* * Check for DMI/SMBIOS headers in the system firmware image. Any * SMBIOS header must start 16 bytes before the DMI header, so take a * 32 byte buffer and check for DMI at offset 16 and SMBIOS at offset * 0. If the DMI header is present, set dmi_ver accordingly (SMBIOS * takes precedence) and return 0. Otherwise return 1. */ static int __init dmi_present(const u8 *buf) { u32 smbios_ver; /* * The size of this structure is 31 bytes, but we also accept value * 30 due to a mistake in SMBIOS specification version 2.1. */ if (memcmp(buf, "_SM_", 4) == 0 && buf[5] >= 30 && buf[5] <= 32 && dmi_checksum(buf, buf[5])) { smbios_ver = get_unaligned_be16(buf + 6); smbios_entry_point_size = buf[5]; memcpy(smbios_entry_point, buf, smbios_entry_point_size); /* Some BIOS report weird SMBIOS version, fix that up */ switch (smbios_ver) { case 0x021F: case 0x0221: pr_debug("SMBIOS version fixup (2.%d->2.%d)\n", smbios_ver & 0xFF, 3); smbios_ver = 0x0203; break; case 0x0233: pr_debug("SMBIOS version fixup (2.%d->2.%d)\n", 51, 6); smbios_ver = 0x0206; break; } } else { smbios_ver = 0; } buf += 16; if (memcmp(buf, "_DMI_", 5) == 0 && dmi_checksum(buf, 15)) { if (smbios_ver) dmi_ver = smbios_ver; else dmi_ver = (buf[14] & 0xF0) << 4 | (buf[14] & 0x0F); dmi_ver <<= 8; dmi_num = get_unaligned_le16(buf + 12); dmi_len = get_unaligned_le16(buf + 6); dmi_base = get_unaligned_le32(buf + 8); if (dmi_walk_early(dmi_decode) == 0) { if (smbios_ver) { pr_info("SMBIOS %d.%d present.\n", dmi_ver >> 16, (dmi_ver >> 8) & 0xFF); } else { smbios_entry_point_size = 15; memcpy(smbios_entry_point, buf, smbios_entry_point_size); pr_info("Legacy DMI %d.%d present.\n", dmi_ver >> 16, (dmi_ver >> 8) & 0xFF); } dmi_format_ids(dmi_ids_string, sizeof(dmi_ids_string)); pr_info("DMI: %s\n", dmi_ids_string); return 0; } } return 1; } /* * Check for the SMBIOS 3.0 64-bit entry point signature. Unlike the legacy * 32-bit entry point, there is no embedded DMI header (_DMI_) in here. */ static int __init dmi_smbios3_present(const u8 *buf) { if (memcmp(buf, "_SM3_", 5) == 0 && buf[6] >= 24 && buf[6] <= 32 && dmi_checksum(buf, buf[6])) { dmi_ver = get_unaligned_be24(buf + 7); dmi_num = 0; /* No longer specified */ dmi_len = get_unaligned_le32(buf + 12); dmi_base = get_unaligned_le64(buf + 16); smbios_entry_point_size = buf[6]; memcpy(smbios_entry_point, buf, smbios_entry_point_size); if (dmi_walk_early(dmi_decode) == 0) { pr_info("SMBIOS %d.%d.%d present.\n", dmi_ver >> 16, (dmi_ver >> 8) & 0xFF, dmi_ver & 0xFF); dmi_format_ids(dmi_ids_string, sizeof(dmi_ids_string)); pr_info("DMI: %s\n", dmi_ids_string); return 0; } } return 1; } static void __init dmi_scan_machine(void) { char __iomem *p, *q; char buf[32]; if (efi_enabled(EFI_CONFIG_TABLES)) { /* * According to the DMTF SMBIOS reference spec v3.0.0, it is * allowed to define both the 64-bit entry point (smbios3) and * the 32-bit entry point (smbios), in which case they should * either both point to the same SMBIOS structure table, or the * table pointed to by the 64-bit entry point should contain a * superset of the table contents pointed to by the 32-bit entry * point (section 5.2) * This implies that the 64-bit entry point should have * precedence if it is defined and supported by the OS. If we * have the 64-bit entry point, but fail to decode it, fall * back to the legacy one (if available) */ if (efi.smbios3 != EFI_INVALID_TABLE_ADDR) { p = dmi_early_remap(efi.smbios3, 32); if (p == NULL) goto error; memcpy_fromio(buf, p, 32); dmi_early_unmap(p, 32); if (!dmi_smbios3_present(buf)) { dmi_available = 1; return; } } if (efi.smbios == EFI_INVALID_TABLE_ADDR) goto error; /* This is called as a core_initcall() because it isn't * needed during early boot. This also means we can * iounmap the space when we're done with it. */ p = dmi_early_remap(efi.smbios, 32); if (p == NULL) goto error; memcpy_fromio(buf, p, 32); dmi_early_unmap(p, 32); if (!dmi_present(buf)) { dmi_available = 1; return; } } else if (IS_ENABLED(CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK)) { p = dmi_early_remap(SMBIOS_ENTRY_POINT_SCAN_START, 0x10000); if (p == NULL) goto error; /* * Same logic as above, look for a 64-bit entry point * first, and if not found, fall back to 32-bit entry point. */ memcpy_fromio(buf, p, 16); for (q = p + 16; q < p + 0x10000; q += 16) { memcpy_fromio(buf + 16, q, 16); if (!dmi_smbios3_present(buf)) { dmi_available = 1; dmi_early_unmap(p, 0x10000); return; } memcpy(buf, buf + 16, 16); } /* * Iterate over all possible DMI header addresses q. * Maintain the 32 bytes around q in buf. On the * first iteration, substitute zero for the * out-of-range bytes so there is no chance of falsely * detecting an SMBIOS header. */ memset(buf, 0, 16); for (q = p; q < p + 0x10000; q += 16) { memcpy_fromio(buf + 16, q, 16); if (!dmi_present(buf)) { dmi_available = 1; dmi_early_unmap(p, 0x10000); return; } memcpy(buf, buf + 16, 16); } dmi_early_unmap(p, 0x10000); } error: pr_info("DMI not present or invalid.\n"); } static BIN_ATTR_SIMPLE_ADMIN_RO(smbios_entry_point); static BIN_ATTR_SIMPLE_ADMIN_RO(DMI); static int __init dmi_init(void) { struct kobject *tables_kobj; u8 *dmi_table; int ret = -ENOMEM; if (!dmi_available) return 0; /* * Set up dmi directory at /sys/firmware/dmi. This entry should stay * even after farther error, as it can be used by other modules like * dmi-sysfs. */ dmi_kobj = kobject_create_and_add("dmi", firmware_kobj); if (!dmi_kobj) goto err; tables_kobj = kobject_create_and_add("tables", dmi_kobj); if (!tables_kobj) goto err; dmi_table = dmi_remap(dmi_base, dmi_len); if (!dmi_table) goto err_tables; bin_attr_smbios_entry_point.size = smbios_entry_point_size; bin_attr_smbios_entry_point.private = smbios_entry_point; ret = sysfs_create_bin_file(tables_kobj, &bin_attr_smbios_entry_point); if (ret) goto err_unmap; bin_attr_DMI.size = dmi_len; bin_attr_DMI.private = dmi_table; ret = sysfs_create_bin_file(tables_kobj, &bin_attr_DMI); if (!ret) return 0; sysfs_remove_bin_file(tables_kobj, &bin_attr_smbios_entry_point); err_unmap: dmi_unmap(dmi_table); err_tables: kobject_del(tables_kobj); kobject_put(tables_kobj); err: pr_err("dmi: Firmware registration failed.\n"); return ret; } subsys_initcall(dmi_init); /** * dmi_setup - scan and setup DMI system information * * Scan the DMI system information. This setups DMI identifiers * (dmi_system_id) for printing it out on task dumps and prepares * DIMM entry information (dmi_memdev_info) from the SMBIOS table * for using this when reporting memory errors. */ void __init dmi_setup(void) { dmi_scan_machine(); if (!dmi_available) return; dmi_memdev_walk(); pr_info("DMI: Memory slots populated: %d/%d\n", dmi_memdev_populated_nr, dmi_memdev_nr); dump_stack_set_arch_desc("%s", dmi_ids_string); } /** * dmi_matches - check if dmi_system_id structure matches system DMI data * @dmi: pointer to the dmi_system_id structure to check */ static bool dmi_matches(const struct dmi_system_id *dmi) { int i; for (i = 0; i < ARRAY_SIZE(dmi->matches); i++) { int s = dmi->matches[i].slot; if (s == DMI_NONE) break; if (s == DMI_OEM_STRING) { /* DMI_OEM_STRING must be exact match */ const struct dmi_device *valid; valid = dmi_find_device(DMI_DEV_TYPE_OEM_STRING, dmi->matches[i].substr, NULL); if (valid) continue; } else if (dmi_ident[s]) { if (dmi->matches[i].exact_match) { if (!strcmp(dmi_ident[s], dmi->matches[i].substr)) continue; } else { if (strstr(dmi_ident[s], dmi->matches[i].substr)) continue; } } /* No match */ return false; } return true; } /** * dmi_is_end_of_table - check for end-of-table marker * @dmi: pointer to the dmi_system_id structure to check */ static bool dmi_is_end_of_table(const struct dmi_system_id *dmi) { return dmi->matches[0].slot == DMI_NONE; } /** * dmi_check_system - check system DMI data * @list: array of dmi_system_id structures to match against * All non-null elements of the list must match * their slot's (field index's) data (i.e., each * list string must be a substring of the specified * DMI slot's string data) to be considered a * successful match. * * Walk the blacklist table running matching functions until someone * returns non zero or we hit the end. Callback function is called for * each successful match. Returns the number of matches. * * dmi_setup must be called before this function is called. */ int dmi_check_system(const struct dmi_system_id *list) { int count = 0; const struct dmi_system_id *d; for (d = list; !dmi_is_end_of_table(d); d++) if (dmi_matches(d)) { count++; if (d->callback && d->callback(d)) break; } return count; } EXPORT_SYMBOL(dmi_check_system); /** * dmi_first_match - find dmi_system_id structure matching system DMI data * @list: array of dmi_system_id structures to match against * All non-null elements of the list must match * their slot's (field index's) data (i.e., each * list string must be a substring of the specified * DMI slot's string data) to be considered a * successful match. * * Walk the blacklist table until the first match is found. Return the * pointer to the matching entry or NULL if there's no match. * * dmi_setup must be called before this function is called. */ const struct dmi_system_id *dmi_first_match(const struct dmi_system_id *list) { const struct dmi_system_id *d; for (d = list; !dmi_is_end_of_table(d); d++) if (dmi_matches(d)) return d; return NULL; } EXPORT_SYMBOL(dmi_first_match); /** * dmi_get_system_info - return DMI data value * @field: data index (see enum dmi_field) * * Returns one DMI data value, can be used to perform * complex DMI data checks. */ const char *dmi_get_system_info(int field) { return dmi_ident[field]; } EXPORT_SYMBOL(dmi_get_system_info); /** * dmi_name_in_serial - Check if string is in the DMI product serial information * @str: string to check for */ int dmi_name_in_serial(const char *str) { int f = DMI_PRODUCT_SERIAL; if (dmi_ident[f] && strstr(dmi_ident[f], str)) return 1; return 0; } /** * dmi_name_in_vendors - Check if string is in the DMI system or board vendor name * @str: Case sensitive Name */ int dmi_name_in_vendors(const char *str) { static int fields[] = { DMI_SYS_VENDOR, DMI_BOARD_VENDOR, DMI_NONE }; int i; for (i = 0; fields[i] != DMI_NONE; i++) { int f = fields[i]; if (dmi_ident[f] && strstr(dmi_ident[f], str)) return 1; } return 0; } EXPORT_SYMBOL(dmi_name_in_vendors); /** * dmi_find_device - find onboard device by type/name * @type: device type or %DMI_DEV_TYPE_ANY to match all device types * @name: device name string or %NULL to match all * @from: previous device found in search, or %NULL for new search. * * Iterates through the list of known onboard devices. If a device is * found with a matching @type and @name, a pointer to its device * structure is returned. Otherwise, %NULL is returned. * A new search is initiated by passing %NULL as the @from argument. * If @from is not %NULL, searches continue from next device. */ const struct dmi_device *dmi_find_device(int type, const char *name, const struct dmi_device *from) { const struct list_head *head = from ? &from->list : &dmi_devices; struct list_head *d; for (d = head->next; d != &dmi_devices; d = d->next) { const struct dmi_device *dev = list_entry(d, struct dmi_device, list); if (((type == DMI_DEV_TYPE_ANY) || (dev->type == type)) && ((name == NULL) || (strcmp(dev->name, name) == 0))) return dev; } return NULL; } EXPORT_SYMBOL(dmi_find_device); /** * dmi_get_date - parse a DMI date * @field: data index (see enum dmi_field) * @yearp: optional out parameter for the year * @monthp: optional out parameter for the month * @dayp: optional out parameter for the day * * The date field is assumed to be in the form resembling * [mm[/dd]]/yy[yy] and the result is stored in the out * parameters any or all of which can be omitted. * * If the field doesn't exist, all out parameters are set to zero * and false is returned. Otherwise, true is returned with any * invalid part of date set to zero. * * On return, year, month and day are guaranteed to be in the * range of [0,9999], [0,12] and [0,31] respectively. */ bool dmi_get_date(int field, int *yearp, int *monthp, int *dayp) { int year = 0, month = 0, day = 0; bool exists; const char *s, *y; char *e; s = dmi_get_system_info(field); exists = s; if (!exists) goto out; /* * Determine year first. We assume the date string resembles * mm/dd/yy[yy] but the original code extracted only the year * from the end. Keep the behavior in the spirit of no * surprises. */ y = strrchr(s, '/'); if (!y) goto out; y++; year = simple_strtoul(y, &e, 10); if (y != e && year < 100) { /* 2-digit year */ year += 1900; if (year < 1996) /* no dates < spec 1.0 */ year += 100; } if (year > 9999) /* year should fit in %04d */ year = 0; /* parse the mm and dd */ month = simple_strtoul(s, &e, 10); if (s == e || *e != '/' || !month || month > 12) { month = 0; goto out; } s = e + 1; day = simple_strtoul(s, &e, 10); if (s == y || s == e || *e != '/' || day > 31) day = 0; out: if (yearp) *yearp = year; if (monthp) *monthp = month; if (dayp) *dayp = day; return exists; } EXPORT_SYMBOL(dmi_get_date); /** * dmi_get_bios_year - get a year out of DMI_BIOS_DATE field * * Returns year on success, -ENXIO if DMI is not selected, * or a different negative error code if DMI field is not present * or not parseable. */ int dmi_get_bios_year(void) { bool exists; int year; exists = dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL); if (!exists) return -ENODATA; return year ? year : -ERANGE; } EXPORT_SYMBOL(dmi_get_bios_year); /** * dmi_walk - Walk the DMI table and get called back for every record * @decode: Callback function * @private_data: Private data to be passed to the callback function * * Returns 0 on success, -ENXIO if DMI is not selected or not present, * or a different negative error code if DMI walking fails. */ int dmi_walk(void (*decode)(const struct dmi_header *, void *), void *private_data) { u8 *buf; if (!dmi_available) return -ENXIO; buf = dmi_remap(dmi_base, dmi_len); if (buf == NULL) return -ENOMEM; dmi_decode_table(buf, decode, private_data); dmi_unmap(buf); return 0; } EXPORT_SYMBOL_GPL(dmi_walk); /** * dmi_match - compare a string to the dmi field (if exists) * @f: DMI field identifier * @str: string to compare the DMI field to * * Returns true if the requested field equals to the str (including NULL). */ bool dmi_match(enum dmi_field f, const char *str) { const char *info = dmi_get_system_info(f); if (info == NULL || str == NULL) return info == str; return !strcmp(info, str); } EXPORT_SYMBOL_GPL(dmi_match); void dmi_memdev_name(u16 handle, const char **bank, const char **device) { int n; if (dmi_memdev == NULL) return; for (n = 0; n < dmi_memdev_nr; n++) { if (handle == dmi_memdev[n].handle) { *bank = dmi_memdev[n].bank; *device = dmi_memdev[n].device; break; } } } EXPORT_SYMBOL_GPL(dmi_memdev_name); u64 dmi_memdev_size(u16 handle) { int n; if (dmi_memdev) { for (n = 0; n < dmi_memdev_nr; n++) { if (handle == dmi_memdev[n].handle) return dmi_memdev[n].size; } } return ~0ull; } EXPORT_SYMBOL_GPL(dmi_memdev_size); /** * dmi_memdev_type - get the memory type * @handle: DMI structure handle * * Return the DMI memory type of the module in the slot associated with the * given DMI handle, or 0x0 if no such DMI handle exists. */ u8 dmi_memdev_type(u16 handle) { int n; if (dmi_memdev) { for (n = 0; n < dmi_memdev_nr; n++) { if (handle == dmi_memdev[n].handle) return dmi_memdev[n].type; } } return 0x0; /* Not a valid value */ } EXPORT_SYMBOL_GPL(dmi_memdev_type); /** * dmi_memdev_handle - get the DMI handle of a memory slot * @slot: slot number * * Return the DMI handle associated with a given memory slot, or %0xFFFF * if there is no such slot. */ u16 dmi_memdev_handle(int slot) { if (dmi_memdev && slot >= 0 && slot < dmi_memdev_nr) return dmi_memdev[slot].handle; return 0xffff; /* Not a valid value */ } EXPORT_SYMBOL_GPL(dmi_memdev_handle);
58 58 48 58 21 58 32 58 20 21 21 21 59 58 59 58 46 51 5 59 42 48 13 59 57 58 58 58 58 61 55 6 3 1 1 1 1 1 1 3 3 62 59 62 61 58 60 21 60 32 59 60 59 60 60 60 60 61 22 23 6 255 253 256 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 // SPDX-License-Identifier: GPL-2.0 /* * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, * for the blk-mq scheduling framework * * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk> */ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/sbitmap.h> #include <trace/events/block.h> #include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" /* * See Documentation/block/deadline-iosched.rst */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ /* * Time after which to dispatch lower priority requests even if higher * priority requests are pending. */ static const int prio_aging_expire = 10 * HZ; static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ enum dd_data_dir { DD_READ = READ, DD_WRITE = WRITE, }; enum { DD_DIR_COUNT = 2 }; enum dd_prio { DD_RT_PRIO = 0, DD_BE_PRIO = 1, DD_IDLE_PRIO = 2, DD_PRIO_MAX = 2, }; enum { DD_PRIO_COUNT = 3 }; /* * I/O statistics per I/O priority. It is fine if these counters overflow. * What matters is that these counters are at least as wide as * log2(max_outstanding_requests). */ struct io_stats_per_prio { uint32_t inserted; uint32_t merged; uint32_t dispatched; atomic_t completed; }; /* * Deadline scheduler data per I/O priority (enum dd_prio). Requests are * present on both sort_list[] and fifo_list[]. */ struct dd_per_prio { struct list_head dispatch; struct rb_root sort_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT]; /* Position of the most recently dispatched request. */ sector_t latest_pos[DD_DIR_COUNT]; struct io_stats_per_prio stats; }; struct deadline_data { /* * run time data */ struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ enum dd_data_dir last_dir; unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ /* * settings that change how the i/o scheduler behaves */ int fifo_expire[DD_DIR_COUNT]; int fifo_batch; int writes_starved; int front_merges; u32 async_depth; int prio_aging_expire; spinlock_t lock; }; /* Maps an I/O priority class to a deadline scheduler priority. */ static const enum dd_prio ioprio_class_to_prio[] = { [IOPRIO_CLASS_NONE] = DD_BE_PRIO, [IOPRIO_CLASS_RT] = DD_RT_PRIO, [IOPRIO_CLASS_BE] = DD_BE_PRIO, [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, }; static inline struct rb_root * deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) { return &per_prio->sort_list[rq_data_dir(rq)]; } /* * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a * request. */ static u8 dd_rq_ioclass(struct request *rq) { return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); } /* * Return the first request for which blk_rq_pos() >= @pos. */ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, enum dd_data_dir data_dir, sector_t pos) { struct rb_node *node = per_prio->sort_list[data_dir].rb_node; struct request *rq, *res = NULL; if (!node) return NULL; rq = rb_entry_rq(node); while (node) { rq = rb_entry_rq(node); if (blk_rq_pos(rq) >= pos) { res = rq; node = node->rb_left; } else { node = node->rb_right; } } return res; } static void deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { struct rb_root *root = deadline_rb_root(per_prio, rq); elv_rb_add(root, rq); } static inline void deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { elv_rb_del(deadline_rb_root(per_prio, rq), rq); } /* * remove rq from rbtree and fifo. */ static void deadline_remove_request(struct request_queue *q, struct dd_per_prio *per_prio, struct request *rq) { list_del_init(&rq->queuelist); /* * We might not be on the rbtree, if we are doing an insert merge */ if (!RB_EMPTY_NODE(&rq->rb_node)) deadline_del_rq_rb(per_prio, rq); elv_rqhash_del(q, rq); if (q->last_merge == rq) q->last_merge = NULL; } static void dd_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(req); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; /* * if the merge was a front merge, we need to reposition request */ if (type == ELEVATOR_FRONT_MERGE) { elv_rb_del(deadline_rb_root(per_prio, req), req); deadline_add_rq_rb(per_prio, req); } } /* * Callback function that is invoked after @next has been merged into @req. */ static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(next); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; lockdep_assert_held(&dd->lock); dd->per_prio[prio].stats.merged++; /* * if next expires before rq, assign its expire time to rq * and move into next position (next will be deleted) in fifo */ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { if (time_before((unsigned long)next->fifo_time, (unsigned long)req->fifo_time)) { list_move(&req->queuelist, &next->queuelist); req->fifo_time = next->fifo_time; } } /* * kill knowledge of next, this one is a goner */ deadline_remove_request(q, &dd->per_prio[prio], next); } /* * move an entry to dispatch queue */ static void deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, struct request *rq) { /* * take it off the sort and fifo list */ deadline_remove_request(rq->q, per_prio, rq); } /* Number of requests queued for a given priority level. */ static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) { const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; lockdep_assert_held(&dd->lock); return stats->inserted - atomic_read(&stats->completed); } /* * deadline_check_fifo returns true if and only if there are expired requests * in the FIFO list. Requires !list_empty(&dd->fifo_list[data_dir]). */ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); return time_is_before_eq_jiffies((unsigned long)rq->fifo_time); } /* * For the specified data direction, return the next request to * dispatch using arrival ordered lists. */ static struct request * deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; return rq_entry_fifo(per_prio->fifo_list[data_dir].next); } /* * For the specified data direction, return the next request to * dispatch using sector position sorted lists. */ static struct request * deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { return deadline_from_pos(per_prio, data_dir, per_prio->latest_pos[data_dir]); } /* * Returns true if and only if @rq started after @latest_start where * @latest_start is in jiffies. */ static bool started_after(struct deadline_data *dd, struct request *rq, unsigned long latest_start) { unsigned long start_time = (unsigned long)rq->fifo_time; start_time -= dd->fifo_expire[rq_data_dir(rq)]; return time_after(start_time, latest_start); } /* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc and with a start time <= @latest_start. */ static struct request *__dd_dispatch_request(struct deadline_data *dd, struct dd_per_prio *per_prio, unsigned long latest_start) { struct request *rq, *next_rq; enum dd_data_dir data_dir; enum dd_prio prio; u8 ioprio_class; lockdep_assert_held(&dd->lock); if (!list_empty(&per_prio->dispatch)) { rq = list_first_entry(&per_prio->dispatch, struct request, queuelist); if (started_after(dd, rq, latest_start)) return NULL; list_del_init(&rq->queuelist); data_dir = rq_data_dir(rq); goto done; } /* * batches are currently reads XOR writes */ rq = deadline_next_request(dd, per_prio, dd->last_dir); if (rq && dd->batching < dd->fifo_batch) { /* we have a next request and are still entitled to batch */ data_dir = rq_data_dir(rq); goto dispatch_request; } /* * at this point we are not running a batch. select the appropriate * data direction (read / write) */ if (!list_empty(&per_prio->fifo_list[DD_READ])) { BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ])); if (deadline_fifo_request(dd, per_prio, DD_WRITE) && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = DD_READ; goto dispatch_find_request; } /* * there are either no reads or writes have been starved */ if (!list_empty(&per_prio->fifo_list[DD_WRITE])) { dispatch_writes: BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE])); dd->starved = 0; data_dir = DD_WRITE; goto dispatch_find_request; } return NULL; dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ next_rq = deadline_next_request(dd, per_prio, data_dir); if (deadline_check_fifo(per_prio, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ rq = deadline_fifo_request(dd, per_prio, data_dir); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ rq = next_rq; } if (!rq) return NULL; dd->last_dir = data_dir; dd->batching = 0; dispatch_request: if (started_after(dd, rq, latest_start)) return NULL; /* * rq is the selected appropriate request. */ dd->batching++; deadline_move_request(dd, per_prio, rq); done: ioprio_class = dd_rq_ioclass(rq); prio = ioprio_class_to_prio[ioprio_class]; dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); dd->per_prio[prio].stats.dispatched++; rq->rq_flags |= RQF_STARTED; return rq; } /* * Check whether there are any requests with priority other than DD_RT_PRIO * that were inserted more than prio_aging_expire jiffies ago. */ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, unsigned long now) { struct request *rq; enum dd_prio prio; int prio_cnt; lockdep_assert_held(&dd->lock); prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) + !!dd_queued(dd, DD_IDLE_PRIO); if (prio_cnt < 2) return NULL; for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) { rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now - dd->prio_aging_expire); if (rq) return rq; } return NULL; } /* * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). * * One confusing aspect here is that we get called for a specific * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared * state for all hardware queues, in terms of sorting, FIFOs, etc. */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; const unsigned long now = jiffies; struct request *rq; enum dd_prio prio; spin_lock(&dd->lock); rq = dd_dispatch_prio_aged_requests(dd, now); if (rq) goto unlock; /* * Next, dispatch requests in priority order. Ignore lower priority * requests if any higher priority requests are pending. */ for (prio = 0; prio <= DD_PRIO_MAX; prio++) { rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now); if (rq || dd_queued(dd, prio)) break; } unlock: spin_unlock(&dd->lock); return rq; } /* * 'depth' is a number in the range 1..INT_MAX representing a number of * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow(). * Values larger than q->nr_requests have the same effect as q->nr_requests. */ static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) { struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; const unsigned int nrr = hctx->queue->nr_requests; return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; } /* * Called by __blk_mq_alloc_request(). The shallow_depth value set by this * function is used by __blk_mq_get_tag(). */ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { struct deadline_data *dd = data->q->elevator->elevator_data; /* Do not throttle synchronous reads. */ if (op_is_sync(opf) && !op_is_write(opf)) return; /* * Throttle asynchronous requests and writes such that these requests * do not block the allocation of synchronous requests. */ data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth); } /* Called by blk_mq_update_nr_requests(). */ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; dd->async_depth = q->nr_requests; sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); } /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { dd_depth_updated(hctx); return 0; } static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; enum dd_prio prio; for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; const struct io_stats_per_prio *stats = &per_prio->stats; uint32_t queued; WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); spin_lock(&dd->lock); queued = dd_queued(dd, prio); spin_unlock(&dd->lock); WARN_ONCE(queued != 0, "statistics for priority %d: i %u m %u d %u c %u\n", prio, stats->inserted, stats->merged, stats->dispatched, atomic_read(&stats->completed)); } kfree(dd); } /* * initialize elevator private data (deadline_data). */ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; struct elevator_queue *eq; enum dd_prio prio; int ret = -ENOMEM; eq = elevator_alloc(q, e); if (!eq) return ret; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) goto put_eq; eq->elevator_data = dd; for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; INIT_LIST_HEAD(&per_prio->dispatch); INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); per_prio->sort_list[DD_READ] = RB_ROOT; per_prio->sort_list[DD_WRITE] = RB_ROOT; } dd->fifo_expire[DD_READ] = read_expire; dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; dd->front_merges = 1; dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; dd->prio_aging_expire = prio_aging_expire; spin_lock_init(&dd->lock); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); q->elevator = eq; return 0; put_eq: kobject_put(&eq->kobj); return ret; } /* * Try to merge @bio into an existing request. If @bio has been merged into * an existing request, store the pointer to that request into *@rq. */ static int dd_request_merge(struct request_queue *q, struct request **rq, struct bio *bio) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; sector_t sector = bio_end_sector(bio); struct request *__rq; if (!dd->front_merges) return ELEVATOR_NO_MERGE; __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); if (elv_bio_merge_ok(__rq, bio)) { *rq = __rq; if (blk_discard_mergable(__rq)) return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } } return ELEVATOR_NO_MERGE; } /* * Attempt to merge a bio into an existing request. This function is called * before @bio is associated with a request. */ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct deadline_data *dd = q->elevator->elevator_data; struct request *free = NULL; bool ret; spin_lock(&dd->lock); ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); spin_unlock(&dd->lock); if (free) blk_mq_free_request(free); return ret; } /* * add rq to rbtree and fifo */ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_insert_t flags, struct list_head *free) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const enum dd_data_dir data_dir = rq_data_dir(rq); u16 ioprio = req_get_ioprio(rq); u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); struct dd_per_prio *per_prio; enum dd_prio prio; lockdep_assert_held(&dd->lock); prio = ioprio_class_to_prio[ioprio_class]; per_prio = &dd->per_prio[prio]; if (!rq->elv.priv[0]) per_prio->stats.inserted++; rq->elv.priv[0] = per_prio; if (blk_mq_sched_try_insert_merge(q, rq, free)) return; trace_block_rq_insert(rq); if (flags & BLK_MQ_INSERT_AT_HEAD) { list_add(&rq->queuelist, &per_prio->dispatch); rq->fifo_time = jiffies; } else { struct list_head *insert_before; deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); if (!q->last_merge) q->last_merge = rq; } /* * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; insert_before = &per_prio->fifo_list[data_dir]; list_add_tail(&rq->queuelist, insert_before); } } /* * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list(). */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, blk_insert_t flags) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; LIST_HEAD(free); spin_lock(&dd->lock); while (!list_empty(list)) { struct request *rq; rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); dd_insert_request(hctx, rq, flags, &free); } spin_unlock(&dd->lock); blk_mq_free_requests(&free); } /* Callback from inside blk_mq_rq_ctx_init(). */ static void dd_prepare_request(struct request *rq) { rq->elv.priv[0] = NULL; } /* * Callback from inside blk_mq_free_request(). */ static void dd_finish_request(struct request *rq) { struct dd_per_prio *per_prio = rq->elv.priv[0]; /* * The block layer core may call dd_finish_request() without having * called dd_insert_requests(). Skip requests that bypassed I/O * scheduling. See also blk_mq_request_bypass_insert(). */ if (per_prio) atomic_inc(&per_prio->stats.completed); } static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) { return !list_empty_careful(&per_prio->dispatch) || !list_empty_careful(&per_prio->fifo_list[DD_READ]) || !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); } static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; enum dd_prio prio; for (prio = 0; prio <= DD_PRIO_MAX; prio++) if (dd_has_work_for_prio(&dd->per_prio[prio])) return true; return false; } /* * sysfs parts below */ #define SHOW_INT(__FUNC, __VAR) \ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct deadline_data *dd = e->elevator_data; \ \ return sysfs_emit(page, "%d\n", __VAR); \ } #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); SHOW_INT(deadline_async_depth_show, dd->async_depth); SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); #undef SHOW_INT #undef SHOW_JIFFIES #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct deadline_data *dd = e->elevator_data; \ int __data, __ret; \ \ __ret = kstrtoint(page, 0, &__data); \ if (__ret < 0) \ return __ret; \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ *(__PTR) = __CONV(__data); \ return count; \ } #define STORE_INT(__FUNC, __PTR, MIN, MAX) \ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) #define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION #undef STORE_INT #undef STORE_JIFFIES #define DD_ATTR(name) \ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(read_expire), DD_ATTR(write_expire), DD_ATTR(writes_starved), DD_ATTR(front_merges), DD_ATTR(async_depth), DD_ATTR(fifo_batch), DD_ATTR(prio_aging_expire), __ATTR_NULL }; #ifdef CONFIG_BLK_DEBUG_FS #define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ static void *deadline_##name##_fifo_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ } \ \ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ loff_t *pos) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ } \ \ static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ __releases(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ \ spin_unlock(&dd->lock); \ } \ \ static const struct seq_operations deadline_##name##_fifo_seq_ops = { \ .start = deadline_##name##_fifo_start, \ .next = deadline_##name##_fifo_next, \ .stop = deadline_##name##_fifo_stop, \ .show = blk_mq_debugfs_rq_show, \ }; \ \ static int deadline_##name##_next_rq_show(void *data, \ struct seq_file *m) \ { \ struct request_queue *q = data; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ struct request *rq; \ \ rq = deadline_from_pos(per_prio, data_dir, \ per_prio->latest_pos[data_dir]); \ if (rq) \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ } DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); #undef DEADLINE_DEBUGFS_DDIR_ATTRS static int deadline_batching_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->batching); return 0; } static int deadline_starved_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->starved); return 0; } static int dd_async_depth_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->async_depth); return 0; } static int dd_queued_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; u32 rt, be, idle; spin_lock(&dd->lock); rt = dd_queued(dd, DD_RT_PRIO); be = dd_queued(dd, DD_BE_PRIO); idle = dd_queued(dd, DD_IDLE_PRIO); spin_unlock(&dd->lock); seq_printf(m, "%u %u %u\n", rt, be, idle); return 0; } /* Number of requests owned by the block driver for a given priority. */ static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) { const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; lockdep_assert_held(&dd->lock); return stats->dispatched + stats->merged - atomic_read(&stats->completed); } static int dd_owned_by_driver_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; u32 rt, be, idle; spin_lock(&dd->lock); rt = dd_owned_by_driver(dd, DD_RT_PRIO); be = dd_owned_by_driver(dd, DD_BE_PRIO); idle = dd_owned_by_driver(dd, DD_IDLE_PRIO); spin_unlock(&dd->lock); seq_printf(m, "%u %u %u\n", rt, be, idle); return 0; } #define DEADLINE_DISPATCH_ATTR(prio) \ static void *deadline_dispatch##prio##_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ return seq_list_start(&per_prio->dispatch, *pos); \ } \ \ static void *deadline_dispatch##prio##_next(struct seq_file *m, \ void *v, loff_t *pos) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ return seq_list_next(v, &per_prio->dispatch, pos); \ } \ \ static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ __releases(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ \ spin_unlock(&dd->lock); \ } \ \ static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ .start = deadline_dispatch##prio##_start, \ .next = deadline_dispatch##prio##_next, \ .stop = deadline_dispatch##prio##_stop, \ .show = blk_mq_debugfs_rq_show, \ } DEADLINE_DISPATCH_ATTR(0); DEADLINE_DISPATCH_ATTR(1); DEADLINE_DISPATCH_ATTR(2); #undef DEADLINE_DISPATCH_ATTR #define DEADLINE_QUEUE_DDIR_ATTRS(name) \ {#name "_fifo_list", 0400, \ .seq_ops = &deadline_##name##_fifo_seq_ops} #define DEADLINE_NEXT_RQ_ATTR(name) \ {#name "_next_rq", 0400, deadline_##name##_next_rq_show} static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { DEADLINE_QUEUE_DDIR_ATTRS(read0), DEADLINE_QUEUE_DDIR_ATTRS(write0), DEADLINE_QUEUE_DDIR_ATTRS(read1), DEADLINE_QUEUE_DDIR_ATTRS(write1), DEADLINE_QUEUE_DDIR_ATTRS(read2), DEADLINE_QUEUE_DDIR_ATTRS(write2), DEADLINE_NEXT_RQ_ATTR(read0), DEADLINE_NEXT_RQ_ATTR(write0), DEADLINE_NEXT_RQ_ATTR(read1), DEADLINE_NEXT_RQ_ATTR(write1), DEADLINE_NEXT_RQ_ATTR(read2), DEADLINE_NEXT_RQ_ATTR(write2), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, {"async_depth", 0400, dd_async_depth_show}, {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, {"owned_by_driver", 0400, dd_owned_by_driver_show}, {"queued", 0400, dd_queued_show}, {}, }; #undef DEADLINE_QUEUE_DDIR_ATTRS #endif static struct elevator_type mq_deadline = { .ops = { .depth_updated = dd_depth_updated, .limit_depth = dd_limit_depth, .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .prepare_request = dd_prepare_request, .finish_request = dd_finish_request, .next_request = elv_rb_latter_request, .former_request = elv_rb_former_request, .bio_merge = dd_bio_merge, .request_merge = dd_request_merge, .requests_merged = dd_merged_requests, .request_merged = dd_request_merged, .has_work = dd_has_work, .init_sched = dd_init_sched, .exit_sched = dd_exit_sched, .init_hctx = dd_init_hctx, }, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = deadline_queue_debugfs_attrs, #endif .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_alias = "deadline", .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); static int __init deadline_init(void) { return elv_register(&mq_deadline); } static void __exit deadline_exit(void) { elv_unregister(&mq_deadline); } module_init(deadline_init); module_exit(deadline_exit); MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MQ deadline IO scheduler");
41 41 81 22 60 421 110 332 1 1 1 1 1 1 41 41 41 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_proto.c: transport protocol load balancing support for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * * Changes: */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <linux/in.h> #include <linux/ip.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/udp.h> #include <linux/stat.h> #include <linux/proc_fs.h> #include <net/ip_vs.h> /* * IPVS protocols can only be registered/unregistered when the ipvs * module is loaded/unloaded, so no lock is needed in accessing the * ipvs protocol table. */ #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ #define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; /* States for conn templates: NONE or words separated with ",", max 15 chars */ static const char *ip_vs_ctpl_state_name_table[IP_VS_CTPL_S_LAST] = { [IP_VS_CTPL_S_NONE] = "NONE", [IP_VS_CTPL_S_ASSURED] = "ASSURED", }; /* * register an ipvs protocol */ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) { unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp->next = ip_vs_proto_table[hash]; ip_vs_proto_table[hash] = pp; if (pp->init != NULL) pp->init(pp); return 0; } /* * register an ipvs protocols netns related data */ static int register_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_protocol *pp) { unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); struct ip_vs_proto_data *pd = kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); if (!pd) return -ENOMEM; pd->pp = pp; /* For speed issues */ pd->next = ipvs->proto_data_table[hash]; ipvs->proto_data_table[hash] = pd; atomic_set(&pd->appcnt, 0); /* Init app counter */ if (pp->init_netns != NULL) { int ret = pp->init_netns(ipvs, pd); if (ret) { /* unlink an free proto data */ ipvs->proto_data_table[hash] = pd->next; kfree(pd); return ret; } } return 0; } /* * unregister an ipvs protocol */ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) { struct ip_vs_protocol **pp_p; unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp_p = &ip_vs_proto_table[hash]; for (; *pp_p; pp_p = &(*pp_p)->next) { if (*pp_p == pp) { *pp_p = pp->next; if (pp->exit != NULL) pp->exit(pp); return 0; } } return -ESRCH; } /* * unregister an ipvs protocols netns data */ static int unregister_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { struct ip_vs_proto_data **pd_p; unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); pd_p = &ipvs->proto_data_table[hash]; for (; *pd_p; pd_p = &(*pd_p)->next) { if (*pd_p == pd) { *pd_p = pd->next; if (pd->pp->exit_netns != NULL) pd->pp->exit_netns(ipvs, pd); kfree(pd); return 0; } } return -ESRCH; } /* * get ip_vs_protocol object by its proto. */ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) { struct ip_vs_protocol *pp; unsigned int hash = IP_VS_PROTO_HASH(proto); for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { if (pp->protocol == proto) return pp; } return NULL; } EXPORT_SYMBOL(ip_vs_proto_get); /* * get ip_vs_protocol object data by netns and proto */ struct ip_vs_proto_data * ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) { struct ip_vs_proto_data *pd; unsigned int hash = IP_VS_PROTO_HASH(proto); for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { if (pd->pp->protocol == proto) return pd; } return NULL; } EXPORT_SYMBOL(ip_vs_proto_data_get); /* * Propagate event for state change to all protocols */ void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) { struct ip_vs_proto_data *pd; int i; for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { if (pd->pp->timeout_change) pd->pp->timeout_change(pd, flags); } } } int * ip_vs_create_timeout_table(int *table, int size) { return kmemdup(table, size, GFP_KERNEL); } const char *ip_vs_state_name(const struct ip_vs_conn *cp) { unsigned int state = cp->state; struct ip_vs_protocol *pp; if (cp->flags & IP_VS_CONN_F_TEMPLATE) { if (state >= IP_VS_CTPL_S_LAST) return "ERR!"; return ip_vs_ctpl_state_name_table[state] ? : "?"; } pp = ip_vs_proto_get(cp->protocol); if (pp == NULL || pp->state_name == NULL) return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!"; return pp->state_name(state); } static void ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { char buf[128]; struct iphdr _iph, *ih; ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) sprintf(buf, "TRUNCATED"); else if (ih->frag_off & htons(IP_OFFSET)) sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); else { __be16 _ports[2], *pptr; pptr = skb_header_pointer(skb, offset + ih->ihl*4, sizeof(_ports), _ports); if (pptr == NULL) sprintf(buf, "TRUNCATED %pI4->%pI4", &ih->saddr, &ih->daddr); else sprintf(buf, "%pI4:%u->%pI4:%u", &ih->saddr, ntohs(pptr[0]), &ih->daddr, ntohs(pptr[1])); } pr_debug("%s: %s %s\n", msg, pp->name, buf); } #ifdef CONFIG_IP_VS_IPV6 static void ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { char buf[192]; struct ipv6hdr _iph, *ih; ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) sprintf(buf, "TRUNCATED"); else if (ih->nexthdr == IPPROTO_FRAGMENT) sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr); else { __be16 _ports[2], *pptr; pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), sizeof(_ports), _ports); if (pptr == NULL) sprintf(buf, "TRUNCATED %pI6c->%pI6c", &ih->saddr, &ih->daddr); else sprintf(buf, "%pI6c:%u->%pI6c:%u", &ih->saddr, ntohs(pptr[0]), &ih->daddr, ntohs(pptr[1])); } pr_debug("%s: %s %s\n", msg, pp->name, buf); } #endif void ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); else #endif ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); } /* * per network name-space init */ int __net_init ip_vs_protocol_net_init(struct netns_ipvs *ipvs) { int i, ret; static struct ip_vs_protocol *protos[] = { #ifdef CONFIG_IP_VS_PROTO_TCP &ip_vs_protocol_tcp, #endif #ifdef CONFIG_IP_VS_PROTO_UDP &ip_vs_protocol_udp, #endif #ifdef CONFIG_IP_VS_PROTO_SCTP &ip_vs_protocol_sctp, #endif #ifdef CONFIG_IP_VS_PROTO_AH &ip_vs_protocol_ah, #endif #ifdef CONFIG_IP_VS_PROTO_ESP &ip_vs_protocol_esp, #endif }; for (i = 0; i < ARRAY_SIZE(protos); i++) { ret = register_ip_vs_proto_netns(ipvs, protos[i]); if (ret < 0) goto cleanup; } return 0; cleanup: ip_vs_protocol_net_cleanup(ipvs); return ret; } void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_proto_data *pd; int i; /* unregister all the ipvs proto data for this netns */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pd = ipvs->proto_data_table[i]) != NULL) unregister_ip_vs_proto_netns(ipvs, pd); } } int __init ip_vs_protocol_init(void) { char protocols[64] = { 0 }; #define REGISTER_PROTOCOL(p) \ do { \ register_ip_vs_protocol(p); \ strcat(protocols, ", "); \ strcat(protocols, (p)->name); \ } while (0) #ifdef CONFIG_IP_VS_PROTO_TCP REGISTER_PROTOCOL(&ip_vs_protocol_tcp); #endif #ifdef CONFIG_IP_VS_PROTO_UDP REGISTER_PROTOCOL(&ip_vs_protocol_udp); #endif #ifdef CONFIG_IP_VS_PROTO_SCTP REGISTER_PROTOCOL(&ip_vs_protocol_sctp); #endif #ifdef CONFIG_IP_VS_PROTO_AH REGISTER_PROTOCOL(&ip_vs_protocol_ah); #endif #ifdef CONFIG_IP_VS_PROTO_ESP REGISTER_PROTOCOL(&ip_vs_protocol_esp); #endif pr_info("Registered protocols (%s)\n", &protocols[2]); return 0; } void ip_vs_protocol_cleanup(void) { struct ip_vs_protocol *pp; int i; /* unregister all the ipvs protocols */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pp = ip_vs_proto_table[i]) != NULL) unregister_ip_vs_protocol(pp); } }
4645 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM pagemap #if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGEMAP_H #include <linux/tracepoint.h> #include <linux/mm.h> #define PAGEMAP_MAPPED 0x0001u #define PAGEMAP_ANONYMOUS 0x0002u #define PAGEMAP_FILE 0x0004u #define PAGEMAP_SWAPCACHE 0x0008u #define PAGEMAP_SWAPBACKED 0x0010u #define PAGEMAP_MAPPEDDISK 0x0020u #define PAGEMAP_BUFFERS 0x0040u #define trace_pagemap_flags(folio) ( \ (folio_test_anon(folio) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ (folio_mapped(folio) ? PAGEMAP_MAPPED : 0) | \ (folio_test_swapcache(folio) ? PAGEMAP_SWAPCACHE : 0) | \ (folio_test_swapbacked(folio) ? PAGEMAP_SWAPBACKED : 0) | \ (folio_test_mappedtodisk(folio) ? PAGEMAP_MAPPEDDISK : 0) | \ (folio_test_private(folio) ? PAGEMAP_BUFFERS : 0) \ ) TRACE_EVENT(mm_lru_insertion, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) __field(enum lru_list, lru ) __field(unsigned long, flags ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); __entry->lru = folio_lru_list(folio); __entry->flags = trace_pagemap_flags(folio); ), /* Flag format is based on page-types.c formatting for pagemap */ TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", __entry->folio, __entry->pfn, __entry->lru, __entry->flags & PAGEMAP_MAPPED ? "M" : " ", __entry->flags & PAGEMAP_ANONYMOUS ? "a" : "f", __entry->flags & PAGEMAP_SWAPCACHE ? "s" : " ", __entry->flags & PAGEMAP_SWAPBACKED ? "b" : " ", __entry->flags & PAGEMAP_MAPPEDDISK ? "d" : " ", __entry->flags & PAGEMAP_BUFFERS ? "B" : " ") ); TRACE_EVENT(mm_lru_activate, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); ), TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn) ); #endif /* _TRACE_PAGEMAP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 // SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. ** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** ******************************************************************************* ******************************************************************************/ #include <linux/kernel.h> #include <linux/init.h> #include <linux/configfs.h> #include <linux/slab.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/dlmconstants.h> #include <net/ipv6.h> #include <net/sock.h> #include "config.h" #include "midcomms.h" #include "lowcomms.h" /* * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid (refers to <node>) * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight * /config/dlm/<cluster>/comms/<comm>/nodeid (refers to <comm>) * /config/dlm/<cluster>/comms/<comm>/local * /config/dlm/<cluster>/comms/<comm>/addr (write only) * /config/dlm/<cluster>/comms/<comm>/addr_list (read only) * The <cluster> level is useless, but I haven't figured out how to avoid it. */ static struct config_group *space_list; static struct config_group *comm_list; static struct dlm_comm *local_comm; static uint32_t dlm_comm_count; struct dlm_clusters; struct dlm_cluster; struct dlm_spaces; struct dlm_space; struct dlm_comms; struct dlm_comm; struct dlm_nodes; struct dlm_node; static struct config_group *make_cluster(struct config_group *, const char *); static void drop_cluster(struct config_group *, struct config_item *); static void release_cluster(struct config_item *); static struct config_group *make_space(struct config_group *, const char *); static void drop_space(struct config_group *, struct config_item *); static void release_space(struct config_item *); static struct config_item *make_comm(struct config_group *, const char *); static void drop_comm(struct config_group *, struct config_item *); static void release_comm(struct config_item *); static struct config_item *make_node(struct config_group *, const char *); static void drop_node(struct config_group *, struct config_item *); static void release_node(struct config_item *); static struct configfs_attribute *comm_attrs[]; static struct configfs_attribute *node_attrs[]; const struct rhashtable_params dlm_rhash_rsb_params = { .nelem_hint = 3, /* start small */ .key_len = DLM_RESNAME_MAXLEN, .key_offset = offsetof(struct dlm_rsb, res_name), .head_offset = offsetof(struct dlm_rsb, res_node), .automatic_shrinking = true, }; struct dlm_cluster { struct config_group group; struct dlm_spaces *sps; struct dlm_comms *cms; }; static struct dlm_cluster *config_item_to_cluster(struct config_item *i) { return i ? container_of(to_config_group(i), struct dlm_cluster, group) : NULL; } enum { CLUSTER_ATTR_TCP_PORT = 0, CLUSTER_ATTR_BUFFER_SIZE, CLUSTER_ATTR_RSBTBL_SIZE, CLUSTER_ATTR_RECOVER_TIMER, CLUSTER_ATTR_TOSS_SECS, CLUSTER_ATTR_SCAN_SECS, CLUSTER_ATTR_LOG_DEBUG, CLUSTER_ATTR_LOG_INFO, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_MARK, CLUSTER_ATTR_NEW_RSB_COUNT, CLUSTER_ATTR_RECOVER_CALLBACKS, CLUSTER_ATTR_CLUSTER_NAME, }; static ssize_t cluster_cluster_name_show(struct config_item *item, char *buf) { return sprintf(buf, "%s\n", dlm_config.ci_cluster_name); } static ssize_t cluster_cluster_name_store(struct config_item *item, const char *buf, size_t len) { strscpy(dlm_config.ci_cluster_name, buf, sizeof(dlm_config.ci_cluster_name)); return len; } CONFIGFS_ATTR(cluster_, cluster_name); static ssize_t cluster_tcp_port_show(struct config_item *item, char *buf) { return sprintf(buf, "%u\n", be16_to_cpu(dlm_config.ci_tcp_port)); } static int dlm_check_zero_and_dlm_running(unsigned int x) { if (!x) return -EINVAL; if (dlm_lowcomms_is_running()) return -EBUSY; return 0; } static ssize_t cluster_tcp_port_store(struct config_item *item, const char *buf, size_t len) { int rc; u16 x; if (!capable(CAP_SYS_ADMIN)) return -EPERM; rc = kstrtou16(buf, 0, &x); if (rc) return rc; rc = dlm_check_zero_and_dlm_running(x); if (rc) return rc; dlm_config.ci_tcp_port = cpu_to_be16(x); return len; } CONFIGFS_ATTR(cluster_, tcp_port); static ssize_t cluster_set(unsigned int *info_field, int (*check_cb)(unsigned int x), const char *buf, size_t len) { unsigned int x; int rc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; rc = kstrtouint(buf, 0, &x); if (rc) return rc; if (check_cb) { rc = check_cb(x); if (rc) return rc; } *info_field = x; return len; } #define CLUSTER_ATTR(name, check_cb) \ static ssize_t cluster_##name##_store(struct config_item *item, \ const char *buf, size_t len) \ { \ return cluster_set(&dlm_config.ci_##name, check_cb, buf, len); \ } \ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ { \ return snprintf(buf, PAGE_SIZE, "%u\n", dlm_config.ci_##name); \ } \ CONFIGFS_ATTR(cluster_, name); static int dlm_check_protocol_and_dlm_running(unsigned int x) { switch (x) { case 0: /* TCP */ break; case 1: /* SCTP */ break; default: return -EINVAL; } if (dlm_lowcomms_is_running()) return -EBUSY; return 0; } static int dlm_check_zero(unsigned int x) { if (!x) return -EINVAL; return 0; } static int dlm_check_buffer_size(unsigned int x) { if (x < DLM_MAX_SOCKET_BUFSIZE) return -EINVAL; return 0; } CLUSTER_ATTR(buffer_size, dlm_check_buffer_size); CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); CLUSTER_ATTR(recover_timer, dlm_check_zero); CLUSTER_ATTR(toss_secs, dlm_check_zero); CLUSTER_ATTR(scan_secs, dlm_check_zero); CLUSTER_ATTR(log_debug, NULL); CLUSTER_ATTR(log_info, NULL); CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); CLUSTER_ATTR(mark, NULL); CLUSTER_ATTR(new_rsb_count, NULL); CLUSTER_ATTR(recover_callbacks, NULL); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port, [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size, [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size, [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer, [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs, [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs, [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug, [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, [CLUSTER_ATTR_MARK] = &cluster_attr_mark, [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count, [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks, [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name, NULL, }; enum { COMM_ATTR_NODEID = 0, COMM_ATTR_LOCAL, COMM_ATTR_ADDR, COMM_ATTR_ADDR_LIST, COMM_ATTR_MARK, }; enum { NODE_ATTR_NODEID = 0, NODE_ATTR_WEIGHT, }; struct dlm_clusters { struct configfs_subsystem subsys; }; struct dlm_spaces { struct config_group ss_group; }; struct dlm_space { struct config_group group; struct list_head members; struct mutex members_lock; int members_count; struct dlm_nodes *nds; }; struct dlm_comms { struct config_group cs_group; }; struct dlm_comm { struct config_item item; int seq; int nodeid; int local; int addr_count; unsigned int mark; struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; }; struct dlm_nodes { struct config_group ns_group; }; struct dlm_node { struct config_item item; struct list_head list; /* space->members */ int nodeid; int weight; int new; int comm_seq; /* copy of cm->seq when nd->nodeid is set */ }; static struct configfs_group_operations clusters_ops = { .make_group = make_cluster, .drop_item = drop_cluster, }; static struct configfs_item_operations cluster_ops = { .release = release_cluster, }; static struct configfs_group_operations spaces_ops = { .make_group = make_space, .drop_item = drop_space, }; static struct configfs_item_operations space_ops = { .release = release_space, }; static struct configfs_group_operations comms_ops = { .make_item = make_comm, .drop_item = drop_comm, }; static struct configfs_item_operations comm_ops = { .release = release_comm, }; static struct configfs_group_operations nodes_ops = { .make_item = make_node, .drop_item = drop_node, }; static struct configfs_item_operations node_ops = { .release = release_node, }; static const struct config_item_type clusters_type = { .ct_group_ops = &clusters_ops, .ct_owner = THIS_MODULE, }; static const struct config_item_type cluster_type = { .ct_item_ops = &cluster_ops, .ct_attrs = cluster_attrs, .ct_owner = THIS_MODULE, }; static const struct config_item_type spaces_type = { .ct_group_ops = &spaces_ops, .ct_owner = THIS_MODULE, }; static const struct config_item_type space_type = { .ct_item_ops = &space_ops, .ct_owner = THIS_MODULE, }; static const struct config_item_type comms_type = { .ct_group_ops = &comms_ops, .ct_owner = THIS_MODULE, }; static const struct config_item_type comm_type = { .ct_item_ops = &comm_ops, .ct_attrs = comm_attrs, .ct_owner = THIS_MODULE, }; static const struct config_item_type nodes_type = { .ct_group_ops = &nodes_ops, .ct_owner = THIS_MODULE, }; static const struct config_item_type node_type = { .ct_item_ops = &node_ops, .ct_attrs = node_attrs, .ct_owner = THIS_MODULE, }; static struct dlm_space *config_item_to_space(struct config_item *i) { return i ? container_of(to_config_group(i), struct dlm_space, group) : NULL; } static struct dlm_comm *config_item_to_comm(struct config_item *i) { return i ? container_of(i, struct dlm_comm, item) : NULL; } static struct dlm_node *config_item_to_node(struct config_item *i) { return i ? container_of(i, struct dlm_node, item) : NULL; } static struct config_group *make_cluster(struct config_group *g, const char *name) { struct dlm_cluster *cl = NULL; struct dlm_spaces *sps = NULL; struct dlm_comms *cms = NULL; cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); if (!cl || !sps || !cms) goto fail; cl->sps = sps; cl->cms = cms; config_group_init_type_name(&cl->group, name, &cluster_type); config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); config_group_init_type_name(&cms->cs_group, "comms", &comms_type); configfs_add_default_group(&sps->ss_group, &cl->group); configfs_add_default_group(&cms->cs_group, &cl->group); space_list = &sps->ss_group; comm_list = &cms->cs_group; return &cl->group; fail: kfree(cl); kfree(sps); kfree(cms); return ERR_PTR(-ENOMEM); } static void drop_cluster(struct config_group *g, struct config_item *i) { struct dlm_cluster *cl = config_item_to_cluster(i); configfs_remove_default_groups(&cl->group); space_list = NULL; comm_list = NULL; config_item_put(i); } static void release_cluster(struct config_item *i) { struct dlm_cluster *cl = config_item_to_cluster(i); kfree(cl->sps); kfree(cl->cms); kfree(cl); } static struct config_group *make_space(struct config_group *g, const char *name) { struct dlm_space *sp = NULL; struct dlm_nodes *nds = NULL; sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS); nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS); if (!sp || !nds) goto fail; config_group_init_type_name(&sp->group, name, &space_type); config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type); configfs_add_default_group(&nds->ns_group, &sp->group); INIT_LIST_HEAD(&sp->members); mutex_init(&sp->members_lock); sp->members_count = 0; sp->nds = nds; return &sp->group; fail: kfree(sp); kfree(nds); return ERR_PTR(-ENOMEM); } static void drop_space(struct config_group *g, struct config_item *i) { struct dlm_space *sp = config_item_to_space(i); /* assert list_empty(&sp->members) */ configfs_remove_default_groups(&sp->group); config_item_put(i); } static void release_space(struct config_item *i) { struct dlm_space *sp = config_item_to_space(i); kfree(sp->nds); kfree(sp); } static struct config_item *make_comm(struct config_group *g, const char *name) { struct dlm_comm *cm; unsigned int nodeid; int rv; rv = kstrtouint(name, 0, &nodeid); if (rv) return ERR_PTR(rv); cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS); if (!cm) return ERR_PTR(-ENOMEM); config_item_init_type_name(&cm->item, name, &comm_type); cm->seq = dlm_comm_count++; if (!cm->seq) cm->seq = dlm_comm_count++; cm->nodeid = nodeid; cm->local = 0; cm->addr_count = 0; cm->mark = 0; return &cm->item; } static void drop_comm(struct config_group *g, struct config_item *i) { struct dlm_comm *cm = config_item_to_comm(i); if (local_comm == cm) local_comm = NULL; dlm_midcomms_close(cm->nodeid); while (cm->addr_count--) kfree(cm->addr[cm->addr_count]); config_item_put(i); } static void release_comm(struct config_item *i) { struct dlm_comm *cm = config_item_to_comm(i); kfree(cm); } static struct config_item *make_node(struct config_group *g, const char *name) { struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); unsigned int nodeid; struct dlm_node *nd; uint32_t seq = 0; int rv; rv = kstrtouint(name, 0, &nodeid); if (rv) return ERR_PTR(rv); nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS); if (!nd) return ERR_PTR(-ENOMEM); config_item_init_type_name(&nd->item, name, &node_type); nd->nodeid = nodeid; nd->weight = 1; /* default weight of 1 if none is set */ nd->new = 1; /* set to 0 once it's been read by dlm_nodeid_list() */ dlm_comm_seq(nodeid, &seq, true); nd->comm_seq = seq; mutex_lock(&sp->members_lock); list_add(&nd->list, &sp->members); sp->members_count++; mutex_unlock(&sp->members_lock); return &nd->item; } static void drop_node(struct config_group *g, struct config_item *i) { struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); struct dlm_node *nd = config_item_to_node(i); mutex_lock(&sp->members_lock); list_del(&nd->list); sp->members_count--; mutex_unlock(&sp->members_lock); config_item_put(i); } static void release_node(struct config_item *i) { struct dlm_node *nd = config_item_to_node(i); kfree(nd); } static struct dlm_clusters clusters_root = { .subsys = { .su_group = { .cg_item = { .ci_namebuf = "dlm", .ci_type = &clusters_type, }, }, }, }; int __init dlm_config_init(void) { config_group_init(&clusters_root.subsys.su_group); mutex_init(&clusters_root.subsys.su_mutex); return configfs_register_subsystem(&clusters_root.subsys); } void dlm_config_exit(void) { configfs_unregister_subsystem(&clusters_root.subsys); } /* * Functions for user space to read/write attributes */ static ssize_t comm_nodeid_show(struct config_item *item, char *buf) { unsigned int nodeid; int rv; rv = kstrtouint(config_item_name(item), 0, &nodeid); if (WARN_ON(rv)) return rv; return sprintf(buf, "%u\n", nodeid); } static ssize_t comm_nodeid_store(struct config_item *item, const char *buf, size_t len) { return len; } static ssize_t comm_local_show(struct config_item *item, char *buf) { return sprintf(buf, "%d\n", config_item_to_comm(item)->local); } static ssize_t comm_local_store(struct config_item *item, const char *buf, size_t len) { struct dlm_comm *cm = config_item_to_comm(item); int rc = kstrtoint(buf, 0, &cm->local); if (rc) return rc; if (cm->local && !local_comm) local_comm = cm; return len; } static ssize_t comm_addr_store(struct config_item *item, const char *buf, size_t len) { struct dlm_comm *cm = config_item_to_comm(item); struct sockaddr_storage *addr; int rv; if (len != sizeof(struct sockaddr_storage)) return -EINVAL; if (cm->addr_count >= DLM_MAX_ADDR_COUNT) return -ENOSPC; addr = kzalloc(sizeof(*addr), GFP_NOFS); if (!addr) return -ENOMEM; memcpy(addr, buf, len); rv = dlm_midcomms_addr(cm->nodeid, addr); if (rv) { kfree(addr); return rv; } cm->addr[cm->addr_count++] = addr; return len; } static ssize_t comm_addr_list_show(struct config_item *item, char *buf) { struct dlm_comm *cm = config_item_to_comm(item); ssize_t s; ssize_t allowance; int i; struct sockaddr_storage *addr; struct sockaddr_in *addr_in; struct sockaddr_in6 *addr_in6; /* Taken from ip6_addr_string() defined in lib/vsprintf.c */ char buf0[sizeof("AF_INET6 xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255\n")]; /* Derived from SIMPLE_ATTR_SIZE of fs/configfs/file.c */ allowance = 4096; buf[0] = '\0'; for (i = 0; i < cm->addr_count; i++) { addr = cm->addr[i]; switch(addr->ss_family) { case AF_INET: addr_in = (struct sockaddr_in *)addr; s = sprintf(buf0, "AF_INET %pI4\n", &addr_in->sin_addr.s_addr); break; case AF_INET6: addr_in6 = (struct sockaddr_in6 *)addr; s = sprintf(buf0, "AF_INET6 %pI6\n", &addr_in6->sin6_addr); break; default: s = sprintf(buf0, "%s\n", "<UNKNOWN>"); break; } allowance -= s; if (allowance >= 0) strcat(buf, buf0); else { allowance += s; break; } } return 4096 - allowance; } static ssize_t comm_mark_show(struct config_item *item, char *buf) { return sprintf(buf, "%u\n", config_item_to_comm(item)->mark); } static ssize_t comm_mark_store(struct config_item *item, const char *buf, size_t len) { struct dlm_comm *comm; unsigned int mark; int rc; rc = kstrtouint(buf, 0, &mark); if (rc) return rc; if (mark == 0) mark = dlm_config.ci_mark; comm = config_item_to_comm(item); rc = dlm_lowcomms_nodes_set_mark(comm->nodeid, mark); if (rc) return rc; comm->mark = mark; return len; } CONFIGFS_ATTR(comm_, nodeid); CONFIGFS_ATTR(comm_, local); CONFIGFS_ATTR(comm_, mark); CONFIGFS_ATTR_WO(comm_, addr); CONFIGFS_ATTR_RO(comm_, addr_list); static struct configfs_attribute *comm_attrs[] = { [COMM_ATTR_NODEID] = &comm_attr_nodeid, [COMM_ATTR_LOCAL] = &comm_attr_local, [COMM_ATTR_ADDR] = &comm_attr_addr, [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list, [COMM_ATTR_MARK] = &comm_attr_mark, NULL, }; static ssize_t node_nodeid_show(struct config_item *item, char *buf) { unsigned int nodeid; int rv; rv = kstrtouint(config_item_name(item), 0, &nodeid); if (WARN_ON(rv)) return rv; return sprintf(buf, "%u\n", nodeid); } static ssize_t node_nodeid_store(struct config_item *item, const char *buf, size_t len) { return len; } static ssize_t node_weight_show(struct config_item *item, char *buf) { return sprintf(buf, "%d\n", config_item_to_node(item)->weight); } static ssize_t node_weight_store(struct config_item *item, const char *buf, size_t len) { int rc = kstrtoint(buf, 0, &config_item_to_node(item)->weight); if (rc) return rc; return len; } CONFIGFS_ATTR(node_, nodeid); CONFIGFS_ATTR(node_, weight); static struct configfs_attribute *node_attrs[] = { [NODE_ATTR_NODEID] = &node_attr_nodeid, [NODE_ATTR_WEIGHT] = &node_attr_weight, NULL, }; /* * Functions for the dlm to get the info that's been configured */ static struct dlm_space *get_space(char *name) { struct config_item *i; if (!space_list) return NULL; mutex_lock(&space_list->cg_subsys->su_mutex); i = config_group_find_item(space_list, name); mutex_unlock(&space_list->cg_subsys->su_mutex); return config_item_to_space(i); } static void put_space(struct dlm_space *sp) { config_item_put(&sp->group.cg_item); } static struct dlm_comm *get_comm(int nodeid) { struct config_item *i; struct dlm_comm *cm = NULL; int found = 0; if (!comm_list) return NULL; WARN_ON_ONCE(!mutex_is_locked(&clusters_root.subsys.su_mutex)); list_for_each_entry(i, &comm_list->cg_children, ci_entry) { cm = config_item_to_comm(i); if (cm->nodeid != nodeid) continue; found = 1; config_item_get(i); break; } if (!found) cm = NULL; return cm; } static void put_comm(struct dlm_comm *cm) { config_item_put(&cm->item); } /* caller must free mem */ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out) { struct dlm_space *sp; struct dlm_node *nd; struct dlm_config_node *nodes, *node; int rv, count; sp = get_space(lsname); if (!sp) return -EEXIST; mutex_lock(&sp->members_lock); if (!sp->members_count) { rv = -EINVAL; printk(KERN_ERR "dlm: zero members_count\n"); goto out; } count = sp->members_count; nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS); if (!nodes) { rv = -ENOMEM; goto out; } node = nodes; list_for_each_entry(nd, &sp->members, list) { node->nodeid = nd->nodeid; node->weight = nd->weight; node->new = nd->new; node->comm_seq = nd->comm_seq; node++; nd->new = 0; } *count_out = count; *nodes_out = nodes; rv = 0; out: mutex_unlock(&sp->members_lock); put_space(sp); return rv; } int dlm_comm_seq(int nodeid, uint32_t *seq, bool locked) { struct dlm_comm *cm; if (locked) { cm = get_comm(nodeid); } else { mutex_lock(&clusters_root.subsys.su_mutex); cm = get_comm(nodeid); mutex_unlock(&clusters_root.subsys.su_mutex); } if (!cm) return -EEXIST; *seq = cm->seq; put_comm(cm); return 0; } int dlm_our_nodeid(void) { return local_comm->nodeid; } /* num 0 is first addr, num 1 is second addr */ int dlm_our_addr(struct sockaddr_storage *addr, int num) { if (!local_comm) return -1; if (num + 1 > local_comm->addr_count) return -1; memcpy(addr, local_comm->addr[num], sizeof(*addr)); return 0; } /* Config file defaults */ #define DEFAULT_TCP_PORT 21064 #define DEFAULT_RSBTBL_SIZE 1024 #define DEFAULT_RECOVER_TIMER 5 #define DEFAULT_TOSS_SECS 10 #define DEFAULT_SCAN_SECS 5 #define DEFAULT_LOG_DEBUG 0 #define DEFAULT_LOG_INFO 1 #define DEFAULT_PROTOCOL DLM_PROTO_TCP #define DEFAULT_MARK 0 #define DEFAULT_NEW_RSB_COUNT 128 #define DEFAULT_RECOVER_CALLBACKS 0 #define DEFAULT_CLUSTER_NAME "" struct dlm_config_info dlm_config = { .ci_tcp_port = cpu_to_be16(DEFAULT_TCP_PORT), .ci_buffer_size = DLM_MAX_SOCKET_BUFSIZE, .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, .ci_recover_timer = DEFAULT_RECOVER_TIMER, .ci_toss_secs = DEFAULT_TOSS_SECS, .ci_scan_secs = DEFAULT_SCAN_SECS, .ci_log_debug = DEFAULT_LOG_DEBUG, .ci_log_info = DEFAULT_LOG_INFO, .ci_protocol = DEFAULT_PROTOCOL, .ci_mark = DEFAULT_MARK, .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, .ci_cluster_name = DEFAULT_CLUSTER_NAME };
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 // SPDX-License-Identifier: GPL-2.0-or-later /* * ToupTek UCMOS / AmScope MU series camera driver * TODO: contrast with ScopeTek / AmScope MDC cameras * * Copyright (C) 2012-2014 John McMaster <JohnDMcMaster@gmail.com> * * Special thanks to Bushing for helping with the decrypt algorithm and * Sean O'Sullivan / the Rensselaer Center for Open Source * Software (RCOS) for helping me learn kernel development */ #include "gspca.h" #define MODULE_NAME "touptek" MODULE_AUTHOR("John McMaster"); MODULE_DESCRIPTION("ToupTek UCMOS / Amscope MU microscope camera driver"); MODULE_LICENSE("GPL"); /* * Exposure reg is linear with exposure time * Exposure (sec), E (reg) * 0.000400, 0x0002 * 0.001000, 0x0005 * 0.005000, 0x0019 * 0.020000, 0x0064 * 0.080000, 0x0190 * 0.400000, 0x07D0 * 1.000000, 0x1388 * 2.000000, 0x2710 * * Three gain stages * 0x1000: master channel enable bit * 0x007F: low gain bits * 0x0080: medium gain bit * 0x0100: high gain bit * gain = enable * (1 + regH) * (1 + regM) * z * regL * * Gain implementation * Want to do something similar to mt9v011.c's set_balance * * Gain does not vary with resolution (checked 640x480 vs 1600x1200) * * Constant derivation: * * Raw data: * Gain, GTOP, B, R, GBOT * 1.00, 0x105C, 0x1068, 0x10C8, 0x105C * 1.20, 0x106E, 0x107E, 0x10D6, 0x106E * 1.40, 0x10C0, 0x10CA, 0x10E5, 0x10C0 * 1.60, 0x10C9, 0x10D4, 0x10F3, 0x10C9 * 1.80, 0x10D2, 0x10DE, 0x11C1, 0x10D2 * 2.00, 0x10DC, 0x10E9, 0x11C8, 0x10DC * 2.20, 0x10E5, 0x10F3, 0x11CF, 0x10E5 * 2.40, 0x10EE, 0x10FE, 0x11D7, 0x10EE * 2.60, 0x10F7, 0x11C4, 0x11DE, 0x10F7 * 2.80, 0x11C0, 0x11CA, 0x11E5, 0x11C0 * 3.00, 0x11C5, 0x11CF, 0x11ED, 0x11C5 * * zR = 0.0069605943152454778 * about 3/431 = 0.0069605568445475635 * zB = 0.0095695970695970703 * about 6/627 = 0.0095693779904306216 * zG = 0.010889328063241107 * about 6/551 = 0.010889292196007259 * about 10 bits for constant + 7 bits for value => at least 17 bit * intermediate with 32 bit ints should be fine for overflow etc * Essentially gains are in range 0-0x001FF * * However, V4L expects a main gain channel + R and B balance * To keep things simple for now saturate the values of balance is too high/low * This isn't really ideal but easy way to fit the Linux model * * Converted using gain model turns out to be quite linear: * Gain, GTOP, B, R, GBOT * 1.00, 92, 104, 144, 92 * 1.20, 110, 126, 172, 110 * 1.40, 128, 148, 202, 128 * 1.60, 146, 168, 230, 146 * 1.80, 164, 188, 260, 164 * 2.00, 184, 210, 288, 184 * 2.20, 202, 230, 316, 202 * 2.40, 220, 252, 348, 220 * 2.60, 238, 272, 376, 238 * 2.80, 256, 296, 404, 256 * 3.00, 276, 316, 436, 276 * * Maximum gain is 0x7FF * 2 * 2 => 0x1FFC (8188) * or about 13 effective bits of gain * The highest the commercial driver goes in my setup 436 * However, because could *maybe* damage circuits * limit the gain until have a reason to go higher * Solution: gain clipped and warning emitted */ #define GAIN_MAX 511 /* Frame sync is a short read */ #define BULK_SIZE 0x4000 /* MT9E001 reg names to give a rough approximation */ #define REG_COARSE_INTEGRATION_TIME_ 0x3012 #define REG_GROUPED_PARAMETER_HOLD_ 0x3022 #define REG_MODE_SELECT 0x0100 #define REG_OP_SYS_CLK_DIV 0x030A #define REG_VT_SYS_CLK_DIV 0x0302 #define REG_PRE_PLL_CLK_DIV 0x0304 #define REG_VT_PIX_CLK_DIV 0x0300 #define REG_OP_PIX_CLK_DIV 0x0308 #define REG_PLL_MULTIPLIER 0x0306 #define REG_COARSE_INTEGRATION_TIME_ 0x3012 #define REG_FRAME_LENGTH_LINES 0x0340 #define REG_FRAME_LENGTH_LINES_ 0x300A #define REG_GREEN1_GAIN 0x3056 #define REG_GREEN2_GAIN 0x305C #define REG_GROUPED_PARAMETER_HOLD 0x0104 #define REG_LINE_LENGTH_PCK_ 0x300C #define REG_MODE_SELECT 0x0100 #define REG_PLL_MULTIPLIER 0x0306 #define REG_READ_MODE 0x3040 #define REG_BLUE_GAIN 0x3058 #define REG_RED_GAIN 0x305A #define REG_RESET_REGISTER 0x301A #define REG_SCALE_M 0x0404 #define REG_SCALING_MODE 0x0400 #define REG_SOFTWARE_RESET 0x0103 #define REG_X_ADDR_END 0x0348 #define REG_X_ADDR_START 0x0344 #define REG_X_ADDR_START 0x0344 #define REG_X_OUTPUT_SIZE 0x034C #define REG_Y_ADDR_END 0x034A #define REG_Y_ADDR_START 0x0346 #define REG_Y_OUTPUT_SIZE 0x034E /* specific webcam descriptor */ struct sd { struct gspca_dev gspca_dev; /* !! must be the first item */ /* How many bytes this frame */ unsigned int this_f; /* Device has separate gains for each Bayer quadrant V4L supports master gain which is referenced to G1/G2 and supplies individual balance controls for R/B */ struct v4l2_ctrl *blue; struct v4l2_ctrl *red; }; /* Used to simplify reg write error handling */ struct cmd { u16 value; u16 index; }; static const struct v4l2_pix_format vga_mode[] = { {800, 600, V4L2_PIX_FMT_SGRBG8, V4L2_FIELD_NONE, .bytesperline = 800, .sizeimage = 800 * 600, .colorspace = V4L2_COLORSPACE_SRGB}, {1600, 1200, V4L2_PIX_FMT_SGRBG8, V4L2_FIELD_NONE, .bytesperline = 1600, .sizeimage = 1600 * 1200, .colorspace = V4L2_COLORSPACE_SRGB}, {3264, 2448, V4L2_PIX_FMT_SGRBG8, V4L2_FIELD_NONE, .bytesperline = 3264, .sizeimage = 3264 * 2448, .colorspace = V4L2_COLORSPACE_SRGB}, }; /* * As there's no known frame sync, the only way to keep synced is to try hard * to never miss any packets */ #if MAX_NURBS < 4 #error "Not enough URBs in the gspca table" #endif static int val_reply(struct gspca_dev *gspca_dev, const char *reply, int rc) { if (rc < 0) { gspca_err(gspca_dev, "reply has error %d\n", rc); return -EIO; } if (rc != 1) { gspca_err(gspca_dev, "Bad reply size %d\n", rc); return -EIO; } if (reply[0] != 0x08) { gspca_err(gspca_dev, "Bad reply 0x%02x\n", (int)reply[0]); return -EIO; } return 0; } static void reg_w(struct gspca_dev *gspca_dev, u16 value, u16 index) { char *buff = gspca_dev->usb_buf; int rc; gspca_dbg(gspca_dev, D_USBO, "reg_w bReq=0x0B, bReqT=0xC0, wVal=0x%04X, wInd=0x%04X\n\n", value, index); rc = usb_control_msg(gspca_dev->dev, usb_rcvctrlpipe(gspca_dev->dev, 0), 0x0B, 0xC0, value, index, buff, 1, 500); gspca_dbg(gspca_dev, D_USBO, "rc=%d, ret={0x%02x}\n", rc, (int)buff[0]); if (rc < 0) { gspca_err(gspca_dev, "Failed reg_w(0x0B, 0xC0, 0x%04X, 0x%04X) w/ rc %d\n", value, index, rc); gspca_dev->usb_err = rc; return; } if (val_reply(gspca_dev, buff, rc)) { gspca_err(gspca_dev, "Bad reply to reg_w(0x0B, 0xC0, 0x%04X, 0x%04X\n", value, index); gspca_dev->usb_err = -EIO; } } static void reg_w_buf(struct gspca_dev *gspca_dev, const struct cmd *p, int l) { do { reg_w(gspca_dev, p->value, p->index); p++; } while (--l > 0); } static void setexposure(struct gspca_dev *gspca_dev, s32 val) { u16 value; unsigned int w = gspca_dev->pixfmt.width; if (w == 800) value = val * 5; else if (w == 1600) value = val * 3; else if (w == 3264) value = val * 3 / 2; else { gspca_err(gspca_dev, "Invalid width %u\n", w); gspca_dev->usb_err = -EINVAL; return; } gspca_dbg(gspca_dev, D_STREAM, "exposure: 0x%04X ms\n\n", value); /* Wonder if there's a good reason for sending it twice */ /* probably not but leave it in because...why not */ reg_w(gspca_dev, value, REG_COARSE_INTEGRATION_TIME_); reg_w(gspca_dev, value, REG_COARSE_INTEGRATION_TIME_); } static int gainify(int in) { /* * TODO: check if there are any issues with corner cases * 0x000 (0):0x07F (127): regL * 0x080 (128) - 0x0FF (255): regM, regL * 0x100 (256) - max: regH, regM, regL */ if (in <= 0x7F) return 0x1000 | in; else if (in <= 0xFF) return 0x1080 | in / 2; else return 0x1180 | in / 4; } static void setggain(struct gspca_dev *gspca_dev, u16 global_gain) { u16 normalized; normalized = gainify(global_gain); gspca_dbg(gspca_dev, D_STREAM, "gain G1/G2 (0x%04X): 0x%04X (src 0x%04X)\n\n", REG_GREEN1_GAIN, normalized, global_gain); reg_w(gspca_dev, normalized, REG_GREEN1_GAIN); reg_w(gspca_dev, normalized, REG_GREEN2_GAIN); } static void setbgain(struct gspca_dev *gspca_dev, u16 gain, u16 global_gain) { u16 normalized; normalized = global_gain + ((u32)global_gain) * gain / GAIN_MAX; if (normalized > GAIN_MAX) { gspca_dbg(gspca_dev, D_STREAM, "Truncating blue 0x%04X w/ value 0x%04X\n\n", GAIN_MAX, normalized); normalized = GAIN_MAX; } normalized = gainify(normalized); gspca_dbg(gspca_dev, D_STREAM, "gain B (0x%04X): 0x%04X w/ source 0x%04X\n\n", REG_BLUE_GAIN, normalized, gain); reg_w(gspca_dev, normalized, REG_BLUE_GAIN); } static void setrgain(struct gspca_dev *gspca_dev, u16 gain, u16 global_gain) { u16 normalized; normalized = global_gain + ((u32)global_gain) * gain / GAIN_MAX; if (normalized > GAIN_MAX) { gspca_dbg(gspca_dev, D_STREAM, "Truncating gain 0x%04X w/ value 0x%04X\n\n", GAIN_MAX, normalized); normalized = GAIN_MAX; } normalized = gainify(normalized); gspca_dbg(gspca_dev, D_STREAM, "gain R (0x%04X): 0x%04X w / source 0x%04X\n\n", REG_RED_GAIN, normalized, gain); reg_w(gspca_dev, normalized, REG_RED_GAIN); } static void configure_wh(struct gspca_dev *gspca_dev) { unsigned int w = gspca_dev->pixfmt.width; gspca_dbg(gspca_dev, D_STREAM, "configure_wh\n\n"); if (w == 800) { static const struct cmd reg_init_res[] = { {0x0060, REG_X_ADDR_START}, {0x0CD9, REG_X_ADDR_END}, {0x0036, REG_Y_ADDR_START}, {0x098F, REG_Y_ADDR_END}, {0x07C7, REG_READ_MODE}, }; reg_w_buf(gspca_dev, reg_init_res, ARRAY_SIZE(reg_init_res)); } else if (w == 1600) { static const struct cmd reg_init_res[] = { {0x009C, REG_X_ADDR_START}, {0x0D19, REG_X_ADDR_END}, {0x0068, REG_Y_ADDR_START}, {0x09C5, REG_Y_ADDR_END}, {0x06C3, REG_READ_MODE}, }; reg_w_buf(gspca_dev, reg_init_res, ARRAY_SIZE(reg_init_res)); } else if (w == 3264) { static const struct cmd reg_init_res[] = { {0x00E8, REG_X_ADDR_START}, {0x0DA7, REG_X_ADDR_END}, {0x009E, REG_Y_ADDR_START}, {0x0A2D, REG_Y_ADDR_END}, {0x0241, REG_READ_MODE}, }; reg_w_buf(gspca_dev, reg_init_res, ARRAY_SIZE(reg_init_res)); } else { gspca_err(gspca_dev, "bad width %u\n", w); gspca_dev->usb_err = -EINVAL; return; } reg_w(gspca_dev, 0x0000, REG_SCALING_MODE); reg_w(gspca_dev, 0x0010, REG_SCALE_M); reg_w(gspca_dev, w, REG_X_OUTPUT_SIZE); reg_w(gspca_dev, gspca_dev->pixfmt.height, REG_Y_OUTPUT_SIZE); if (w == 800) { reg_w(gspca_dev, 0x0384, REG_FRAME_LENGTH_LINES_); reg_w(gspca_dev, 0x0960, REG_LINE_LENGTH_PCK_); } else if (w == 1600) { reg_w(gspca_dev, 0x0640, REG_FRAME_LENGTH_LINES_); reg_w(gspca_dev, 0x0FA0, REG_LINE_LENGTH_PCK_); } else if (w == 3264) { reg_w(gspca_dev, 0x0B4B, REG_FRAME_LENGTH_LINES_); reg_w(gspca_dev, 0x1F40, REG_LINE_LENGTH_PCK_); } else { gspca_err(gspca_dev, "bad width %u\n", w); gspca_dev->usb_err = -EINVAL; return; } } /* Packets that were encrypted, no idea if the grouping is significant */ static void configure_encrypted(struct gspca_dev *gspca_dev) { static const struct cmd reg_init_begin[] = { {0x0100, REG_SOFTWARE_RESET}, {0x0000, REG_MODE_SELECT}, {0x0100, REG_GROUPED_PARAMETER_HOLD}, {0x0004, REG_VT_PIX_CLK_DIV}, {0x0001, REG_VT_SYS_CLK_DIV}, {0x0008, REG_OP_PIX_CLK_DIV}, {0x0001, REG_OP_SYS_CLK_DIV}, {0x0004, REG_PRE_PLL_CLK_DIV}, {0x0040, REG_PLL_MULTIPLIER}, {0x0000, REG_GROUPED_PARAMETER_HOLD}, {0x0100, REG_GROUPED_PARAMETER_HOLD}, }; static const struct cmd reg_init_end[] = { {0x0000, REG_GROUPED_PARAMETER_HOLD}, {0x0301, 0x31AE}, {0x0805, 0x3064}, {0x0071, 0x3170}, {0x10DE, REG_RESET_REGISTER}, {0x0000, REG_MODE_SELECT}, {0x0010, REG_PLL_MULTIPLIER}, {0x0100, REG_MODE_SELECT}, }; gspca_dbg(gspca_dev, D_STREAM, "Encrypted begin, w = %u\n\n", gspca_dev->pixfmt.width); reg_w_buf(gspca_dev, reg_init_begin, ARRAY_SIZE(reg_init_begin)); configure_wh(gspca_dev); reg_w_buf(gspca_dev, reg_init_end, ARRAY_SIZE(reg_init_end)); reg_w(gspca_dev, 0x0100, REG_GROUPED_PARAMETER_HOLD); reg_w(gspca_dev, 0x0000, REG_GROUPED_PARAMETER_HOLD); gspca_dbg(gspca_dev, D_STREAM, "Encrypted end\n\n"); } static int configure(struct gspca_dev *gspca_dev) { int rc; char *buff = gspca_dev->usb_buf; gspca_dbg(gspca_dev, D_STREAM, "configure()\n\n"); /* * First driver sets a sort of encryption key * A number of futur requests of this type have wValue and wIndex * encrypted as follows: * -Compute key = this wValue rotate left by 4 bits * (decrypt.py rotates right because we are decrypting) * -Later packets encrypt packets by XOR'ing with key * XOR encrypt/decrypt is symmetrical * wValue, and wIndex are encrypted * bRequest is not and bRequestType is always 0xC0 * This allows resyncing if key is unknown? * By setting 0 we XOR with 0 and the shifting and XOR drops out */ rc = usb_control_msg(gspca_dev->dev, usb_rcvctrlpipe(gspca_dev->dev, 0), 0x16, 0xC0, 0x0000, 0x0000, buff, 2, 500); if (val_reply(gspca_dev, buff, rc)) { gspca_err(gspca_dev, "failed key req\n"); return -EIO; } /* * Next does some sort of 2 packet challenge / response * evidence suggests its an Atmel I2C crypto part but nobody cares to * look * (to make sure its not cloned hardware?) * Ignore: I want to work with their hardware, not clone it * 16 bytes out challenge, requestType: 0x40 * 16 bytes in response, requestType: 0xC0 */ rc = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0x01, 0x40, 0x0001, 0x000F, NULL, 0, 500); if (rc < 0) { gspca_err(gspca_dev, "failed to replay packet 176 w/ rc %d\n", rc); return rc; } rc = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0x01, 0x40, 0x0000, 0x000F, NULL, 0, 500); if (rc < 0) { gspca_err(gspca_dev, "failed to replay packet 178 w/ rc %d\n", rc); return rc; } rc = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0x01, 0x40, 0x0001, 0x000F, NULL, 0, 500); if (rc < 0) { gspca_err(gspca_dev, "failed to replay packet 180 w/ rc %d\n", rc); return rc; } /* * Serial number? Doesn't seem to be required * cam1: \xE6\x0D\x00\x00, cam2: \x70\x19\x00\x00 * rc = usb_control_msg(gspca_dev->dev, * usb_rcvctrlpipe(gspca_dev->dev, 0), * 0x20, 0xC0, 0x0000, 0x0000, buff, 4, 500); */ /* Large (EEPROM?) read, skip it since no idea what to do with it */ gspca_dev->usb_err = 0; configure_encrypted(gspca_dev); if (gspca_dev->usb_err) return gspca_dev->usb_err; /* Omitted this by accident, does not work without it */ rc = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0x01, 0x40, 0x0003, 0x000F, NULL, 0, 500); if (rc < 0) { gspca_err(gspca_dev, "failed to replay final packet w/ rc %d\n", rc); return rc; } gspca_dbg(gspca_dev, D_STREAM, "Configure complete\n\n"); return 0; } static int sd_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { gspca_dev->cam.cam_mode = vga_mode; gspca_dev->cam.nmodes = ARRAY_SIZE(vga_mode); /* Yes we want URBs and we want them now! */ gspca_dev->cam.no_urb_create = 0; gspca_dev->cam.bulk_nurbs = 4; /* Largest size the windows driver uses */ gspca_dev->cam.bulk_size = BULK_SIZE; /* Def need to use bulk transfers */ gspca_dev->cam.bulk = 1; return 0; } static int sd_start(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; int rc; sd->this_f = 0; rc = configure(gspca_dev); if (rc < 0) { gspca_err(gspca_dev, "Failed configure\n"); return rc; } /* First two frames have messed up gains Drop them to avoid special cases in user apps? */ return 0; } static void sd_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* isoc packet */ int len) /* iso packet length */ { struct sd *sd = (struct sd *) gspca_dev; if (len != BULK_SIZE) { /* can we finish a frame? */ if (sd->this_f + len == gspca_dev->pixfmt.sizeimage) { gspca_frame_add(gspca_dev, LAST_PACKET, data, len); gspca_dbg(gspca_dev, D_FRAM, "finish frame sz %u/%u w/ len %u\n\n", sd->this_f, gspca_dev->pixfmt.sizeimage, len); /* lost some data, discard the frame */ } else { gspca_frame_add(gspca_dev, DISCARD_PACKET, NULL, 0); gspca_dbg(gspca_dev, D_FRAM, "abort frame sz %u/%u w/ len %u\n\n", sd->this_f, gspca_dev->pixfmt.sizeimage, len); } sd->this_f = 0; } else { if (sd->this_f == 0) gspca_frame_add(gspca_dev, FIRST_PACKET, data, len); else gspca_frame_add(gspca_dev, INTER_PACKET, data, len); sd->this_f += len; } } static int sd_init(struct gspca_dev *gspca_dev) { return 0; } static int sd_s_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); struct sd *sd = (struct sd *) gspca_dev; gspca_dev->usb_err = 0; if (!gspca_dev->streaming) return 0; switch (ctrl->id) { case V4L2_CID_EXPOSURE: setexposure(gspca_dev, ctrl->val); break; case V4L2_CID_GAIN: /* gspca_dev->gain automatically updated */ setggain(gspca_dev, gspca_dev->gain->val); break; case V4L2_CID_BLUE_BALANCE: sd->blue->val = ctrl->val; setbgain(gspca_dev, sd->blue->val, gspca_dev->gain->val); break; case V4L2_CID_RED_BALANCE: sd->red->val = ctrl->val; setrgain(gspca_dev, sd->red->val, gspca_dev->gain->val); break; } return gspca_dev->usb_err; } static const struct v4l2_ctrl_ops sd_ctrl_ops = { .s_ctrl = sd_s_ctrl, }; static int sd_init_controls(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; struct v4l2_ctrl_handler *hdl = &gspca_dev->ctrl_handler; gspca_dev->vdev.ctrl_handler = hdl; v4l2_ctrl_handler_init(hdl, 4); gspca_dev->exposure = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, /* Mostly limited by URB timeouts */ /* XXX: make dynamic based on frame rate? */ V4L2_CID_EXPOSURE, 0, 800, 1, 350); gspca_dev->gain = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_GAIN, 0, 511, 1, 128); sd->blue = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BLUE_BALANCE, 0, 1023, 1, 80); sd->red = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_RED_BALANCE, 0, 1023, 1, 295); if (hdl->error) { gspca_err(gspca_dev, "Could not initialize controls\n"); return hdl->error; } return 0; } /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, .config = sd_config, .init = sd_init, .init_controls = sd_init_controls, .start = sd_start, .pkt_scan = sd_pkt_scan, }; /* Table of supported USB devices */ static const struct usb_device_id device_table[] = { /* Commented out devices should be related */ /* AS: AmScope, TT: ToupTek */ /* { USB_DEVICE(0x0547, 0x6035) }, TT UCMOS00350KPA */ /* { USB_DEVICE(0x0547, 0x6130) }, TT UCMOS01300KPA */ /* { USB_DEVICE(0x0547, 0x6200) }, TT UCMOS02000KPA */ /* { USB_DEVICE(0x0547, 0x6310) }, TT UCMOS03100KPA */ /* { USB_DEVICE(0x0547, 0x6510) }, TT UCMOS05100KPA */ /* { USB_DEVICE(0x0547, 0x6800) }, TT UCMOS08000KPA */ /* { USB_DEVICE(0x0547, 0x6801) }, TT UCMOS08000KPB */ { USB_DEVICE(0x0547, 0x6801) }, /* TT UCMOS08000KPB, AS MU800 */ /* { USB_DEVICE(0x0547, 0x6900) }, TT UCMOS09000KPA */ /* { USB_DEVICE(0x0547, 0x6901) }, TT UCMOS09000KPB */ /* { USB_DEVICE(0x0547, 0x6010) }, TT UCMOS10000KPA */ /* { USB_DEVICE(0x0547, 0x6014) }, TT UCMOS14000KPA */ /* { USB_DEVICE(0x0547, 0x6131) }, TT UCMOS01300KMA */ /* { USB_DEVICE(0x0547, 0x6511) }, TT UCMOS05100KMA */ /* { USB_DEVICE(0x0547, 0x8080) }, TT UHCCD00800KPA */ /* { USB_DEVICE(0x0547, 0x8140) }, TT UHCCD01400KPA */ /* { USB_DEVICE(0x0547, 0x8141) }, TT EXCCD01400KPA */ /* { USB_DEVICE(0x0547, 0x8200) }, TT UHCCD02000KPA */ /* { USB_DEVICE(0x0547, 0x8201) }, TT UHCCD02000KPB */ /* { USB_DEVICE(0x0547, 0x8310) }, TT UHCCD03100KPA */ /* { USB_DEVICE(0x0547, 0x8500) }, TT UHCCD05000KPA */ /* { USB_DEVICE(0x0547, 0x8510) }, TT UHCCD05100KPA */ /* { USB_DEVICE(0x0547, 0x8600) }, TT UHCCD06000KPA */ /* { USB_DEVICE(0x0547, 0x8800) }, TT UHCCD08000KPA */ /* { USB_DEVICE(0x0547, 0x8315) }, TT UHCCD03150KPA */ /* { USB_DEVICE(0x0547, 0x7800) }, TT UHCCD00800KMA */ /* { USB_DEVICE(0x0547, 0x7140) }, TT UHCCD01400KMA */ /* { USB_DEVICE(0x0547, 0x7141) }, TT UHCCD01400KMB */ /* { USB_DEVICE(0x0547, 0x7200) }, TT UHCCD02000KMA */ /* { USB_DEVICE(0x0547, 0x7315) }, TT UHCCD03150KMA */ { } }; MODULE_DEVICE_TABLE(usb, device_table); static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = gspca_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, #endif }; static int __init sd_mod_init(void) { int ret; ret = usb_register(&sd_driver); if (ret < 0) return ret; return 0; } static void __exit sd_mod_exit(void) { usb_deregister(&sd_driver); } module_init(sd_mod_init); module_exit(sd_mod_exit);
8 18 1 1 1 15 1 8 1 1 1 1 1 16 16 15 8 8 8 9 8 8 9 1 8 4 4 8 8 8 4 8 8 1 1 1 1 3 1 1 1 1 8 8 8 16 4 4 4 4 16 16 1 1 1 1 16 5 6 1 6 5 5 1 1 1 6 6 6 8 8 16 16 8 8 8 16 14 14 14 13 16 16 16 16 4 4 4 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/if_vlan.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/netdevice.h> #include <net/addrconf.h> #include <rdma/ib_cache.h> #include "core_priv.h" struct ib_pkey_cache { int table_len; u16 table[] __counted_by(table_len); }; struct ib_update_work { struct work_struct work; struct ib_event event; bool enforce_security; }; union ib_gid zgid; EXPORT_SYMBOL(zgid); enum gid_attr_find_mask { GID_ATTR_FIND_MASK_GID = 1UL << 0, GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, }; enum gid_table_entry_state { GID_TABLE_ENTRY_INVALID = 1, GID_TABLE_ENTRY_VALID = 2, /* * Indicates that entry is pending to be removed, there may * be active users of this GID entry. * When last user of the GID entry releases reference to it, * GID entry is detached from the table. */ GID_TABLE_ENTRY_PENDING_DEL = 3, }; struct roce_gid_ndev_storage { struct rcu_head rcu_head; struct net_device *ndev; }; struct ib_gid_table_entry { struct kref kref; struct work_struct del_work; struct ib_gid_attr attr; void *context; /* Store the ndev pointer to release reference later on in * call_rcu context because by that time gid_table_entry * and attr might be already freed. So keep a copy of it. * ndev_storage is freed by rcu callback. */ struct roce_gid_ndev_storage *ndev_storage; enum gid_table_entry_state state; }; struct ib_gid_table { int sz; /* In RoCE, adding a GID to the table requires: * (a) Find if this GID is already exists. * (b) Find a free space. * (c) Write the new GID * * Delete requires different set of operations: * (a) Find the GID * (b) Delete it. * **/ /* Any writer to data_vec must hold this lock and the write side of * rwlock. Readers must hold only rwlock. All writers must be in a * sleepable context. */ struct mutex lock; /* rwlock protects data_vec[ix]->state and entry pointer. */ rwlock_t rwlock; struct ib_gid_table_entry **data_vec; /* bit field, each bit indicates the index of default GID */ u32 default_gid_indices; }; static void dispatch_gid_change_event(struct ib_device *ib_dev, u32 port) { struct ib_event event; event.device = ib_dev; event.element.port_num = port; event.event = IB_EVENT_GID_CHANGE; ib_dispatch_event_clients(&event); } static const char * const gid_type_str[] = { /* IB/RoCE v1 value is set for IB_GID_TYPE_IB and IB_GID_TYPE_ROCE for * user space compatibility reasons. */ [IB_GID_TYPE_IB] = "IB/RoCE v1", [IB_GID_TYPE_ROCE] = "IB/RoCE v1", [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", }; const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) { if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) return gid_type_str[gid_type]; return "Invalid GID type"; } EXPORT_SYMBOL(ib_cache_gid_type_str); /** rdma_is_zero_gid - Check if given GID is zero or not. * @gid: GID to check * Returns true if given GID is zero, returns false otherwise. */ bool rdma_is_zero_gid(const union ib_gid *gid) { return !memcmp(gid, &zgid, sizeof(*gid)); } EXPORT_SYMBOL(rdma_is_zero_gid); /** is_gid_index_default - Check if a given index belongs to * reserved default GIDs or not. * @table: GID table pointer * @index: Index to check in GID table * Returns true if index is one of the reserved default GID index otherwise * returns false. */ static bool is_gid_index_default(const struct ib_gid_table *table, unsigned int index) { return index < 32 && (BIT(index) & table->default_gid_indices); } int ib_cache_gid_parse_type_str(const char *buf) { unsigned int i; size_t len; int err = -EINVAL; len = strlen(buf); if (len == 0) return -EINVAL; if (buf[len - 1] == '\n') len--; for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && len == strlen(gid_type_str[i])) { err = i; break; } return err; } EXPORT_SYMBOL(ib_cache_gid_parse_type_str); static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u32 port) { return device->port_data[port].cache.gid; } static bool is_gid_entry_free(const struct ib_gid_table_entry *entry) { return !entry; } static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry) { return entry && entry->state == GID_TABLE_ENTRY_VALID; } static void schedule_free_gid(struct kref *kref) { struct ib_gid_table_entry *entry = container_of(kref, struct ib_gid_table_entry, kref); queue_work(ib_wq, &entry->del_work); } static void put_gid_ndev(struct rcu_head *head) { struct roce_gid_ndev_storage *storage = container_of(head, struct roce_gid_ndev_storage, rcu_head); WARN_ON(!storage->ndev); /* At this point its safe to release netdev reference, * as all callers working on gid_attr->ndev are done * using this netdev. */ dev_put(storage->ndev); kfree(storage); } static void free_gid_entry_locked(struct ib_gid_table_entry *entry) { struct ib_device *device = entry->attr.device; u32 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); dev_dbg(&device->dev, "%s port=%u index=%u gid %pI6\n", __func__, port_num, entry->attr.index, entry->attr.gid.raw); write_lock_irq(&table->rwlock); /* * The only way to avoid overwriting NULL in table is * by comparing if it is same entry in table or not! * If new entry in table is added by the time we free here, * don't overwrite the table entry. */ if (entry == table->data_vec[entry->attr.index]) table->data_vec[entry->attr.index] = NULL; /* Now this index is ready to be allocated */ write_unlock_irq(&table->rwlock); if (entry->ndev_storage) call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev); kfree(entry); } static void free_gid_entry(struct kref *kref) { struct ib_gid_table_entry *entry = container_of(kref, struct ib_gid_table_entry, kref); free_gid_entry_locked(entry); } /** * free_gid_work - Release reference to the GID entry * @work: Work structure to refer to GID entry which needs to be * deleted. * * free_gid_work() frees the entry from the HCA's hardware table * if provider supports it. It releases reference to netdevice. */ static void free_gid_work(struct work_struct *work) { struct ib_gid_table_entry *entry = container_of(work, struct ib_gid_table_entry, del_work); struct ib_device *device = entry->attr.device; u32 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); mutex_lock(&table->lock); free_gid_entry_locked(entry); mutex_unlock(&table->lock); } static struct ib_gid_table_entry * alloc_gid_entry(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry; struct net_device *ndev; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return NULL; ndev = rcu_dereference_protected(attr->ndev, 1); if (ndev) { entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage), GFP_KERNEL); if (!entry->ndev_storage) { kfree(entry); return NULL; } dev_hold(ndev); entry->ndev_storage->ndev = ndev; } kref_init(&entry->kref); memcpy(&entry->attr, attr, sizeof(*attr)); INIT_WORK(&entry->del_work, free_gid_work); entry->state = GID_TABLE_ENTRY_INVALID; return entry; } static void store_gid_entry(struct ib_gid_table *table, struct ib_gid_table_entry *entry) { entry->state = GID_TABLE_ENTRY_VALID; dev_dbg(&entry->attr.device->dev, "%s port=%u index=%u gid %pI6\n", __func__, entry->attr.port_num, entry->attr.index, entry->attr.gid.raw); lockdep_assert_held(&table->lock); write_lock_irq(&table->rwlock); table->data_vec[entry->attr.index] = entry; write_unlock_irq(&table->rwlock); } static void get_gid_entry(struct ib_gid_table_entry *entry) { kref_get(&entry->kref); } static void put_gid_entry(struct ib_gid_table_entry *entry) { kref_put(&entry->kref, schedule_free_gid); } static void put_gid_entry_locked(struct ib_gid_table_entry *entry) { kref_put(&entry->kref, free_gid_entry); } static int add_roce_gid(struct ib_gid_table_entry *entry) { const struct ib_gid_attr *attr = &entry->attr; int ret; if (!attr->ndev) { dev_err(&attr->device->dev, "%s NULL netdev port=%u index=%u\n", __func__, attr->port_num, attr->index); return -EINVAL; } if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { ret = attr->device->ops.add_gid(attr, &entry->context); if (ret) { dev_err(&attr->device->dev, "%s GID add failed port=%u index=%u\n", __func__, attr->port_num, attr->index); return ret; } } return 0; } /** * del_gid - Delete GID table entry * * @ib_dev: IB device whose GID entry to be deleted * @port: Port number of the IB device * @table: GID table of the IB device for a port * @ix: GID entry index to delete * */ static void del_gid(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table, int ix) { struct roce_gid_ndev_storage *ndev_storage; struct ib_gid_table_entry *entry; lockdep_assert_held(&table->lock); dev_dbg(&ib_dev->dev, "%s port=%u index=%d gid %pI6\n", __func__, port, ix, table->data_vec[ix]->attr.gid.raw); write_lock_irq(&table->rwlock); entry = table->data_vec[ix]; entry->state = GID_TABLE_ENTRY_PENDING_DEL; /* * For non RoCE protocol, GID entry slot is ready to use. */ if (!rdma_protocol_roce(ib_dev, port)) table->data_vec[ix] = NULL; write_unlock_irq(&table->rwlock); if (rdma_cap_roce_gid_table(ib_dev, port)) ib_dev->ops.del_gid(&entry->attr, &entry->context); ndev_storage = entry->ndev_storage; if (ndev_storage) { entry->ndev_storage = NULL; rcu_assign_pointer(entry->attr.ndev, NULL); call_rcu(&ndev_storage->rcu_head, put_gid_ndev); } put_gid_entry_locked(entry); } /** * add_modify_gid - Add or modify GID table entry * * @table: GID table in which GID to be added or modified * @attr: Attributes of the GID * * Returns 0 on success or appropriate error code. It accepts zero * GID addition for non RoCE ports for HCA's who report them as valid * GID. However such zero GIDs are not added to the cache. */ static int add_modify_gid(struct ib_gid_table *table, const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry; int ret = 0; /* * Invalidate any old entry in the table to make it safe to write to * this index. */ if (is_gid_entry_valid(table->data_vec[attr->index])) del_gid(attr->device, attr->port_num, table, attr->index); /* * Some HCA's report multiple GID entries with only one valid GID, and * leave other unused entries as the zero GID. Convert zero GIDs to * empty table entries instead of storing them. */ if (rdma_is_zero_gid(&attr->gid)) return 0; entry = alloc_gid_entry(attr); if (!entry) return -ENOMEM; if (rdma_protocol_roce(attr->device, attr->port_num)) { ret = add_roce_gid(entry); if (ret) goto done; } store_gid_entry(table, entry); return 0; done: put_gid_entry(entry); return ret; } /* rwlock should be read locked, or lock should be held */ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, const struct ib_gid_attr *val, bool default_gid, unsigned long mask, int *pempty) { int i = 0; int found = -1; int empty = pempty ? -1 : 0; while (i < table->sz && (found < 0 || empty < 0)) { struct ib_gid_table_entry *data = table->data_vec[i]; struct ib_gid_attr *attr; int curr_index = i; i++; /* find_gid() is used during GID addition where it is expected * to return a free entry slot which is not duplicate. * Free entry slot is requested and returned if pempty is set, * so lookup free slot only if requested. */ if (pempty && empty < 0) { if (is_gid_entry_free(data) && default_gid == is_gid_index_default(table, curr_index)) { /* * Found an invalid (free) entry; allocate it. * If default GID is requested, then our * found slot must be one of the DEFAULT * reserved slots or we fail. * This ensures that only DEFAULT reserved * slots are used for default property GIDs. */ empty = curr_index; } } /* * Additionally find_gid() is used to find valid entry during * lookup operation; so ignore the entries which are marked as * pending for removal and the entries which are marked as * invalid. */ if (!is_gid_entry_valid(data)) continue; if (found >= 0) continue; attr = &data->attr; if (mask & GID_ATTR_FIND_MASK_GID_TYPE && attr->gid_type != val->gid_type) continue; if (mask & GID_ATTR_FIND_MASK_GID && memcmp(gid, &data->attr.gid, sizeof(*gid))) continue; if (mask & GID_ATTR_FIND_MASK_NETDEV && attr->ndev != val->ndev) continue; if (mask & GID_ATTR_FIND_MASK_DEFAULT && is_gid_index_default(table, curr_index) != default_gid) continue; found = curr_index; } if (pempty) *pempty = empty; return found; } static void make_default_gid(struct net_device *dev, union ib_gid *gid) { gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); addrconf_ifid_eui48(&gid->raw[8], dev); } static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr, unsigned long mask, bool default_gid) { struct ib_gid_table *table; int ret = 0; int empty; int ix; /* Do not allow adding zero GID in support of * IB spec version 1.3 section 4.1.1 point (6) and * section 12.7.10 and section 12.7.20 */ if (rdma_is_zero_gid(gid)) return -EINVAL; table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); ix = find_gid(table, gid, attr, default_gid, mask, &empty); if (ix >= 0) goto out_unlock; if (empty < 0) { ret = -ENOSPC; goto out_unlock; } attr->device = ib_dev; attr->index = empty; attr->port_num = port; attr->gid = *gid; ret = add_modify_gid(table, attr); if (!ret) dispatch_gid_change_event(ib_dev, port); out_unlock: mutex_unlock(&table->lock); if (ret) pr_warn("%s: unable to add gid %pI6 error=%d\n", __func__, gid->raw, ret); return ret; } int ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_NETDEV; return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); } static int _ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr, unsigned long mask, bool default_gid) { struct ib_gid_table *table; int ret = 0; int ix; table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); ix = find_gid(table, gid, attr, default_gid, mask, NULL); if (ix < 0) { ret = -EINVAL; goto out_unlock; } del_gid(ib_dev, port, table, ix); dispatch_gid_change_event(ib_dev, port); out_unlock: mutex_unlock(&table->lock); if (ret) pr_debug("%s: can't delete gid %pI6 error=%d\n", __func__, gid->raw, ret); return ret; } int ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_DEFAULT | GID_ATTR_FIND_MASK_NETDEV; return _ib_cache_gid_del(ib_dev, port, gid, attr, mask, false); } int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct ib_gid_table *table; int ix; bool deleted = false; table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); for (ix = 0; ix < table->sz; ix++) { if (is_gid_entry_valid(table->data_vec[ix]) && table->data_vec[ix]->attr.ndev == ndev) { del_gid(ib_dev, port, table, ix); deleted = true; } } mutex_unlock(&table->lock); if (deleted) dispatch_gid_change_event(ib_dev, port); return 0; } /** * rdma_find_gid_by_port - Returns the GID entry attributes when it finds * a valid GID entry for given search parameters. It searches for the specified * GID value in the local software cache. * @ib_dev: The device to query. * @gid: The GID value to search for. * @gid_type: The GID type to search for. * @port: The port number of the device where the GID value should be searched. * @ndev: In RoCE, the net device of the device. NULL means ignore. * * Returns sgid attributes if the GID is found with valid reference or * returns ERR_PTR for the error. * The caller must invoke rdma_put_gid_attr() to release the reference. */ const struct ib_gid_attr * rdma_find_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, enum ib_gid_type gid_type, u32 port, struct net_device *ndev) { int local_index; struct ib_gid_table *table; unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; const struct ib_gid_attr *attr; unsigned long flags; if (!rdma_is_port_valid(ib_dev, port)) return ERR_PTR(-ENOENT); table = rdma_gid_table(ib_dev, port); if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; read_lock_irqsave(&table->rwlock, flags); local_index = find_gid(table, gid, &val, false, mask, NULL); if (local_index >= 0) { get_gid_entry(table->data_vec[local_index]); attr = &table->data_vec[local_index]->attr; read_unlock_irqrestore(&table->rwlock, flags); return attr; } read_unlock_irqrestore(&table->rwlock, flags); return ERR_PTR(-ENOENT); } EXPORT_SYMBOL(rdma_find_gid_by_port); /** * rdma_find_gid_by_filter - Returns the GID table attribute where a * specified GID value occurs * @ib_dev: The device to query. * @gid: The GID value to search for. * @port: The port number of the device where the GID value could be * searched. * @filter: The filter function is executed on any matching GID in the table. * If the filter function returns true, the corresponding index is returned, * otherwise, we continue searching the GID table. It's guaranteed that * while filter is executed, ndev field is valid and the structure won't * change. filter is executed in an atomic context. filter must not be NULL. * @context: Private data to pass into the call-back. * * rdma_find_gid_by_filter() searches for the specified GID value * of which the filter function returns true in the port's GID table. * */ const struct ib_gid_attr *rdma_find_gid_by_filter( struct ib_device *ib_dev, const union ib_gid *gid, u32 port, bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *, void *), void *context) { const struct ib_gid_attr *res = ERR_PTR(-ENOENT); struct ib_gid_table *table; unsigned long flags; unsigned int i; if (!rdma_is_port_valid(ib_dev, port)) return ERR_PTR(-EINVAL); table = rdma_gid_table(ib_dev, port); read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { struct ib_gid_table_entry *entry = table->data_vec[i]; if (!is_gid_entry_valid(entry)) continue; if (memcmp(gid, &entry->attr.gid, sizeof(*gid))) continue; if (filter(gid, &entry->attr, context)) { get_gid_entry(entry); res = &entry->attr; break; } } read_unlock_irqrestore(&table->rwlock, flags); return res; } static struct ib_gid_table *alloc_gid_table(int sz) { struct ib_gid_table *table = kzalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL); if (!table->data_vec) goto err_free_table; mutex_init(&table->lock); table->sz = sz; rwlock_init(&table->rwlock); return table; err_free_table: kfree(table); return NULL; } static void release_gid_table(struct ib_device *device, struct ib_gid_table *table) { int i; if (!table) return; for (i = 0; i < table->sz; i++) { if (is_gid_entry_free(table->data_vec[i])) continue; WARN_ONCE(true, "GID entry ref leak for dev %s index %d ref=%u\n", dev_name(&device->dev), i, kref_read(&table->data_vec[i]->kref)); } mutex_destroy(&table->lock); kfree(table->data_vec); kfree(table); } static void cleanup_gid_table_port(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table) { int i; if (!table) return; mutex_lock(&table->lock); for (i = 0; i < table->sz; ++i) { if (is_gid_entry_valid(table->data_vec[i])) del_gid(ib_dev, port, table, i); } mutex_unlock(&table->lock); } void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port, struct net_device *ndev, unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode) { union ib_gid gid = { }; struct ib_gid_attr gid_attr; unsigned int gid_type; unsigned long mask; mask = GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_DEFAULT | GID_ATTR_FIND_MASK_NETDEV; memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { if (1UL << gid_type & ~gid_type_mask) continue; gid_attr.gid_type = gid_type; if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { make_default_gid(ndev, &gid); __ib_cache_gid_add(ib_dev, port, &gid, &gid_attr, mask, true); } else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) { _ib_cache_gid_del(ib_dev, port, &gid, &gid_attr, mask, true); } } } static void gid_table_reserve_default(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table) { unsigned int i; unsigned long roce_gid_type_mask; unsigned int num_default_gids; roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); num_default_gids = hweight_long(roce_gid_type_mask); /* Reserve starting indices for default GIDs */ for (i = 0; i < num_default_gids && i < table->sz; i++) table->default_gid_indices |= BIT(i); } static void gid_table_release_one(struct ib_device *ib_dev) { u32 p; rdma_for_each_port (ib_dev, p) { release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid); ib_dev->port_data[p].cache.gid = NULL; } } static int _gid_table_setup_one(struct ib_device *ib_dev) { struct ib_gid_table *table; u32 rdma_port; rdma_for_each_port (ib_dev, rdma_port) { table = alloc_gid_table( ib_dev->port_data[rdma_port].immutable.gid_tbl_len); if (!table) goto rollback_table_setup; gid_table_reserve_default(ib_dev, rdma_port, table); ib_dev->port_data[rdma_port].cache.gid = table; } return 0; rollback_table_setup: gid_table_release_one(ib_dev); return -ENOMEM; } static void gid_table_cleanup_one(struct ib_device *ib_dev) { u32 p; rdma_for_each_port (ib_dev, p) cleanup_gid_table_port(ib_dev, p, ib_dev->port_data[p].cache.gid); } static int gid_table_setup_one(struct ib_device *ib_dev) { int err; err = _gid_table_setup_one(ib_dev); if (err) return err; rdma_roce_rescan_device(ib_dev); return err; } /** * rdma_query_gid - Read the GID content from the GID software cache * @device: Device to query the GID * @port_num: Port number of the device * @index: Index of the GID table entry to read * @gid: Pointer to GID where to store the entry's GID * * rdma_query_gid() only reads the GID entry content for requested device, * port and index. It reads for IB, RoCE and iWarp link layers. It doesn't * hold any reference to the GID table entry in the HCA or software cache. * * Returns 0 on success or appropriate error code. * */ int rdma_query_gid(struct ib_device *device, u32 port_num, int index, union ib_gid *gid) { struct ib_gid_table *table; unsigned long flags; int res; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); if (index < 0 || index >= table->sz) { res = -EINVAL; goto done; } if (!is_gid_entry_valid(table->data_vec[index])) { res = -ENOENT; goto done; } memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid)); res = 0; done: read_unlock_irqrestore(&table->rwlock, flags); return res; } EXPORT_SYMBOL(rdma_query_gid); /** * rdma_read_gid_hw_context - Read the HW GID context from GID attribute * @attr: Potinter to the GID attribute * * rdma_read_gid_hw_context() reads the drivers GID HW context corresponding * to the SGID attr. Callers are required to already be holding the reference * to an existing GID entry. * * Returns the HW GID context * */ void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr) { return container_of(attr, struct ib_gid_table_entry, attr)->context; } EXPORT_SYMBOL(rdma_read_gid_hw_context); /** * rdma_find_gid - Returns SGID attributes if the matching GID is found. * @device: The device to query. * @gid: The GID value to search for. * @gid_type: The GID type to search for. * @ndev: In RoCE, the net device of the device. NULL means ignore. * * rdma_find_gid() searches for the specified GID value in the software cache. * * Returns GID attributes if a valid GID is found or returns ERR_PTR for the * error. The caller must invoke rdma_put_gid_attr() to release the reference. * */ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, const union ib_gid *gid, enum ib_gid_type gid_type, struct net_device *ndev) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; u32 p; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; rdma_for_each_port(device, p) { struct ib_gid_table *table; unsigned long flags; int index; table = device->port_data[p].cache.gid; read_lock_irqsave(&table->rwlock, flags); index = find_gid(table, gid, &gid_attr_val, false, mask, NULL); if (index >= 0) { const struct ib_gid_attr *attr; get_gid_entry(table->data_vec[index]); attr = &table->data_vec[index]->attr; read_unlock_irqrestore(&table->rwlock, flags); return attr; } read_unlock_irqrestore(&table->rwlock, flags); } return ERR_PTR(-ENOENT); } EXPORT_SYMBOL(rdma_find_gid); int ib_get_cached_pkey(struct ib_device *device, u32 port_num, int index, u16 *pkey) { struct ib_pkey_cache *cache; unsigned long flags; int ret = 0; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; if (!cache || index < 0 || index >= cache->table_len) ret = -EINVAL; else *pkey = cache->table[index]; read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_pkey); void ib_get_cached_subnet_prefix(struct ib_device *device, u32 port_num, u64 *sn_pfx) { unsigned long flags; read_lock_irqsave(&device->cache_lock, flags); *sn_pfx = device->port_data[port_num].cache.subnet_prefix; read_unlock_irqrestore(&device->cache_lock, flags); } EXPORT_SYMBOL(ib_get_cached_subnet_prefix); int ib_find_cached_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; int i; int ret = -ENOENT; int partial_ix = -1; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; if (!cache) { ret = -EINVAL; goto err; } *index = -1; for (i = 0; i < cache->table_len; ++i) if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { if (cache->table[i] & 0x8000) { *index = i; ret = 0; break; } else { partial_ix = i; } } if (ret && partial_ix >= 0) { *index = partial_ix; ret = 0; } err: read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_find_cached_pkey); int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; int i; int ret = -ENOENT; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; if (!cache) { ret = -EINVAL; goto err; } *index = -1; for (i = 0; i < cache->table_len; ++i) if (cache->table[i] == pkey) { *index = i; ret = 0; break; } err: read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_find_exact_cached_pkey); int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc) { unsigned long flags; int ret = 0; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); *lmc = device->port_data[port_num].cache.lmc; read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_lmc); int ib_get_cached_port_state(struct ib_device *device, u32 port_num, enum ib_port_state *port_state) { unsigned long flags; int ret = 0; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); *port_state = device->port_data[port_num].cache.port_state; read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_port_state); /** * rdma_get_gid_attr - Returns GID attributes for a port of a device * at a requested gid_index, if a valid GID entry exists. * @device: The device to query. * @port_num: The port number on the device where the GID value * is to be queried. * @index: Index of the GID table entry whose attributes are to * be queried. * * rdma_get_gid_attr() acquires reference count of gid attributes from the * cached GID table. Caller must invoke rdma_put_gid_attr() to release * reference to gid attribute regardless of link layer. * * Returns pointer to valid gid attribute or ERR_PTR for the appropriate error * code. */ const struct ib_gid_attr * rdma_get_gid_attr(struct ib_device *device, u32 port_num, int index) { const struct ib_gid_attr *attr = ERR_PTR(-ENODATA); struct ib_gid_table *table; unsigned long flags; if (!rdma_is_port_valid(device, port_num)) return ERR_PTR(-EINVAL); table = rdma_gid_table(device, port_num); if (index < 0 || index >= table->sz) return ERR_PTR(-EINVAL); read_lock_irqsave(&table->rwlock, flags); if (!is_gid_entry_valid(table->data_vec[index])) goto done; get_gid_entry(table->data_vec[index]); attr = &table->data_vec[index]->attr; done: read_unlock_irqrestore(&table->rwlock, flags); return attr; } EXPORT_SYMBOL(rdma_get_gid_attr); /** * rdma_query_gid_table - Reads GID table entries of all the ports of a device up to max_entries. * @device: The device to query. * @entries: Entries where GID entries are returned. * @max_entries: Maximum number of entries that can be returned. * Entries array must be allocated to hold max_entries number of entries. * * Returns number of entries on success or appropriate error code. */ ssize_t rdma_query_gid_table(struct ib_device *device, struct ib_uverbs_gid_entry *entries, size_t max_entries) { const struct ib_gid_attr *gid_attr; ssize_t num_entries = 0, ret; struct ib_gid_table *table; u32 port_num, i; struct net_device *ndev; unsigned long flags; rdma_for_each_port(device, port_num) { table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { if (!is_gid_entry_valid(table->data_vec[i])) continue; if (num_entries >= max_entries) { ret = -EINVAL; goto err; } gid_attr = &table->data_vec[i]->attr; memcpy(&entries->gid, &gid_attr->gid, sizeof(gid_attr->gid)); entries->gid_index = gid_attr->index; entries->port_num = gid_attr->port_num; entries->gid_type = gid_attr->gid_type; ndev = rcu_dereference_protected( gid_attr->ndev, lockdep_is_held(&table->rwlock)); if (ndev) entries->netdev_ifindex = ndev->ifindex; num_entries++; entries++; } read_unlock_irqrestore(&table->rwlock, flags); } return num_entries; err: read_unlock_irqrestore(&table->rwlock, flags); return ret; } EXPORT_SYMBOL(rdma_query_gid_table); /** * rdma_put_gid_attr - Release reference to the GID attribute * @attr: Pointer to the GID attribute whose reference * needs to be released. * * rdma_put_gid_attr() must be used to release reference whose * reference is acquired using rdma_get_gid_attr() or any APIs * which returns a pointer to the ib_gid_attr regardless of link layer * of IB or RoCE. * */ void rdma_put_gid_attr(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); put_gid_entry(entry); } EXPORT_SYMBOL(rdma_put_gid_attr); /** * rdma_hold_gid_attr - Get reference to existing GID attribute * * @attr: Pointer to the GID attribute whose reference * needs to be taken. * * Increase the reference count to a GID attribute to keep it from being * freed. Callers are required to already be holding a reference to attribute. * */ void rdma_hold_gid_attr(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); get_gid_entry(entry); } EXPORT_SYMBOL(rdma_hold_gid_attr); /** * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice * which must be in UP state. * * @attr:Pointer to the GID attribute * * Returns pointer to netdevice if the netdevice was attached to GID and * netdevice is in UP state. Caller must hold RCU lock as this API * reads the netdev flags which can change while netdevice migrates to * different net namespace. Returns ERR_PTR with error code otherwise. * */ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); struct ib_device *device = entry->attr.device; struct net_device *ndev = ERR_PTR(-EINVAL); u32 port_num = entry->attr.port_num; struct ib_gid_table *table; unsigned long flags; bool valid; table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); valid = is_gid_entry_valid(table->data_vec[attr->index]); if (valid) { ndev = rcu_dereference(attr->ndev); if (!ndev) ndev = ERR_PTR(-ENODEV); } read_unlock_irqrestore(&table->rwlock, flags); return ndev; } EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu); static int get_lower_dev_vlan(struct net_device *lower_dev, struct netdev_nested_priv *priv) { u16 *vlan_id = (u16 *)priv->data; if (is_vlan_dev(lower_dev)) *vlan_id = vlan_dev_vlan_id(lower_dev); /* We are interested only in first level vlan device, so * always return 1 to stop iterating over next level devices. */ return 1; } /** * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address * of a GID entry. * * @attr: GID attribute pointer whose L2 fields to be read * @vlan_id: Pointer to vlan id to fill up if the GID entry has * vlan id. It is optional. * @smac: Pointer to smac to fill up for a GID entry. It is optional. * * rdma_read_gid_l2_fields() returns 0 on success and returns vlan id * (if gid entry has vlan) and source MAC, or returns error. */ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, u16 *vlan_id, u8 *smac) { struct netdev_nested_priv priv = { .data = (void *)vlan_id, }; struct net_device *ndev; rcu_read_lock(); ndev = rcu_dereference(attr->ndev); if (!ndev) { rcu_read_unlock(); return -ENODEV; } if (smac) ether_addr_copy(smac, ndev->dev_addr); if (vlan_id) { *vlan_id = 0xffff; if (is_vlan_dev(ndev)) { *vlan_id = vlan_dev_vlan_id(ndev); } else { /* If the netdev is upper device and if it's lower * device is vlan device, consider vlan id of * the lower vlan device for this gid entry. */ netdev_walk_all_lower_dev_rcu(attr->ndev, get_lower_dev_vlan, &priv); } } rcu_read_unlock(); return 0; } EXPORT_SYMBOL(rdma_read_gid_l2_fields); static int config_non_roce_gid_cache(struct ib_device *device, u32 port, struct ib_port_attr *tprops) { struct ib_gid_attr gid_attr = {}; struct ib_gid_table *table; int ret = 0; int i; gid_attr.device = device; gid_attr.port_num = port; table = rdma_gid_table(device, port); mutex_lock(&table->lock); for (i = 0; i < tprops->gid_tbl_len; ++i) { if (!device->ops.query_gid) continue; ret = device->ops.query_gid(device, port, i, &gid_attr.gid); if (ret) { dev_warn(&device->dev, "query_gid failed (%d) for index %d\n", ret, i); goto err; } if (rdma_protocol_iwarp(device, port)) { struct net_device *ndev; ndev = ib_device_get_netdev(device, port); if (!ndev) continue; RCU_INIT_POINTER(gid_attr.ndev, ndev); dev_put(ndev); } gid_attr.index = i; tprops->subnet_prefix = be64_to_cpu(gid_attr.gid.global.subnet_prefix); add_modify_gid(table, &gid_attr); } err: mutex_unlock(&table->lock); return ret; } static int ib_cache_update(struct ib_device *device, u32 port, bool update_gids, bool update_pkeys, bool enforce_security) { struct ib_port_attr *tprops = NULL; struct ib_pkey_cache *pkey_cache = NULL; struct ib_pkey_cache *old_pkey_cache = NULL; int i; int ret; if (!rdma_is_port_valid(device, port)) return -EINVAL; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) return -ENOMEM; ret = ib_query_port(device, port, tprops); if (ret) { dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret); goto err; } if (!rdma_protocol_roce(device, port) && update_gids) { ret = config_non_roce_gid_cache(device, port, tprops); if (ret) goto err; } update_pkeys &= !!tprops->pkey_tbl_len; if (update_pkeys) { pkey_cache = kmalloc(struct_size(pkey_cache, table, tprops->pkey_tbl_len), GFP_KERNEL); if (!pkey_cache) { ret = -ENOMEM; goto err; } pkey_cache->table_len = tprops->pkey_tbl_len; for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); if (ret) { dev_warn(&device->dev, "ib_query_pkey failed (%d) for index %d\n", ret, i); goto err; } } } write_lock_irq(&device->cache_lock); if (update_pkeys) { old_pkey_cache = device->port_data[port].cache.pkey; device->port_data[port].cache.pkey = pkey_cache; } device->port_data[port].cache.lmc = tprops->lmc; device->port_data[port].cache.port_state = tprops->state; device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix; write_unlock_irq(&device->cache_lock); if (enforce_security) ib_security_cache_change(device, port, tprops->subnet_prefix); kfree(old_pkey_cache); kfree(tprops); return 0; err: kfree(pkey_cache); kfree(tprops); return ret; } static void ib_cache_event_task(struct work_struct *_work) { struct ib_update_work *work = container_of(_work, struct ib_update_work, work); int ret; /* Before distributing the cache update event, first sync * the cache. */ ret = ib_cache_update(work->event.device, work->event.element.port_num, work->event.event == IB_EVENT_GID_CHANGE, work->event.event == IB_EVENT_PKEY_CHANGE, work->enforce_security); /* GID event is notified already for individual GID entries by * dispatch_gid_change_event(). Hence, notifiy for rest of the * events. */ if (!ret && work->event.event != IB_EVENT_GID_CHANGE) ib_dispatch_event_clients(&work->event); kfree(work); } static void ib_generic_event_task(struct work_struct *_work) { struct ib_update_work *work = container_of(_work, struct ib_update_work, work); ib_dispatch_event_clients(&work->event); kfree(work); } static bool is_cache_update_event(const struct ib_event *event) { return (event->event == IB_EVENT_PORT_ERR || event->event == IB_EVENT_PORT_ACTIVE || event->event == IB_EVENT_LID_CHANGE || event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_CLIENT_REREGISTER || event->event == IB_EVENT_GID_CHANGE); } /** * ib_dispatch_event - Dispatch an asynchronous event * @event:Event to dispatch * * Low-level drivers must call ib_dispatch_event() to dispatch the * event to all registered event handlers when an asynchronous event * occurs. */ void ib_dispatch_event(const struct ib_event *event) { struct ib_update_work *work; work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) return; if (is_cache_update_event(event)) INIT_WORK(&work->work, ib_cache_event_task); else INIT_WORK(&work->work, ib_generic_event_task); work->event = *event; if (event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_GID_CHANGE) work->enforce_security = true; queue_work(ib_wq, &work->work); } EXPORT_SYMBOL(ib_dispatch_event); int ib_cache_setup_one(struct ib_device *device) { u32 p; int err; err = gid_table_setup_one(device); if (err) return err; rdma_for_each_port (device, p) { err = ib_cache_update(device, p, true, true, true); if (err) { gid_table_cleanup_one(device); return err; } } return 0; } void ib_cache_release_one(struct ib_device *device) { u32 p; /* * The release function