| 89 90 84 96 96 90 90 90 90 89 90 5 5 120 434 453 454 142 406 21 393 4 407 11 395 407 124 124 1 1 120 2 203 193 5 196 196 201 142 183 182 307 307 8 8 4 1 201 201 195 3 30 84 90 1 89 1 1 1 124 351 353 353 353 337 16 352 352 352 352 818 702 27 90 286 66 209 28 8 236 8 7 2 4 26 26 1 1 2 5 9 10 17 1 4 18 2 21 19 18 1 15 1 2 2 15 1 577 578 2 2 2 2 3 3 218 216 199 192 8 197 195 188 3 13 5 1 4 2 2 2 206 198 7 4 2 3 3 3 5 1 1 2 3 3 16 9 8 14 2 16 12 5 1 11 11 1 16 2 144 4 2 142 132 18 144 176 23 8 146 176 179 119 109 4 5 154 154 132 312 314 1 1 303 2 8 1 6 5 280 8 26 307 2 6 298 298 26 272 299 229 35 3 1 2 5 1 1 149 164 19 151 141 18 171 13 4 166 164 3 166 119 194 52 141 5 189 30 160 1092 1092 3 1084 54 23 40 54 1031 15 4 64 1090 1087 8 10 10 2 2 2 373 375 1 1 374 374 374 369 3 4 5 14 5 368 2 375 246 222 372 14 52 52 52 52 18 17 1 334 334 332 5 333 1 50 1 1 1 200 200 49 49 785 785 333 335 333 333 333 332 334 770 17 563 572 766 298 298 297 5 5 298 174 60 117 96 22 116 114 64 64 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 
781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 
1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 
1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 
2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: semantics. 
* * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/netlink.h> #include <linux/hash.h> #include <linux/nospec.h> #include <net/arp.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/nexthop.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/addrconf.h> #include "fib_lookup.h" /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. 
*/

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
 * Both iteration macros open a brace scope followed by a for loop over
 * the fib_nh array embedded in @fi; the scope has to be closed again
 * with endfor_nexthops().  Inside the loop body "nhsel" is the index of
 * the current nexthop and "nh" (for_nexthops, const) or "nexthop_nh"
 * (change_nexthops, writable) points at it.
 */
#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < fib_info_num_path((fi));				\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nexthop_nh;				\
	for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	     nhsel < fib_info_num_path((fi));				\
	     nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the brace scope opened by for_nexthops()/change_nexthops(). */
#define endfor_nexthops(fi) }


/*
 * Table indexed by route type (RTN_*).  Each entry carries the error
 * code and the scope associated with that type.
 */
const struct fib_prop fib_props[RTN_MAX + 1] = {
	[RTN_UNSPEC] = {
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_UNICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_LOCAL] = {
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},
	[RTN_BROADCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_ANYCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_MULTICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_BLACKHOLE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_UNREACHABLE] = {
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_PROHIBIT] = {
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_THROW] = {
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_NAT] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_XRESOLVE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
};

/*
 * Release the route cached in the slot @rtp, if there is one: detach
 * it from its device (dst_dev_put) and drop the dst reference
 * immediately.  The slot itself is left untouched.
 */
static void rt_fibinfo_free(struct rtable __rcu **rtp)
{
	struct rtable *rt = rcu_dereference_protected(*rtp, 1);

	if (!rt)
		return;

	/* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
	 * because we waited an RCU grace period before calling
	 * free_fib_info_rcu()
	 */

	dst_dev_put(&rt->dst);
	dst_release_immediate(&rt->dst);
}

/*
 * Free the nexthop exception table of @nhc: for each of the
 * FNHE_HASH_SIZE buckets walk the chain, release the input and output
 * routes cached in every exception, free the exception, and finally
 * free the bucket array.  Does nothing when no table was allocated.
 */
static void free_nh_exceptions(struct fib_nh_common *nhc)
{
	struct fnhe_hash_bucket *hash;
	int i;

	hash = rcu_dereference_protected(nhc->nhc_exceptions, 1);
	if (!hash)
		return;
	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		fnhe = rcu_dereference_protected(hash[i].chain, 1);
		while (fnhe) {
			struct fib_nh_exception *next;

			/* Fetch the successor before fnhe is freed. */
			next = rcu_dereference_protected(fnhe->fnhe_next, 1);

			rt_fibinfo_free(&fnhe->fnhe_rth_input);
			rt_fibinfo_free(&fnhe->fnhe_rth_output);

			kfree(fnhe);

			fnhe = next;
		}
	}
	kfree(hash);
}

/*
 * Release the route cached for every possible CPU in the per-CPU array
 * @rtp, then free the array itself.  A NULL @rtp is a no-op.
 */
static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
{
	int cpu;

	if (!rtp)
		return;

	for_each_possible_cpu(cpu) {
		struct rtable *rt;

		rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
		if (rt) {
			dst_dev_put(&rt->dst);
			dst_release_immediate(&rt->dst);
		}
	}
	free_percpu(rtp);
}

/*
 * Release everything a fib_nh_common holds: the tracked device
 * reference, the lwtunnel state, the per-CPU output route cache, the
 * cached input route and the exception table.  @nhc itself is not
 * freed here.
 */
void fib_nh_common_release(struct fib_nh_common *nhc)
{
	netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker);
	lwtstate_put(nhc->nhc_lwtstate);
	rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
	rt_fibinfo_free(&nhc->nhc_rth_input);
	free_nh_exceptions(nhc);
}
EXPORT_SYMBOL_GPL(fib_nh_common_release);

/*
 * Release an IPv4 nexthop.  With CONFIG_IP_ROUTE_CLASSID a nexthop that
 * carries a non-zero class id first gives back its count in
 * net->ipv4.fib_num_tclassid_users (incremented in fib_nh_init()); the
 * common part is then released.
 */
void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (fib_nh->nh_tclassid)
		atomic_dec(&net->ipv4.fib_num_tclassid_users);
#endif
	fib_nh_common_release(&fib_nh->nh_common);
}

/* Release a nexthop info record */
/*
 * RCU callback queued by free_fib_info().  Drops either the reference
 * on the nexthop object (fi->nh set) or releases every embedded fib_nh,
 * puts the metrics and frees the fib_info.
 */
static void free_fib_info_rcu(struct rcu_head *head)
{
	struct fib_info *fi = container_of(head, struct fib_info, rcu);

	if (fi->nh) {
		nexthop_put(fi->nh);
	} else {
		change_nexthops(fi) {
			fib_nh_release(fi->fib_net, nexthop_nh);
		} endfor_nexthops(fi);
	}

	ip_fib_metrics_put(fi->fib_metrics);

	kfree(fi);
}

/*
 * Queue @fi for freeing through call_rcu_hurry().  A fib_info whose
 * fib_dead is still 0 is not freed: a warning is printed and the
 * function returns.
 */
void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		pr_warn("Freeing alive fib_info %p\n", fi);
		return;
	}

	call_rcu_hurry(&fi->rcu, free_fib_info_rcu);
}
EXPORT_SYMBOL_GPL(free_fib_info);

/*
 * Drop one fib_treeref reference on @fi (NULL is accepted).  Asserts
 * that RTNL is held.  When the last tree reference goes away the
 * fib_info is unlinked from the info hash, from the prefsrc hash (only
 * if it has a prefsrc), and from either the nexthop object's list or
 * the per-device nexthop hashes; it is then marked dead and
 * fib_info_put() is called.
 */
void fib_release_info(struct fib_info *fi)
{
	ASSERT_RTNL();
	if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
		hlist_del(&fi->fib_hash);
		fi->fib_net->ipv4.fib_info_cnt--;

		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		if (fi->nh) {
			list_del(&fi->nh_list);
		} else {
			change_nexthops(fi) {
				/* Nexthops without a device are skipped. */
				if (!nexthop_nh->fib_nh_dev)
					continue;
				hlist_del_rcu(&nexthop_nh->nh_hash);
			} endfor_nexthops(fi)
		}
		/* Paired with READ_ONCE() from fib_table_lookup() */
		WRITE_ONCE(fi->fib_dead, 1);
		fib_info_put(fi);
	}
}

/*
 * Compare the nexthops of @fi and @ofi.  Returns 0 when they are
 * considered equal and -1 otherwise.
 *
 * If either side uses a nexthop object the decision is delegated to
 * nexthop_cmp().  If @ofi has no embedded nexthops the result is 0.
 * Otherwise every nexthop of @fi is compared with the one at the same
 * index in @ofi: oif, gateway family, scope, weight and class id (when
 * configured in), encapsulation, flags outside RTNH_COMPARE_MASK, and
 * the gateway address for the matching family.
 */
static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi)
{
	const struct fib_nh *onh;

	if (fi->nh || ofi->nh)
		return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1;

	if (ofi->fib_nhs == 0)
		return 0;

	for_nexthops(fi) {
		onh = fib_info_nh(ofi, nhsel);

		if (nh->fib_nh_oif != onh->fib_nh_oif ||
		    nh->fib_nh_gw_family != onh->fib_nh_gw_family ||
		    nh->fib_nh_scope != onh->fib_nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->fib_nh_weight != onh->fib_nh_weight ||
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) ||
		    ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK))
			return -1;

		if (nh->fib_nh_gw_family == AF_INET &&
		    nh->fib_nh_gw4 != onh->fib_nh_gw4)
			return -1;

		if (nh->fib_nh_gw_family == AF_INET6 &&
		    ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6))
			return -1;
	} endfor_nexthops(fi);
	return 0;
}

/* Return the head of the nexthop hash list kept in @dev. */
static struct hlist_head *fib_nh_head(struct net_device *dev)
{
	return &dev->fib_nh_head;
}

/*
 * First hashing stage: XOR-fold protocol, scope, preferred source and
 * priority into @init_val.  The result is not yet reduced to a bucket
 * index; see fib_info_hashfn_result().
 */
static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
				      u32 prefsrc, u32 priority)
{
	unsigned int val = init_val;

	val ^= (protocol << 8) | scope;
	val ^= prefsrc;
	val ^= priority;

	return val;
}

/*
 * Final hashing stage: mix in the per-netns value and reduce to
 * net->ipv4.fib_info_hash_bits bits with hash_32().
 */
static unsigned int fib_info_hashfn_result(const struct net *net,
					   unsigned int val)
{
	return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits);
}

/*
 * Return the bucket of the netns info hash that @fi belongs to.  The
 * hash starts from fib_nhs, protocol, scope, prefsrc and priority and
 * additionally folds in the nexthop object id, or the oif of every
 * embedded nexthop when no nexthop object is used.
 */
static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi)
{
	struct net *net = fi->fib_net;
	unsigned int val;

	val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol,
				fi->fib_scope, (__force u32)fi->fib_prefsrc,
				fi->fib_priority);

	if (fi->nh) {
		val ^= fi->nh->id;
	} else {
		for_nexthops(fi) {
			val ^= nh->fib_nh_oif;
		} endfor_nexthops(fi)
	}

	return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)];
}

/*
 * Return the bucket for preferred-source address @val.  These buckets
 * live in the second half of net->ipv4.fib_info_hash, hence the
 * (1 << hash_bits) offset.
 */
static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net,
						    __be32 val)
{
	unsigned int hash_bits = net->ipv4.fib_info_hash_bits;
	u32 slot;

	slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits);

	return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot];
}

/*
 * Allocate a table of 2 * (1 << hash_bits) list heads.  The return
 * value is checked against NULL by the caller in this file.
 */
static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits)
{
	/* The second half is used for prefsrc */
	return kvzalloc_objs(struct hlist_head, (1 << hash_bits) * 2);
}

/* Free a table obtained from fib_info_hash_alloc(). */
static void fib_info_hash_free(struct hlist_head *head)
{
	kvfree(head);
}

/*
 * Double the netns info hash once the number of fib_infos has reached
 * the current bucket count.  Returns silently, leaving the old table in
 * place, if it is not yet time to grow or if the allocation fails.
 *
 * The new table and bit count are installed first so that
 * fib_info_hash_bucket() and fib_info_laddrhash_bucket() compute
 * positions in the new table; then every entry of the first half
 * (fib_hash) and of the second half (fib_lhash) of the old table is
 * re-linked, and the old table is freed.
 */
static void fib_info_hash_grow(struct net *net)
{
	unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits;
	struct hlist_head *new_info_hash, *old_info_hash;
	unsigned int i;

	if (net->ipv4.fib_info_cnt < old_size)
		return;

	new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1);
	if (!new_info_hash)
		return;

	old_info_hash = net->ipv4.fib_info_hash;
	net->ipv4.fib_info_hash = new_info_hash;
	net->ipv4.fib_info_hash_bits += 1;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &old_info_hash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, head, fib_hash)
			hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));
	}

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &old_info_hash[old_size + i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, lhead, fib_lhash)
			hlist_add_head(&fi->fib_lhash,
				       fib_info_laddrhash_bucket(fi->fib_net,
								 fi->fib_prefsrc));
	}

	fib_info_hash_free(old_info_hash);
}

/* no metrics, only nexthop id */
/*
 * Look up an existing fib_info that uses the nexthop object
 * cfg->fc_nh_id and matches @cfg in protocol, scope, prefsrc,
 * priority, type, table and flags (ignoring RTNH_COMPARE_MASK bits).
 * Returns the entry or NULL.
 *
 * NOTE(review): the bucket is computed with fc_nh_id as the initial
 * value, whereas fib_info_hash_bucket() starts from fib_nhs and XORs in
 * nh->id; the two agree only if fib_nhs is 0 for fib_infos that use a
 * nexthop object - confirm.
 */
static struct fib_info *fib_find_info_nh(struct net *net,
					 const struct fib_config *cfg)
{
	struct hlist_head *head;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn_1(cfg->fc_nh_id,
				 cfg->fc_protocol, cfg->fc_scope,
				 (__force u32)cfg->fc_prefsrc,
				 cfg->fc_priority);
	hash = fib_info_hashfn_result(net, hash);
	head = &net->ipv4.fib_info_hash[hash];

	hlist_for_each_entry(fi, head, fib_hash) {
		if (!fi->nh || fi->nh->id != cfg->fc_nh_id)
			continue;

		if (cfg->fc_protocol == fi->fib_protocol &&
		    cfg->fc_scope == fi->fib_scope &&
		    cfg->fc_prefsrc == fi->fib_prefsrc &&
		    cfg->fc_priority == fi->fib_priority &&
		    cfg->fc_type == fi->fib_type &&
		    cfg->fc_table == fi->fib_tb_id &&
		    !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK))
			return fi;
	}

	return NULL;
}

/*
 * Look up an already hashed fib_info equivalent to the candidate @nfi:
 * same number of nexthops, protocol, scope, prefsrc, priority, type,
 * table, metrics (first RTAX_MAX u32 values compared with memcmp),
 * flags outside RTNH_COMPARE_MASK, and nexthops equal according to
 * nh_comp().  Returns the existing entry or NULL.
 */
static struct fib_info *fib_find_info(struct fib_info *nfi)
{
	struct hlist_head *head = fib_info_hash_bucket(nfi);
	struct fib_info *fi;

	hlist_for_each_entry(fi, head, fib_hash) {
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;

		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_scope == fi->fib_scope &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    nfi->fib_type == fi->fib_type &&
		    nfi->fib_tb_id == fi->fib_tb_id &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(u32) * RTAX_MAX) == 0 &&
		    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
		    nh_comp(fi, nfi) == 0)
			return fi;
	}

	return NULL;
}

/* Check, that the gateway is already configured.
* Used only by redirect accept routine, under rcu_read_lock();
 */
/*
 * Walks the nexthops hashed on @dev and returns 0 as soon as one that
 * is not flagged RTNH_F_DEAD has @gw as its IPv4 gateway; returns -1
 * when none does.
 */
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct fib_nh *nh;

	head = fib_nh_head(dev);

	hlist_for_each_entry_rcu(nh, head, nh_hash) {
		DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
		if (nh->fib_nh_gw4 == gw &&
		    !(nh->fib_nh_flags & RTNH_F_DEAD)) {
			return 0;
		}
	}

	return -1;
}

/*
 * Compute the buffer size to allocate for a netlink route message
 * describing @fi: the rtmsg header, the fixed attributes, a nested
 * metrics block, the nexthop id when a nexthop object is used, and one
 * rtnexthop (plus flow, gateway and encap attributes) per path.
 * rtmsg_fib() treats -EMSGSIZE from the dump as a bug in this estimate.
 */
size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4) /* RTA_PREFSRC */
			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
	unsigned int nhs = fib_info_num_path(fi);

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->nh)
		payload += nla_total_size(4); /* RTA_NH_ID */

	if (nhs) {
		size_t nh_encapsize = 0;
		/* Also handles the special case nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
		unsigned int i;

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* grab encap info */
		for (i = 0; i < fib_info_num_path(fi); i++) {
			struct fib_nh_common *nhc = fib_info_nhc(fi, i);

			if (nhc->nhc_lwtstate) {
				/* RTA_ENCAP_TYPE */
				/*
				 * NOTE(review): this label and the next one
				 * look swapped - the 2-byte attribute matches
				 * RTA_ENCAP_TYPE, which fib_get_nhs() reads
				 * with nla_get_u16().  The sum is unaffected.
				 */
				nh_encapsize += lwtunnel_get_encap_size(
						nhc->nhc_lwtstate);
				/* RTA_ENCAP */
				nh_encapsize +=  nla_total_size(2);
			}
		}

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size((nhs * nhsize) + nh_encapsize);

	}

	return payload;
}

/*
 * Build a netlink message of type @event for the route described by
 * @key/@dst_len, @fa and @tb_id, and send it to the RTNLGRP_IPV4_ROUTE
 * group with rtnl_notify().  The sequence number is taken from
 * info->nlh when present, otherwise 0.  If allocation or the dump
 * fails, the error is reported to the group's listeners through
 * rtnl_set_sk_err() instead.
 */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, const struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct fib_rt_info fri;
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (!skb)
		goto errout;

	fri.fi = fa->fa_info;
	fri.tb_id = tb_id;
	fri.dst = key;
	fri.dst_len = dst_len;
	fri.dscp = fa->fa_dscp;
	fri.type = fa->fa_type;
	fri.offload = READ_ONCE(fa->offload);
	fri.trap = READ_ONCE(fa->trap);
	fri.offload_failed = READ_ONCE(fa->offload_failed);
	err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

/*
 * Judge the first nexthop of @fi by the neighbour state of its gateway.
 *
 * Returns 0 when there is no neighbour entry for the gateway (or no
 * gateway family), when the neighbour is NUD_REACHABLE, or when its
 * state is within NUD_VALID and @order differs from @dflt.  Returns 1
 * otherwise; before doing so, @fi and @order are stored in
 * *@last_resort / *@last_idx if the state is within NUD_VALID, or if no
 * last resort has been recorded yet (*@last_idx < 0), @order is beyond
 * @dflt and the state is not NUD_INCOMPLETE.
 */
static int fib_detect_death(struct fib_info *fi, int order,
			    struct fib_info **last_resort, int *last_idx,
			    int dflt)
{
	const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
	struct neighbour *n;
	int state = NUD_NONE;

	if (likely(nhc->nhc_gw_family == AF_INET))
		n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev);
	else if (nhc->nhc_gw_family == AF_INET6)
		n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6,
				 nhc->nhc_dev);
	else
		n = NULL;

	if (n) {
		state = READ_ONCE(n->nud_state);
		neigh_release(n);
	} else {
		return 0;
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state & NUD_VALID) && order != dflt)
		return 0;
	if ((state & NUD_VALID) ||
	    (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

/*
 * Initialise the family-independent part of a nexthop: allocate the
 * per-CPU output route cache with @gfp_flags and, when @encap is given,
 * build the lightweight tunnel state and store a reference to it.
 *
 * Returns 0 on success, -ENOMEM if the per-CPU allocation fails, or the
 * error from lwtunnel_build_state(); in the latter case the per-CPU
 * array is freed again and the pointer reset to NULL.
 */
int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
		       struct nlattr *encap, u16 encap_type,
		       void *cfg, gfp_t gfp_flags,
		       struct netlink_ext_ack *extack)
{
	int err;

	nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *,
						    gfp_flags);
	if (!nhc->nhc_pcpu_rth_output)
		return -ENOMEM;

	if (encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(net, encap_type, encap,
					   nhc->nhc_family, cfg, &lwtstate,
					   extack);
		if (err)
			goto lwt_failure;

		nhc->nhc_lwtstate = lwtstate_get(lwtstate);
	}

	return 0;

lwt_failure:
	rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
	nhc->nhc_pcpu_rth_output = NULL;
	return err;
}
EXPORT_SYMBOL_GPL(fib_nh_common_init);

/*
 * Initialise the IPv4 nexthop @nh from @cfg: family AF_INET, common
 * part (GFP_KERNEL), output interface, gateway family and address,
 * flags, and - depending on configuration - the class id (counting it
 * in net->ipv4.fib_num_tclassid_users when non-zero) and @nh_weight.
 * Returns 0 or the error from fib_nh_common_init().
 */
int fib_nh_init(struct net *net, struct fib_nh *nh,
		struct fib_config *cfg, int nh_weight,
		struct netlink_ext_ack *extack)
{
	int err;

	nh->fib_nh_family = AF_INET;

	err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, GFP_KERNEL, extack);
	if (err)
		return err;

	nh->fib_nh_oif = cfg->fc_oif;
	nh->fib_nh_gw_family = cfg->fc_gw_family;
	if (cfg->fc_gw_family == AF_INET)
		nh->fib_nh_gw4 = cfg->fc_gw4;
	else if (cfg->fc_gw_family == AF_INET6)
		nh->fib_nh_gw6 = cfg->fc_gw6;

	nh->fib_nh_flags = cfg->fc_flags;

#ifdef CONFIG_IP_ROUTE_CLASSID
	nh->nh_tclassid = cfg->fc_flow;
	if (nh->nh_tclassid)
		atomic_inc(&net->ipv4.fib_num_tclassid_users);
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	nh->fib_nh_weight = nh_weight;
#endif
	return 0;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
 * Count the rtnexthop records in the @remaining bytes starting at
 * @rtnh.  Returns the count, or 0 (with an extack message) when bytes
 * are left over after the last well-formed record.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
			      struct netlink_ext_ack *extack)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0) {
		NL_SET_ERR_MSG(extack,
			       "Invalid nexthop configuration - extra data after nexthops");
		nhs = 0;
	}

	return nhs;
}

/*
 * Read an IPv4 gateway address from attribute @nla into *@gw.  Returns
 * -EINVAL (with an extack message) if the attribute payload is shorter
 * than an IPv4 address, 0 otherwise.
 */
static int fib_gw_from_attr(__be32 *gw, struct nlattr *nla,
			    struct netlink_ext_ack *extack)
{
	if (nla_len(nla) < sizeof(*gw)) {
		NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_GATEWAY");
		return -EINVAL;
	}

	*gw = nla_get_in_addr(nla);

	return 0;
}

/* only called when fib_nh is integrated into fib_info */
/*
 * Fill the embedded nexthops of @fi from the rtnexthop records at
 * @rtnh (@remaining bytes).  For each nexthop a temporary fib_config is
 * built from the record (flags, ifindex, RTA_GATEWAY or RTA_VIA,
 * RTA_FLOW, RTA_ENCAP, RTA_ENCAP_TYPE) and passed to fib_nh_init() with
 * weight rtnh_hops + 1.
 *
 * Afterwards the first nexthop is checked against the top-level oif,
 * gateway and (with CONFIG_IP_ROUTE_CLASSID) flow given in @cfg.
 *
 * Returns 0 on success, -EINVAL on malformed or contradictory input, or
 * the error returned by a helper.
 */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg,
		       struct netlink_ext_ack *extack)
{
	struct net *net = fi->fib_net;
	struct fib_config fib_cfg;
	struct fib_nh *nh;
	int ret;

	change_nexthops(fi) {
		int attrlen;

		memset(&fib_cfg, 0, sizeof(fib_cfg));

		if (!rtnh_ok(rtnh, remaining)) {
			NL_SET_ERR_MSG(extack,
				       "Invalid nexthop configuration - extra data after nexthop");
			return -EINVAL;
		}

		if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
			NL_SET_ERR_MSG(extack,
				       "Invalid flags for nexthop - can not contain DEAD or LINKDOWN");
			return -EINVAL;
		}

		/* Low byte comes from the record, the rest from the route. */
		fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		fib_cfg.fc_oif = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nlav = nla_find(attrs, attrlen, RTA_VIA);
			if (nla && nlav) {
				NL_SET_ERR_MSG(extack,
					       "Nexthop configuration can not contain both GATEWAY and VIA");
				return -EINVAL;
			}
			if (nla) {
				ret = fib_gw_from_attr(&fib_cfg.fc_gw4, nla,
						       extack);
				if (ret)
					goto errout;

				/* A zero address leaves the gateway family unset. */
				if (fib_cfg.fc_gw4)
					fib_cfg.fc_gw_family = AF_INET;
			} else if (nlav) {
				ret = fib_gw_from_via(&fib_cfg, nlav, extack);
				if (ret)
					goto errout;
			}

			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla) {
				if (nla_len(nla) < sizeof(u32)) {
					NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
					return -EINVAL;
				}
				fib_cfg.fc_flow = nla_get_u32(nla);
			}

			fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			/* RTA_ENCAP_TYPE length checked in
			 * lwtunnel_valid_encap_type_attr
			 */
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				fib_cfg.fc_encap_type = nla_get_u16(nla);
		}

		ret = fib_nh_init(net, nexthop_nh, &fib_cfg,
				  rtnh->rtnh_hops + 1, extack);
		if (ret)
			goto errout;

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	/* Every consistency failure below reports -EINVAL. */
	ret = -EINVAL;
	nh = fib_info_nh(fi, 0);
	if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) {
		NL_SET_ERR_MSG(extack,
			       "Nexthop device index does not match RTA_OIF");
		goto errout;
	}
	if (cfg->fc_gw_family) {
		if (cfg->fc_gw_family != nh->fib_nh_gw_family ||
		    (cfg->fc_gw_family == AF_INET &&
		     nh->fib_nh_gw4 != cfg->fc_gw4) ||
		    (cfg->fc_gw_family == AF_INET6 &&
		     ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA");
			goto errout;
		}
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) {
		NL_SET_ERR_MSG(extack,
			       "Nexthop class id does not match RTA_FLOW");
		goto errout;
	}
#endif
	ret = 0;
errout:
	return ret;
}

/* only called when fib_nh is integrated into fib_info */
/*
 * Recompute fib_nh_upper_bound for every nexthop of a multipath route
 * (nothing is done for fewer than two paths).
 *
 * The first pass sums the weights of the usable nexthops, i.e. those
 * that are not RTNH_F_DEAD and not RTNH_F_LINKDOWN on a device for
 * which ip_ignore_linkdown() is true.  The second pass gives unusable
 * nexthops an upper bound of -1 and every usable one the running
 * weight sum scaled to 31 bits: round((w << 31) / total) - 1.
 */
static void fib_rebalance(struct fib_info *fi)
{
	int total;
	int w;

	if (fib_info_num_path(fi) < 2)
		return;

	total = 0;
	for_nexthops(fi) {
		if (nh->fib_nh_flags & RTNH_F_DEAD)
			continue;

		if (ip_ignore_linkdown(nh->fib_nh_dev) &&
		    nh->fib_nh_flags & RTNH_F_LINKDOWN)
			continue;

		total += nh->fib_nh_weight;
	} endfor_nexthops(fi);

	w = 0;
	change_nexthops(fi) {
		int upper_bound;

		if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) {
			upper_bound = -1;
		} else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) &&
			   nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
			upper_bound = -1;
		} else {
			w += nexthop_nh->fib_nh_weight;
			upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
							    total) - 1;
		}

		atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound);
	} endfor_nexthops(fi);
}
#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Variant used when multipath support is compiled out: always rejects
 * the request with -EINVAL and an extack message.
 */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg,
		       struct netlink_ext_ack *extack)
{
	NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel");

	return -EINVAL;
}

/* No-op without multipath support. */
#define fib_rebalance(fi) do { } while (0)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Compare the encapsulation requested by @encap_type/@encap with the
 * one attached to @nh.  A temporary lwtunnel state is built for the
 * comparison and freed afterwards.
 *
 * Returns the result of lwtunnel_cmp_encap() (0 meaning equal).  Also
 * returns 0 when @encap_type is LWTUNNEL_ENCAP_NONE or when the
 * temporary state could not be built.
 */
static int fib_encap_match(struct net *net, u16 encap_type,
			   struct nlattr *encap,
			   const struct fib_nh *nh,
			   const struct fib_config *cfg,
			   struct netlink_ext_ack *extack)
{
	struct lwtunnel_state *lwtstate;
	int ret, result = 0;

	if (encap_type == LWTUNNEL_ENCAP_NONE)
		return 0;

	ret = lwtunnel_build_state(net, encap_type, encap, AF_INET,
				   cfg, &lwtstate, extack);
	if (!ret) {
		result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws);
		lwtstate_free(lwtstate);
	}

	return result;
}

/*
 * Check whether the nexthop specification in @cfg matches @fi.
 *
 * Returns 0 on a match, 1 on a mismatch, and a negative errno when the
 * multipath attributes in @cfg are malformed.
 *
 * Order of checks: priority (only if given); nexthop object id (only
 * if given); a fib_info using a nexthop object never matches a request
 * that names an oif, gateway or multipath list; a request with oif or
 * gateway is compared against the first nexthop only; otherwise, with
 * multipath support, each rtnexthop record in cfg->fc_mp is compared
 * with the nexthop at the same position.
 */
int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
		 struct netlink_ext_ack *extack)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *rtnh;
	int remaining;
#endif

	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	if (cfg->fc_nh_id) {
		if (fi->nh && cfg->fc_nh_id == fi->nh->id)
			return 0;
		return 1;
	}

	if (fi->nh) {
		if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp)
			return 1;
		return 0;
	}

	if (cfg->fc_oif || cfg->fc_gw_family) {
		struct fib_nh *nh;

		nh = fib_info_nh(fi, 0);
		if (cfg->fc_encap) {
			if (fib_encap_match(net, cfg->fc_encap_type,
					    cfg->fc_encap, nh, cfg, extack))
				return 1;
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (cfg->fc_flow &&
		    cfg->fc_flow != nh->nh_tclassid)
			return 1;
#endif
		if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) ||
		    (cfg->fc_gw_family &&
		     cfg->fc_gw_family != nh->fib_nh_gw_family))
			return 1;

		if (cfg->fc_gw_family == AF_INET &&
		    cfg->fc_gw4 != nh->fib_nh_gw4)
			return 1;

		if (cfg->fc_gw_family == AF_INET6 &&
		    ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6))
			return 1;

		return 0;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (!cfg->fc_mp)
		return 0;

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;

	for_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		/* An ifindex of 0 in the record matches any interface. */
		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif)
			return 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
			int err;

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nlav = nla_find(attrs, attrlen, RTA_VIA);
			if (nla && nlav) {
				NL_SET_ERR_MSG(extack,
					       "Nexthop configuration can not contain both GATEWAY and VIA");
				return -EINVAL;
			}

			if (nla) {
				__be32 gw;

				err = fib_gw_from_attr(&gw, nla, extack);
				if (err)
					return err;

				if (nh->fib_nh_gw_family != AF_INET ||
				    gw != nh->fib_nh_gw4)
					return 1;
			} else if (nlav) {
				struct fib_config cfg2;

				err = fib_gw_from_via(&cfg2, nlav, extack);
				if (err)
					return err;

				/*
				 * Only AF_INET and AF_INET6 nexthop gateways
				 * are compared; any other family falls
				 * through without a mismatch.
				 */
				switch (nh->fib_nh_gw_family) {
				case AF_INET:
					if (cfg2.fc_gw_family != AF_INET ||
					    cfg2.fc_gw4 != nh->fib_nh_gw4)
						return 1;
					break;
				case AF_INET6:
					if (cfg2.fc_gw_family != AF_INET6 ||
					    ipv6_addr_cmp(&cfg2.fc_gw6,
							  &nh->fib_nh_gw6))
						return 1;
					break;
				}
			}

#ifdef CONFIG_IP_ROUTE_CLASSID
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla) {
				if (nla_len(nla) < sizeof(u32)) {
					NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
					return -EINVAL;
				}
				if (nla_get_u32(nla) != nh->nh_tclassid)
					return 1;
			}
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);
#endif
	return 0;
}

/*
 * Check whether every metric listed in cfg->fc_mx has the same value in
 * @fi.  Returns true when no metrics were given or all of them match.
 *
 * Attributes of type 0 are skipped; a type above RTAX_MAX or a
 * non-u32 payload (other than RTAX_CC_ALGO) yields false.
 * RTAX_CC_ALGO is given as a name and converted with
 * tcp_ca_get_key_by_name() before comparing.  For RTAX_FEATURES the
 * DST_FEATURE_ECN_CA bit of the stored value is ignored.
 */
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
{
	struct nlattr *nla;
	int remaining;

	if (!cfg->fc_mx)
		return true;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 fi_val, val;

		if (!type)
			continue;
		if (type > RTAX_MAX)
			return false;

		/* Clamp the index used below under speculation. */
		type = array_index_nospec(type, RTAX_MAX + 1);
		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];
			bool ecn_ca = false;

			nla_strscpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
		} else {
			if (nla_len(nla) != sizeof(u32))
				return false;
			val = nla_get_u32(nla);
		}

		/* Metric types are 1-based, the array is 0-based. */
		fi_val = fi->fib_metrics->metrics[type - 1];
		if (type == RTAX_FEATURES)
			fi_val &= ~DST_FEATURE_ECN_CA;

		if (fi_val != val)
			return false;
	}

	return true;
}

/*
 * Validate an IPv6 gateway on the IPv4 nexthop @nh by initialising a
 * temporary fib6_nh through ipv6_stub->fib6_nh_init() with the
 * nexthop's flags (plus RTF_GATEWAY), oif and gateway in table @table.
 *
 * On success the device chosen for the temporary nexthop is copied into
 * @nh with its own tracked reference, the oif is set from that device,
 * the scope is set to RT_SCOPE_LINK, and the temporary fib6_nh is
 * released.  Returns 0 or the error from fib6_nh_init().
 */
static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh,
			      u32 table, struct netlink_ext_ack *extack)
{
	struct fib6_config cfg = {
		.fc_table = table,
		.fc_flags = nh->fib_nh_flags | RTF_GATEWAY,
		.fc_ifindex = nh->fib_nh_oif,
		.fc_gateway = nh->fib_nh_gw6,
	};
	struct fib6_nh fib6_nh = {};
	int err;

	err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack);
	if (!err) {
		nh->fib_nh_dev = fib6_nh.fib_nh_dev;
		/* Take our own reference before the temporary one is dropped. */
		netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker,
			    GFP_KERNEL);
		nh->fib_nh_oif = nh->fib_nh_dev->ifindex;
		nh->fib_nh_scope = RT_SCOPE_LINK;

		ipv6_stub->fib6_nh_release(&fib6_nh);
	}

	return err;
}

/*
 *	Picture
 *	-------
 *
 *	Semantics of nexthop is very messy by historical reasons.
 *	We have to take into account, that:
 *	a) gateway can be actually local interface address,
 *	   so that gatewayed route is direct.
 *	b) gateway must be on-link address, possibly
 *	   described not by an ifaddr, but also by a direct route.
* c) If both gateway and interface are specified, they should not * contradict. * d) If we use tunnel routes, the gateway might not be on-link. * * Attempting to reconcile all of these (alas, self-contradictory) conditions * results in pretty ugly and hairy code with obscure logic. * * I chose to generalize it instead, so that the size * of the code practically does not increase, but it becomes * much more general. * Every prefix is assigned a "scope" value: "host" is a local address, * "link" is a direct route, * [ ... "site" ... "interior" ... ] * and "universe" is a true gateway route with global meaning. * * Every prefix refers to a set of "nexthop"s (gw, oif), * where gw must have narrower scope. This recursion stops * when gw has LOCAL scope or if "nexthop" is declared ONLINK, * which means that gw is forced to be on link. * * The code is still hairy, but now it is apparently logically * consistent and very flexible. E.g. as a by-product it allows * independent exterior and interior routing processes to coexist * in peace. * * Normally it looks as follows. 
 *
 * {universe prefix}  -> (gw, oif) [scope link]
 *		  |
 *		  |-> {link prefix} -> (gw, oif) [scope local]
 *					|
 *					|-> {local prefix} (terminal node)
 */
/*
 * Validate the IPv4 gateway of @nh and fill in its device, oif and scope.
 *
 * With RTNH_F_ONLINK the device is taken from @nh's oif and the gateway must
 * be a unicast address for that device.  Otherwise the gateway is looked up
 * (first in @table when one other than RT_TABLE_MAIN is given, then with a
 * full lookup) and the device, oif and scope come from the lookup result.
 * RTNH_F_LINKDOWN is set on @nh when the device has no carrier.
 *
 * Returns 0 on success or a negative errno; a message is left in @extack
 * for most failures.
 */
static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
			      u8 scope, struct netlink_ext_ack *extack)
{
	struct net_device *dev;
	struct fib_result res;
	int err = 0;

	if (nh->fib_nh_flags & RTNH_F_ONLINK) {
		unsigned int addr_type;

		if (scope >= RT_SCOPE_LINK) {
			NL_SET_ERR_MSG(extack, "Nexthop has invalid scope");
			return -EINVAL;
		}
		dev = __dev_get_by_index(net, nh->fib_nh_oif);
		if (!dev) {
			NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
			return -ENODEV;
		}
		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			return -ENETDOWN;
		}
		addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4);
		if (addr_type != RTN_UNICAST) {
			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
			return -EINVAL;
		}
		if (!netif_carrier_ok(dev))
			nh->fib_nh_flags |= RTNH_F_LINKDOWN;
		nh->fib_nh_dev = dev;
		netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
		nh->fib_nh_scope = RT_SCOPE_LINK;
		return 0;
	}
	rcu_read_lock();
	{
		struct fib_table *tbl = NULL;
		struct flowi4 fl4 = {
			.daddr = nh->fib_nh_gw4,
			.flowi4_scope = scope + 1,
			.flowi4_oif = nh->fib_nh_oif,
			.flowi4_iif = LOOPBACK_IFINDEX,
		};

		/* It is not necessary, but requires a bit of thinking */
		if (fl4.flowi4_scope < RT_SCOPE_LINK)
			fl4.flowi4_scope = RT_SCOPE_LINK;

		if (table && table != RT_TABLE_MAIN)
			tbl = fib_get_table(net, table);

		if (tbl)
			err = fib_table_lookup(tbl, &fl4, &res,
					       FIB_LOOKUP_IGNORE_LINKSTATE |
					       FIB_LOOKUP_NOREF);

		/* on error or if no table given do full lookup. This
		 * is needed for example when nexthops are in the local
		 * table rather than the given table
		 */
		if (!tbl || err) {
			err = fib_lookup(net, &fl4, &res,
					 FIB_LOOKUP_IGNORE_LINKSTATE);
		}

		if (err) {
			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
			goto out;
		}
	}

	/* Default error for the checks below. */
	err = -EINVAL;
	if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
		NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
		goto out;
	}
	nh->fib_nh_scope = res.scope;
	nh->fib_nh_oif = FIB_RES_OIF(res);
	nh->fib_nh_dev = dev = FIB_RES_DEV(res);
	if (!dev) {
		NL_SET_ERR_MSG(extack,
			       "No egress device for nexthop gateway");
		goto out;
	}
	netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
	if (!netif_carrier_ok(dev))
		nh->fib_nh_flags |= RTNH_F_LINKDOWN;
	err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
out:
	rcu_read_unlock();
	return err;
}

/*
 * Validate a nexthop that has no gateway and bind it to the device named by
 * its oif.
 *
 * Rejects RTNH_F_PERVASIVE and RTNH_F_ONLINK with -EINVAL, returns -ENODEV
 * when inetdev_by_index() finds nothing for the oif and -ENETDOWN when the
 * device is not IFF_UP.  On success the device is recorded with a tracked
 * reference, the scope is set to RT_SCOPE_HOST and RTNH_F_LINKDOWN is set
 * when the device has no carrier.
 */
static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
			      struct netlink_ext_ack *extack)
{
	struct in_device *in_dev;
	int err;

	if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
		NL_SET_ERR_MSG(extack,
			       "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
		return -EINVAL;
	}

	rcu_read_lock();

	err = -ENODEV;
	in_dev = inetdev_by_index(net, nh->fib_nh_oif);
	if (!in_dev)
		goto out;
	err = -ENETDOWN;
	if (!(in_dev->dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
		goto out;
	}

	nh->fib_nh_dev = in_dev->dev;
	netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
	nh->fib_nh_scope = RT_SCOPE_HOST;
	if (!netif_carrier_ok(nh->fib_nh_dev))
		nh->fib_nh_flags |= RTNH_F_LINKDOWN;
	err = 0;
out:
	rcu_read_unlock();
	return err;
}

/*
 * Validate @nh, dispatching on its gateway family: AF_INET, AF_INET6, or no
 * gateway.  Returns the result of the selected checker.
 */
int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
		 struct netlink_ext_ack *extack)
{
	int err;

	if (nh->fib_nh_gw_family == AF_INET)
		err = fib_check_nh_v4_gw(net, nh, table, scope, extack);
	else if (nh->fib_nh_gw_family == AF_INET6)
		err = fib_check_nh_v6_gw(net, nh, table, extack);
	else
		err = fib_check_nh_nongw(net, nh, extack);

	return err;
}

/*
 * Select a source address for @nhc and, for AF_INET nexthops, cache it.
 *
 * For a non-AF_INET nexthop the address is selected from the device alone
 * and nothing is cached.  For AF_INET the selected address is stored in
 * nh_saddr together with the current dev_addr_genid of @net.
 * Returns the selected address.
 */
__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
				 unsigned char scope)
{
	struct fib_nh *nh;
	__be32 saddr;

	if (nhc->nhc_family != AF_INET)
		return inet_select_addr(nhc->nhc_dev, 0, scope);

	nh = container_of(nhc, struct fib_nh, nh_common);
	saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope);

	WRITE_ONCE(nh->nh_saddr, saddr);
	WRITE_ONCE(nh->nh_saddr_genid, atomic_read(&net->ipv4.dev_addr_genid));

	return saddr;
}

/*
 * Return the preferred source address for the lookup result @res: the
 * route's fib_prefsrc when set, else the cached nexthop address when its
 * generation id is current, else a freshly selected (and cached) one.
 */
__be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
{
	struct fib_nh_common *nhc = res->nhc;

	if (res->fi->fib_prefsrc)
		return res->fi->fib_prefsrc;

	if (nhc->nhc_family == AF_INET) {
		struct fib_nh *nh;

		nh = container_of(nhc, struct fib_nh, nh_common);
		if (READ_ONCE(nh->nh_saddr_genid) ==
		    atomic_read(&net->ipv4.dev_addr_genid))
			return READ_ONCE(nh->nh_saddr);
	}

	return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope);
}

/*
 * Check that @fib_prefsrc is usable as a preferred source for @cfg.
 *
 * The address is accepted without a lookup when @cfg is an RTN_LOCAL route
 * whose destination equals it.  Otherwise it must be of type RTN_LOCAL in
 * the route's table (RT_TABLE_LOCAL is used in place of RT_TABLE_MAIN) or,
 * failing that, in RT_TABLE_LOCAL.
 */
static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
{
	if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
	    fib_prefsrc != cfg->fc_dst) {
		u32 tb_id = cfg->fc_table;
		int rc;

		if (tb_id == RT_TABLE_MAIN)
			tb_id = RT_TABLE_LOCAL;

		rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
					  fib_prefsrc, tb_id);

		if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
			rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
						  fib_prefsrc, RT_TABLE_LOCAL);
		}

		if (rc != RTN_LOCAL)
			return false;
	}
	return true;
}

/*
 * Build (or find an existing, equivalent) fib_info for @cfg.
 *
 * Returns the fib_info with its fib_treeref taken, or an ERR_PTR() on
 * failure with the reason reported through @extack where a message is set.
 * Asserts that RTNL is held.
 */
struct fib_info *fib_create_info(struct fib_config *cfg,
				 struct netlink_ext_ack *extack)
{
	int err;
	struct fib_info *fi = NULL;
	struct nexthop *nh = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	ASSERT_RTNL();

	if (cfg->fc_type > RTN_MAX)
		goto err_inval;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope) {
		NL_SET_ERR_MSG(extack, "Invalid scope");
		goto err_inval;
	}

	if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
		NL_SET_ERR_MSG(extack,
			       "Invalid rtm_flags - can not contain DEAD or LINKDOWN");
		goto err_inval;
	}

	if (cfg->fc_nh_id) {
		/* Without metrics an existing fib_info may be reused as is. */
		if (!cfg->fc_mx) {
			fi = fib_find_info_nh(net, cfg);
			if (fi) {
				refcount_inc(&fi->fib_treeref);
				return fi;
			}
		}

		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
		if (!nh) {
			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
			goto err_inval;
		}
		/* Nexthop object in use: no embedded fib_nh entries. */
		nhs = 0;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	fib_info_hash_grow(net);

	fi = kzalloc_flex(*fi, fib_nh, nhs);
	if (!fi) {
		err = -ENOBUFS;
		goto failure;
	}

	fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack);
	if (IS_ERR(fi->fib_metrics)) {
		/* Freed directly here instead of going through "failure". */
		err = PTR_ERR(fi->fib_metrics);
		kfree(fi);
		return ERR_PTR(err);
	}

	fi->fib_net = net;
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_scope = cfg->fc_scope;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;
	fi->fib_type = cfg->fc_type;
	fi->fib_tb_id = cfg->fc_table;

	fi->fib_nhs = nhs;
	if (nh) {
		if (!nexthop_get(nh)) {
			NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
			err = -EINVAL;
		} else {
			err = 0;
			fi->nh = nh;
		}
	} else {
		change_nexthops(fi) {
			nexthop_nh->nh_parent = fi;
		} endfor_nexthops(fi)

		if (cfg->fc_mp)
			err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg,
					  extack);
		else
			err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
	}

	if (err != 0)
		goto failure;

	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) {
			NL_SET_ERR_MSG(extack,
				       "Gateway, device and multipath can not be specified for this route type");
			goto err_inval;
		}
		/* Such route types skip the nexthop checks below. */
		goto link_it;
	} else {
		switch (cfg->fc_type) {
		case RTN_UNICAST:
		case RTN_LOCAL:
		case RTN_BROADCAST:
		case RTN_ANYCAST:
		case RTN_MULTICAST:
			break;
		default:
			NL_SET_ERR_MSG(extack, "Invalid route type");
			goto err_inval;
		}
	}

	if (cfg->fc_scope > RT_SCOPE_HOST) {
		NL_SET_ERR_MSG(extack, "Invalid scope");
		goto err_inval;
	}

	if (fi->nh) {
		err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack);
		if (err)
			goto failure;
	} else if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1) {
			NL_SET_ERR_MSG(extack,
				       "Route with host scope can not have multiple nexthops");
			goto err_inval;
		}
		if (nh->fib_nh_gw_family) {
			NL_SET_ERR_MSG(extack,
				       "Route with host scope can not have a gateway");
			goto err_inval;
		}
		nh->fib_nh_scope = RT_SCOPE_NOWHERE;
		nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif);
		err = -ENODEV;
		if (!nh->fib_nh_dev)
			goto failure;
		netdev_tracker_alloc(nh->fib_nh_dev, &nh->fib_nh_dev_tracker,
				     GFP_KERNEL);
	} else {
		int linkdown = 0;

		change_nexthops(fi) {
			err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh,
					   cfg->fc_table, cfg->fc_scope,
					   extack);
			if (err != 0)
				goto failure;
			if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN)
				linkdown++;
		} endfor_nexthops(fi)
		/* Every nexthop is link-down: mark the whole route so. */
		if (linkdown == fi->fib_nhs)
			fi->fib_flags |= RTNH_F_LINKDOWN;
	}

	if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) {
		NL_SET_ERR_MSG(extack, "Invalid prefsrc address");
		goto err_inval;
	}

	if (!fi->nh) {
		change_nexthops(fi) {
			fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common,
						  fi->fib_scope);
			if (nexthop_nh->fib_nh_gw_family == AF_INET6)
				fi->fib_nh_is_v6 = true;
		} endfor_nexthops(fi)

		fib_rebalance(fi);
	}

link_it:
	/* Prefer an already registered, equivalent fib_info. */
	ofi = fib_find_info(fi);
	if (ofi) {
		/* fib_table_lookup() should not see @fi yet. */
		fi->fib_dead = 1;
		free_fib_info(fi);
		refcount_inc(&ofi->fib_treeref);
		return ofi;
	}

	refcount_set(&fi->fib_treeref, 1);
	refcount_set(&fi->fib_clntref, 1);

	net->ipv4.fib_info_cnt++;
	hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));

	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = fib_info_laddrhash_bucket(net, fi->fib_prefsrc);
		hlist_add_head(&fi->fib_lhash, head);
	}
	if (fi->nh) {
		list_add(&fi->nh_list, &nh->fi_list);
	} else {
		change_nexthops(fi) {
			struct hlist_head *head;

			if (!nexthop_nh->fib_nh_dev)
				continue;
			head = fib_nh_head(nexthop_nh->fib_nh_dev);
			hlist_add_head_rcu(&nexthop_nh->nh_hash, head);
		} endfor_nexthops(fi)
	}
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		/* fib_table_lookup() should not see @fi yet. */
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}

/*
 * Append the netlink attributes describing nexthop @nhc to @skb and OR its
 * state into *@flags.
 *
 * A link-down nexthop is additionally reported as RTNH_F_DEAD when the
 * per-family ignore-linkdown check is true for its device.  RTA_OIF is
 * omitted when @skip_oif is set or the nexthop has no device.
 * Returns 0 on success or -EMSGSIZE when an attribute does not fit.
 */
int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
		     u8 rt_family, unsigned char *flags, bool skip_oif)
{
	if (nhc->nhc_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (nhc->nhc_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		switch (nhc->nhc_family) {
		case AF_INET:
			if (ip_ignore_linkdown(nhc->nhc_dev))
				*flags |= RTNH_F_DEAD;
			break;
		case AF_INET6:
			if (ip6_ignore_linkdown(nhc->nhc_dev))
				*flags |= RTNH_F_DEAD;
			break;
		}
		rcu_read_unlock();
	}

	switch (nhc->nhc_gw_family) {
	case AF_INET:
		if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4))
			goto nla_put_failure;
		break;
	case AF_INET6:
		/* if gateway family does not match nexthop family
		 * gateway is encoded as RTA_VIA
		 */
		if (rt_family != nhc->nhc_gw_family) {
			int alen = sizeof(struct in6_addr);
			struct nlattr *nla;
			struct rtvia *via;

			/* alen + 2: the address plus the rtvia family field. */
			nla = nla_reserve(skb, RTA_VIA, alen + 2);
			if (!nla)
				goto nla_put_failure;

			via = nla_data(nla);
			via->rtvia_family = AF_INET6;
			memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen);
		} else if (nla_put_in6_addr(skb, RTA_GATEWAY,
					    &nhc->nhc_gw.ipv6) < 0) {
			goto nla_put_failure;
		}
		break;
	}

	*flags |= (nhc->nhc_flags &
		   (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP));

	if (!skip_oif && nhc->nhc_dev &&
	    nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex))
		goto nla_put_failure;

	if (lwtunnel_fill_encap(skb, nhc->nhc_lwtstate,
				RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
EXPORT_SYMBOL_GPL(fib_nexthop_info);

#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6)
/*
 * Append one rtnexthop entry (header plus attributes) for @nhc to @skb.
 *
 * rtnh_hops is @nh_weight - 1; RTA_FLOW is added only when @nh_tclassid is
 * non-zero.  Returns 0 on success or -EMSGSIZE when @skb has no room.
 */
int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
		    int nh_weight, u8 rt_family, u32 nh_tclassid)
{
	const struct net_device *dev = nhc->nhc_dev;
	struct rtnexthop *rtnh;
	unsigned char flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid))
		goto nla_put_failure;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
EXPORT_SYMBOL_GPL(fib_add_nexthop);
#endif

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
 * Emit the RTA_MULTIPATH nest for @fi: filled from the nexthop object when
 * @fi uses one, otherwise with one entry per embedded nexthop.
 * Returns 0 on success or -EMSGSIZE.
 */
static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
{
	struct nlattr *mp;

	mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
	if (!mp)
		goto nla_put_failure;

	if (unlikely(fi->nh)) {
		if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0)
			goto nla_put_failure;
		goto mp_end;
	}

	for_nexthops(fi) {
		u32 nh_tclassid = 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
		nh_tclassid = nh->nh_tclassid;
#endif
		if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight,
				    AF_INET, nh_tclassid) < 0)
			goto nla_put_failure;
	} endfor_nexthops(fi);

mp_end:
	nla_nest_end(skb, mp);

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
#else
/* Multipath support compiled out: nothing to emit. */
static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
{
	return 0;
}
#endif

/*
 * Fill @skb with a route netlink message describing @fri.
 *
 * Returns 0 on success.  Returns -EMSGSIZE when the message does not fit;
 * if the header had already been added, the partial message is cancelled.
 */
int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
		  const struct fib_rt_info *fri, unsigned int flags)
{
	unsigned int nhs = fib_info_num_path(fri->fi);
	struct fib_info *fi = fri->fi;
	u32 tb_id = fri->tb_id;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = fri->dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = inet_dscp_to_dsfield(fri->dscp);
	/* Table ids of 256 and above are only carried in RTA_TABLE. */
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, tb_id))
		goto nla_put_failure;
	rtm->rtm_type = fri->type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = fi->fib_scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len &&
	    nla_put_in_addr(skb, RTA_DST, fri->dst))
		goto nla_put_failure;
	if (fi->fib_priority &&
	    nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
		goto nla_put_failure;
	if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc &&
	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
		goto nla_put_failure;

	if (fi->nh) {
		if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id))
			goto nla_put_failure;
		if (nexthop_is_blackhole(fi->nh))
			rtm->rtm_type = RTN_BLACKHOLE;
		/* Outside compat mode the nexthop details are not dumped. */
		if (!READ_ONCE(fi->fib_net->ipv4.sysctl_nexthop_compat_mode))
			goto offload;
	}

	if (nhs == 1) {
		const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
		unsigned char flags = 0;

		if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags = flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (nhc->nhc_family == AF_INET) {
			struct fib_nh *nh;

			nh = container_of(nhc, struct fib_nh, nh_common);
			if (nh->nh_tclassid &&
			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
				goto nla_put_failure;
		}
#endif
	} else {
		if (fib_add_multipath(skb, fi) < 0)
			goto nla_put_failure;
	}

offload:
	if (fri->offload)
		rtm->rtm_flags |= RTM_F_OFFLOAD;
	if (fri->trap)
		rtm->rtm_flags |= RTM_F_TRAP;
	if (fri->offload_failed)
		rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
 * Update FIB if:
 * - local address disappeared -> we must delete all the entries
 *   referring to it.
 * - device went down -> we must shutdown all nexthops going via it.
 *
 * fib_sync_down_addr() handles the first case: every fib_info in the same
 * netns and table whose preferred source is @local is flagged RTNH_F_DEAD
 * with pfsrc_removed set.  Returns the number of entries flagged, or 0 when
 * @local is zero.
 */
int fib_sync_down_addr(struct net_device *dev, __be32 local)
{
	int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	struct net *net = dev_net(dev);
	struct hlist_head *head;
	struct fib_info *fi;
	int ret = 0;

	if (!local)
		return 0;

	head = fib_info_laddrhash_bucket(net, local);
	hlist_for_each_entry(fi, head, fib_lhash) {
		if (!net_eq(fi->fib_net, net) ||
		    fi->fib_tb_id != tb_id)
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			fi->pfsrc_removed = true;
			ret++;
		}
	}
	return ret;
}

/*
 * Forward a nexthop add/delete event to the fib4 notifier chain when the
 * nexthop's state warrants it.
 *
 * NH_ADD is forwarded only for a nexthop that is neither dead nor link-down
 * with ignore-linkdown in effect; NH_DEL only for one that is.  Returns the
 * notifier result, or NOTIFY_DONE when nothing was forwarded.
 */
static int call_fib_nh_notifiers(struct fib_nh *nh,
				 enum fib_event_type event_type)
{
	bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev);
	struct fib_nh_notifier_info info = {
		.fib_nh = nh,
	};

	switch (event_type) {
	case FIB_EVENT_NH_ADD:
		if (nh->fib_nh_flags & RTNH_F_DEAD)
			break;
		if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN)
			break;
		return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type,
					   &info.info);
	case FIB_EVENT_NH_DEL:
		if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) ||
		    (nh->fib_nh_flags & RTNH_F_DEAD))
			return call_fib4_notifiers(dev_net(nh->fib_nh_dev),
						   event_type, &info.info);
		break;
	default:
		break;
	}

	return NOTIFY_DONE;
}

/* Update the PMTU of exceptions when:
 * - the new MTU of the first hop becomes smaller than the PMTU
 * - the old MTU was the same as the PMTU, and it limited discovery of
 *   larger MTUs on the path.
With that limit raised, we can now
 *   discover larger MTUs
 * A special case is locked exceptions, for which the PMTU is smaller
 * than the minimal accepted PMTU:
 * - if the new MTU is greater than the PMTU, don't make any change
 * - otherwise, unlock and set PMTU
 */
void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
{
	struct fnhe_hash_bucket *bucket;
	int i;

	bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1);
	if (!bucket)
		return;

	/* Visit every exception in every hash chain. */
	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference_protected(bucket[i].chain, 1);
		     fnhe;
		     fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) {
			if (fnhe->fnhe_mtu_locked) {
				if (new <= fnhe->fnhe_pmtu) {
					fnhe->fnhe_pmtu = new;
					fnhe->fnhe_mtu_locked = false;
				}
			} else if (new < fnhe->fnhe_pmtu ||
				   orig == fnhe->fnhe_pmtu) {
				fnhe->fnhe_pmtu = new;
			}
		}
	}
}

/*
 * Propagate an MTU change on @dev (previous value @orig_mtu) to the
 * exceptions of every nexthop hashed under that device.
 */
void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
{
	struct hlist_head *head = fib_nh_head(dev);
	struct fib_nh *nh;

	hlist_for_each_entry(nh, head, nh_hash) {
		DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
		fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
	}
}

/* Event              force Flags           Description
 * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
 * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
 * NETDEV_DOWN        1     LINKDOWN|DEAD   Last address removed
 * NETDEV_UNREGISTER  1     LINKDOWN|DEAD   Device removed
 *
 * only used when fib_nh is built into fib_info
 *
 * Returns the number of fib_info entries whose nexthops are now all dead.
 */
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
	struct hlist_head *head = fib_nh_head(dev);
	struct fib_info *prev_fi = NULL;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_nh *nh;
	int ret = 0;

	/* -1 equals no nexthop scope, so with @force none is exempted. */
	if (force)
		scope = -1;

	hlist_for_each_entry(nh, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
		/* Skip a fib_info that was handled on the previous entry. */
		if (fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD)
				dead++;
			else if (nexthop_nh->fib_nh_dev == dev &&
				 nexthop_nh->fib_nh_scope != scope) {
				switch (event) {
				case NETDEV_DOWN:
				case NETDEV_UNREGISTER:
					nexthop_nh->fib_nh_flags |= RTNH_F_DEAD;
					fallthrough;
				case NETDEV_CHANGE:
					nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
					break;
				}
				call_fib_nh_notifiers(nexthop_nh,
						      FIB_EVENT_NH_DEL);
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			/* Unregister: treat the whole route as dead. */
			if (event == NETDEV_UNREGISTER &&
			    nexthop_nh->fib_nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			switch (event) {
			case NETDEV_DOWN:
			case NETDEV_UNREGISTER:
				fi->fib_flags |= RTNH_F_DEAD;
				fallthrough;
			case NETDEV_CHANGE:
				fi->fib_flags |= RTNH_F_LINKDOWN;
				break;
			}
			ret++;
		}

		fib_rebalance(fi);
	}

	return ret;
}

/*
 * Choose among the candidate default routes listed in res->fa_head and
 * store the selection in @res via fib_result_assign(); the chosen position
 * is remembered in the first candidate's fa_default.
 *
 * Must be invoked inside of an RCU protected region.
 */
static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
{
	struct fib_info *fi = NULL, *last_resort = NULL;
	struct hlist_head *fa_head = res->fa_head;
	struct fib_table *tb = res->table;
	u8 slen = 32 - res->prefixlen;
	int order = -1, last_idx = -1;
	struct fib_alias *fa, *fa1 = NULL;
	u32 last_prio = res->fi->fib_priority;
	dscp_t last_dscp = 0;

	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;
		struct fib_nh_common *nhc;

		if (fa->fa_slen != slen)
			continue;
		if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp))
			continue;
		if (fa->tb_id != tb->tb_id)
			continue;
		if (next_fi->fib_priority > last_prio &&
		    fa->fa_dscp == last_dscp) {
			if (last_dscp)
				continue;
			break;
		}
		if (next_fi->fib_flags & RTNH_F_DEAD)
			continue;
		last_dscp = fa->fa_dscp;
		last_prio = next_fi->fib_priority;

		if (next_fi->fib_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		/* Only gatewayed nexthops of link scope are candidates. */
		nhc = fib_info_nhc(next_fi, 0);
		if (!nhc->nhc_gw_family || nhc->nhc_scope != RT_SCOPE_LINK)
			continue;

		fib_alias_accessed(fa);

		if (!fi) {
			if (next_fi != res->fi)
				break;
			fa1 = fa;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, fa1->fa_default)) {
			fib_result_assign(res, fi);
			fa1->fa_default = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}

	if (order <= 0 || !fi) {
		if (fa1)
			fa1->fa_default = -1;
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
			      fa1->fa_default)) {
		fib_result_assign(res, fi);
		fa1->fa_default = order;
		goto out;
	}

	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	fa1->fa_default = last_idx;
out:
	return;
}

/*
 * Dead device goes up. We wake up dead nexthops.
 * It makes sense only on multipath routes.
 *
 * only used when fib_nh is built into fib_info
 *
 * Clears @nh_flags on the nexthops of @dev and returns the number of
 * fib_info entries left with at least one live nexthop; returns 0 without
 * doing anything when @dev is not IFF_UP.
 */
int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
{
	struct fib_info *prev_fi;
	struct hlist_head *head;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags & IFF_UP))
		return 0;

	/* Clearing DEAD also clears LINKDOWN when the link is reported up. */
	if (nh_flags & RTNH_F_DEAD) {
		unsigned int flags = netif_get_flags(dev);

		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
			nh_flags |= RTNH_F_LINKDOWN;
	}

	prev_fi = NULL;
	head = fib_nh_head(dev);
	ret = 0;

	hlist_for_each_entry(nh, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
		if (fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			/* None of the flags set: already alive. */
			if (!(nexthop_nh->fib_nh_flags & nh_flags)) {
				alive++;
				continue;
			}
			if (!nexthop_nh->fib_nh_dev ||
			    !(nexthop_nh->fib_nh_dev->flags & IFF_UP))
				continue;
			if (nexthop_nh->fib_nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			nexthop_nh->fib_nh_flags &= ~nh_flags;
			call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~nh_flags;
			ret++;
		}

		fib_rebalance(fi);
	}

	return ret;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
 * Report whether @nh looks usable according to its neighbour entry.
 *
 * Only link-scope nexthops are looked up; when no neighbour entry is found
 * (or the scope differs) the state defaults to NUD_REACHABLE, so the
 * nexthop counts as good.
 */
static bool fib_good_nh(const struct fib_nh *nh)
{
	int state = NUD_REACHABLE;

	if (nh->fib_nh_scope == RT_SCOPE_LINK) {
		struct neighbour *n;

		rcu_read_lock();

		if (likely(nh->fib_nh_gw_family == AF_INET))
			n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
						   (__force u32)nh->fib_nh_gw4);
		else if (nh->fib_nh_gw_family == AF_INET6)
			n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev,
							   &nh->fib_nh_gw6);
		else
			n = NULL;
		if (n)
			state = READ_ONCE(n->nud_state);

		rcu_read_unlock();
	}

	return !!(state & NUD_VALID);
}

/*
 * Pick the nexthop of res->fi for @hash and store it in @res.
 *
 * Each usable nexthop is scored: +2 when its cached source address equals
 * the flow's source address, +1 when @hash falls within its upper bound.
 * The first nexthop with the best possible score is taken immediately;
 * otherwise the highest-scoring one seen wins.
 */
void fib_select_multipath(struct fib_result *res, int hash,
			  const struct flowi4 *fl4)
{
	struct fib_info *fi = res->fi;
	struct net *net = fi->fib_net;
	bool use_neigh;
	int score = -1;
	__be32 saddr;

	if (unlikely(res->fi->nh)) {
		nexthop_path_fib_result(res, hash);
		return;
	}

	use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh);
	saddr = fl4 ? fl4->saddr : 0;

	change_nexthops(fi) {
		int nh_upper_bound, nh_score = 0;

		/* Nexthops without a carrier are assigned an upper bound of
		 * minus one when "ignore_routes_with_linkdown" is set.
		 */
		nh_upper_bound = atomic_read(&nexthop_nh->fib_nh_upper_bound);
		if (nh_upper_bound == -1 ||
		    (use_neigh && !fib_good_nh(nexthop_nh)))
			continue;

		if (saddr && nexthop_nh->nh_saddr == saddr)
			nh_score += 2;
		if (hash <= nh_upper_bound)
			nh_score++;
		if (score < nh_score) {
			res->nh_sel = nhsel;
			res->nhc = &nexthop_nh->nh_common;
			/* Best possible score reached: stop searching. */
			if (nh_score == 3 || (!saddr && nh_score == 1))
				return;
			score = nh_score;
		}

	} endfor_nexthops(fi);
}
#endif

/*
 * Finalise a lookup result: select a path (multipath or default route) when
 * the flow names no output interface, then fill in fl4->saddr when the flow
 * has no source address.
 */
void fib_select_path(struct net *net, struct fib_result *res,
		     struct flowi4 *fl4, const struct sk_buff *skb)
{
	if (fl4->flowi4_oif)
		goto check_saddr;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fib_info_num_path(res->fi) > 1) {
		int h = fib_multipath_hash(net, fl4, skb, NULL);

		fib_select_multipath(res, h, fl4);
	}
	else
#endif
	if (!res->prefixlen &&
	    res->table->tb_num_default > 1 &&
	    res->type == RTN_UNICAST)
		fib_select_default(fl4, res);

check_saddr:
	if (!fl4->saddr) {
		struct net_device *l3mdev;

		l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev);

		if (!l3mdev ||
		    l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev)
			fl4->saddr = fib_result_prefsrc(net, res);
		else
			fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK);
	}
}

/*
 * Per-netns init: allocate the fib_info hash with 4 hash bits and reset the
 * fib_info counter.  Returns 0 or -ENOMEM.
 */
int __net_init fib4_semantics_init(struct net *net)
{
	unsigned int hash_bits = 4;

	net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits);
	if (!net->ipv4.fib_info_hash)
		return -ENOMEM;

	net->ipv4.fib_info_hash_bits = hash_bits;
	net->ipv4.fib_info_cnt = 0;

	return 0;
}

/* Per-netns exit: free the fib_info hash. */
void __net_exit fib4_semantics_exit(struct net *net)
{
	fib_info_hash_free(net->ipv4.fib_info_hash);
}
|
| 2893 2898 2899 2902 3098 3101 3108 44 44 30 30 3703 3709 3709 3687 40 40 40 40 40 30 30 30 27 30 30 30 30 41 41 27 3709 27 27 3688 37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 
493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 | // SPDX-License-Identifier: 
GPL-2.0 /* * Devices PM QoS constraints management * * Copyright (C) 2011 Texas Instruments, Inc. * * This module exposes the interface to kernel space for specifying * per-device PM QoS dependencies. It provides infrastructure for registration * of: * * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. * Watchers register to keep track of the current QoS needs of the system. * Watchers can register a per-device notification callback using the * dev_pm_qos_*_notifier API. The notification chain data is stored in the * per-device constraint data struct. * * Note about the per-device constraint data struct allocation: * . The per-device constraints data struct ptr is stored into the device * dev_pm_info. * . To minimize the data usage by the per-device constraints, the data struct * is only allocated at the first call to dev_pm_qos_add_request. * . The data is later free'd when the device is removed from the system. * . A global mutex protects the constraints users from the data being * allocated and free'd. */ #include <linux/pm_qos.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/err.h> #include <trace/events/power.h> #include "power.h" static DEFINE_MUTEX(dev_pm_qos_mtx); static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx); /** * __dev_pm_qos_flags - Check PM QoS flags for a given device. * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. * * This routine must be called with dev->power.lock held. 
*/ enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { struct dev_pm_qos *qos = dev->power.qos; struct pm_qos_flags *pqf; s32 val; lockdep_assert_held(&dev->power.lock); if (IS_ERR_OR_NULL(qos)) return PM_QOS_FLAGS_UNDEFINED; pqf = &qos->flags; if (list_empty(&pqf->list)) return PM_QOS_FLAGS_UNDEFINED; val = pqf->effective_flags & mask; if (val) return (val == mask) ? PM_QOS_FLAGS_ALL : PM_QOS_FLAGS_SOME; return PM_QOS_FLAGS_NONE; } /** * dev_pm_qos_flags - Check PM QoS flags for a given device (locked). * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. */ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { unsigned long irqflags; enum pm_qos_flags_status ret; spin_lock_irqsave(&dev->power.lock, irqflags); ret = __dev_pm_qos_flags(dev, mask); spin_unlock_irqrestore(&dev->power.lock, irqflags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_flags); /** * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device. * @dev: Device to get the PM QoS constraint value for. * * This routine must be called with dev->power.lock held. */ s32 __dev_pm_qos_resume_latency(struct device *dev) { lockdep_assert_held(&dev->power.lock); return dev_pm_qos_raw_resume_latency(dev); } /** * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked). * @dev: Device to get the PM QoS constraint value for. * @type: QoS request type. */ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos *qos = dev->power.qos; unsigned long flags; s32 ret; spin_lock_irqsave(&dev->power.lock, flags); switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&qos->resume_latency); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? 
PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MIN); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MAX); break; default: WARN_ON(1); ret = 0; } spin_unlock_irqrestore(&dev->power.lock, flags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_read_value); /** * apply_constraint - Add/modify/remove device PM QoS request. * @req: Constraint request to apply * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * Internal function to update the constraints list using the PM QoS core * code and if needed call the per-device callbacks. */ static int apply_constraint(struct dev_pm_qos_request *req, enum pm_qos_req_action action, s32 value) { struct dev_pm_qos *qos = req->dev->power.qos; int ret; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0)) value = 0; ret = pm_qos_update_target(&qos->resume_latency, &req->data.pnode, action, value); break; case DEV_PM_QOS_LATENCY_TOLERANCE: ret = pm_qos_update_target(&qos->latency_tolerance, &req->data.pnode, action, value); if (ret) { value = pm_qos_read_value(&qos->latency_tolerance); req->dev->power.set_latency_tolerance(req->dev, value); } break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_apply(&req->data.freq, action, value); break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); break; default: ret = -EINVAL; } return ret; } /* * dev_pm_qos_constraints_allocate * @dev: device to allocate data for * * Called at the first call to add_request, for constraint data allocation * Must be called with the dev_pm_qos_mtx mutex held */ static int dev_pm_qos_constraints_allocate(struct device *dev) { struct dev_pm_qos *qos; struct pm_qos_constraints *c; struct blocking_notifier_head *n; qos = kzalloc_obj(*qos); if (!qos) return -ENOMEM; n = 
kzalloc_objs(*n, 3); if (!n) { kfree(qos); return -ENOMEM; } c = &qos->resume_latency; plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; BLOCKING_INIT_NOTIFIER_HEAD(n); c = &qos->latency_tolerance; plist_head_init(&c->list); c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; c->type = PM_QOS_MIN; freq_constraints_init(&qos->freq); INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); dev->power.qos = qos; spin_unlock_irq(&dev->power.lock); return 0; } static void __dev_pm_qos_hide_latency_limit(struct device *dev); static void __dev_pm_qos_hide_flags(struct device *dev); /** * dev_pm_qos_constraints_destroy * @dev: target device * * Called from the device PM subsystem on device removal under device_pm_lock(). */ void dev_pm_qos_constraints_destroy(struct device *dev) { struct dev_pm_qos *qos; struct dev_pm_qos_request *req, *tmp; struct pm_qos_constraints *c; struct pm_qos_flags *f; mutex_lock(&dev_pm_qos_sysfs_mtx); /* * If the device's PM QoS resume latency limit or PM QoS flags have been * exposed to user space, they have to be hidden at this point. */ pm_qos_sysfs_remove_resume_latency(dev); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); __dev_pm_qos_hide_flags(dev); qos = dev->power.qos; if (!qos) goto out; /* Flush the constraints lists for the device. 
*/ c = &qos->resume_latency; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { /* * Update constraints list and call the notification * callbacks if needed */ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->latency_tolerance; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.min_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.max_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } spin_lock_irq(&dev->power.lock); dev->power.qos = ERR_PTR(-ENODEV); spin_unlock_irq(&dev->power.lock); kfree(qos->resume_latency.notifiers); kfree(qos); out: mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } static bool dev_pm_qos_invalid_req_type(struct device *dev, enum dev_pm_qos_req_type type) { return type == DEV_PM_QOS_LATENCY_TOLERANCE && !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret = 0; if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), "%s() called for already added request\n", __func__)) return -EINVAL; if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); trace_dev_pm_qos_add_request(dev_name(dev), type, value); if (ret) return ret; req->dev = dev; req->type = type; if 
(req->type == DEV_PM_QOS_MIN_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MIN, value); else if (req->type == DEV_PM_QOS_MAX_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MAX, value); else ret = apply_constraint(req, PM_QOS_ADD_REQ, value); return ret; } /** * dev_pm_qos_add_request - inserts new qos request into the list * @dev: target device for the constraint * @req: pointer to a preallocated handle * @type: type of the request * @value: defines the qos request * * This function inserts a new entry in the device constraints list of * requested qos performance characteristics. It recomputes the aggregate * QoS expectations of parameters and initializes the dev_pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory * to allocate for data structures, -ENODEV if the device has just been removed * from the system. * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_add_request(dev, req, type, value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_request); /** * __dev_pm_qos_update_request - Modify an existing device PM QoS request. * @req : PM QoS request to modify. * @new_value: New value to request. 
*/ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { s32 curr_value; int ret = 0; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: curr_value = req->data.freq.pnode.prio; break; case DEV_PM_QOS_FLAGS: curr_value = req->data.flr.flags; break; default: return -EINVAL; } trace_dev_pm_qos_update_request(dev_name(req->dev), req->type, new_value); if (curr_value != new_value) ret = apply_constraint(req, PM_QOS_UPDATE_REQ, new_value); return ret; } /** * dev_pm_qos_update_request - modifies an existing qos request * @req : handle to list element holding a dev_pm_qos request to use * @new_value: defines the qos request * * Updates an existing dev PM qos request along with updating the * target value. * * Attempts are made to make this code callable on hot code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. 
*/ int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_update_request(req, new_value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_request); static int __dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; trace_dev_pm_qos_remove_request(dev_name(req->dev), req->type, PM_QOS_DEFAULT_VALUE); ret = apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); return ret; } /** * dev_pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element * * Will remove pm qos request from the list of constraints and * recompute the current target value. Call this on slow code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_remove_request(req); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request); /** * dev_pm_qos_add_notifier - sets notification entry for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block managed by caller. * @type: request type. * * Will register the notifier into a notification chain that gets called * upon changes to the target value for the device. 
* * If the device's constraints object doesn't exist when this routine is called, * it will be created (or error code will be returned if that fails). */ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); if (ret) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier); /** * dev_pm_qos_remove_notifier - deletes notification for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block to be removed. * @type: request type. * * Will remove the notifier from the notification chain that gets called * upon changes to the target value. */ int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); /* Silently return if the constraints object is not present. 
*/ if (IS_ERR_OR_NULL(dev->power.qos)) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier); /** * dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor. * @dev: Device whose ancestor to add the request for. * @req: Pointer to the preallocated handle. * @type: Type of the request. * @value: Constraint latency value. */ int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { struct device *ancestor = dev->parent; int ret = -ENODEV; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: while (ancestor && !ancestor->power.ignore_children) ancestor = ancestor->parent; break; case DEV_PM_QOS_LATENCY_TOLERANCE: while (ancestor && !ancestor->power.set_latency_tolerance) ancestor = ancestor->parent; break; default: ancestor = NULL; } if (ancestor) ret = dev_pm_qos_add_request(ancestor, req, type, value); if (ret < 0) req->dev = NULL; return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_ancestor_request); static void __dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos_request *req = NULL; switch(type) { case DEV_PM_QOS_RESUME_LATENCY: req = dev->power.qos->resume_latency_req; dev->power.qos->resume_latency_req = NULL; break; case DEV_PM_QOS_LATENCY_TOLERANCE: req = dev->power.qos->latency_tolerance_req; dev->power.qos->latency_tolerance_req = NULL; break; case DEV_PM_QOS_FLAGS: req = dev->power.qos->flags_req; dev->power.qos->flags_req = NULL; break; default: WARN_ON(1); 
return; } __dev_pm_qos_remove_request(req); kfree(req); } static void dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_drop_user_request(dev, type); mutex_unlock(&dev_pm_qos_mtx); } /** * dev_pm_qos_expose_latency_limit - Expose PM QoS latency limit to user space. * @dev: Device whose PM QoS latency limit is to be exposed to user space. * @value: Initial value of the latency limit. */ int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev) || value < 0) return -EINVAL; req = kzalloc_obj(*req); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_RESUME_LATENCY, value); if (ret < 0) { kfree(req); return ret; } mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->resume_latency_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->resume_latency_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_resume_latency(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_limit); static void __dev_pm_qos_hide_latency_limit(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->resume_latency_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); } /** * dev_pm_qos_hide_latency_limit - Hide PM QoS latency limit from user space. * @dev: Device whose PM QoS latency limit is to be hidden from user space. 
*/ void dev_pm_qos_hide_latency_limit(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_resume_latency(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_limit); /** * dev_pm_qos_expose_flags - Expose PM QoS flags of a device to user space. * @dev: Device whose PM QoS flags are to be exposed to user space. * @val: Initial values of the flags. */ int dev_pm_qos_expose_flags(struct device *dev, s32 val) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev)) return -EINVAL; req = kzalloc_obj(*req); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_FLAGS, val); if (ret < 0) { kfree(req); return ret; } pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->flags_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->flags_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_flags(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_flags); static void __dev_pm_qos_hide_flags(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->flags_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); } /** * dev_pm_qos_hide_flags - Hide PM QoS flags of a device from user space. * @dev: Device whose PM QoS flags are to be hidden from user space. 
*/ void dev_pm_qos_hide_flags(struct device *dev) { pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_flags(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_flags); /** * dev_pm_qos_update_flags - Update PM QoS flags request owned by user space. * @dev: Device to update the PM QoS flags request for. * @mask: Flags to set/clear. * @set: Whether to set or clear the flags (true means set). */ int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set) { s32 value; int ret; pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->flags_req) { ret = -EINVAL; goto out; } value = dev_pm_qos_requested_flags(dev); if (set) value |= mask; else value &= ~mask; ret = __dev_pm_qos_update_request(dev->power.qos->flags_req, value); out: mutex_unlock(&dev_pm_qos_mtx); pm_runtime_put(dev); return ret; } /** * dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance. * @dev: Device to obtain the user space latency tolerance for. */ s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { s32 ret; mutex_lock(&dev_pm_qos_mtx); ret = IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req ? PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT : dev->power.qos->latency_tolerance_req->data.pnode.prio; mutex_unlock(&dev_pm_qos_mtx); return ret; } /** * dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance. * @dev: Device to update the user space latency tolerance for. * @val: New user space latency tolerance for @dev (negative values disable). 
*/ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { int ret; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req) { struct dev_pm_qos_request *req; if (val < 0) { if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) ret = 0; else ret = -EINVAL; goto out; } req = kzalloc_obj(*req); if (!req) { ret = -ENOMEM; goto out; } ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val); if (ret < 0) { kfree(req); goto out; } dev->power.qos->latency_tolerance_req = req; } else { if (val < 0) { __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE); ret = 0; } else { ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val); } } out: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace * @dev: Device whose latency tolerance to expose */ int dev_pm_qos_expose_latency_tolerance(struct device *dev) { int ret; if (!dev->power.set_latency_tolerance) return -EINVAL; mutex_lock(&dev_pm_qos_sysfs_mtx); ret = pm_qos_sysfs_add_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance); /** * dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace * @dev: Device whose latency tolerance to hide */ void dev_pm_qos_hide_latency_tolerance(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); /* Remove the request from user space now */ pm_runtime_get_sync(dev); dev_pm_qos_update_user_latency_tolerance(dev, PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance); |
| 75 38 37 73 2 9 62 3 64 64 11 10 10 10 10 10 10 10 10 11 2 10 9 1 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/in6.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { struct sockaddr_in6 udp6_addr = {}; int err; struct socket *sock = NULL; err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; if (cfg->ipv6_v6only) { err = ip6_sock_set_v6only(sock->sk); if (err < 0) goto error; } if (cfg->bind_ifindex) { err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->local_udp_port; err = kernel_bind(sock, (struct sockaddr_unsized *)&udp6_addr, sizeof(udp6_addr)); if (err < 0) goto error; if (cfg->peer_udp_port) { memset(&udp6_addr, 0, sizeof(udp6_addr)); udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port 
= cfg->peer_udp_port; err = kernel_connect(sock, (struct sockaddr_unsized *)&udp6_addr, sizeof(udp6_addr), 0); } if (err < 0) goto error; udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); *sockp = sock; return 0; error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } *sockp = NULL; return err; } EXPORT_SYMBOL_GPL(udp_sock_create6); void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, __be16 src_port, __be16 dst_port, bool nocheck, u16 ip6cb_flags) { struct udphdr *uh; struct ipv6hdr *ip6h; __skb_push(skb, sizeof(*uh)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = dst_port; uh->source = src_port; uh->len = htons(skb->len); skb_dst_set(skb, dst); udp6_set_csum(nocheck, skb, saddr, daddr, skb->len); __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); ip6_flow_hdr(ip6h, prio, label); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = ttl; ip6h->daddr = *daddr; ip6h->saddr = *saddr; ip6tunnel_xmit(sk, skb, dev, ip6cb_flags); } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); /** * udp_tunnel6_dst_lookup - perform route lookup on UDP tunnel * @skb: Packet for which lookup is done * @dev: Tunnel device * @net: Network namespace of tunnel device * @sock: Socket which provides route info * @oif: Index of the output interface * @saddr: Memory to store the src ip address * @key: Tunnel information * @sport: UDP source port * @dport: UDP destination port * @dsfield: The traffic class field * @dst_cache: The dst cache to use for lookup * This function performs a route lookup on a UDP tunnel * * It returns a valid dst pointer and stores src address to be used in * tunnel in param saddr on success, else a pointer encoded error code. 
*/ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, struct net_device *dev, struct net *net, struct socket *sock, int oif, struct in6_addr *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 dsfield, struct dst_cache *dst_cache) { struct dst_entry *dst = NULL; struct flowi6 fl6; #ifdef CONFIG_DST_CACHE if (dst_cache) { dst = dst_cache_get_ip6(dst_cache, saddr); if (dst) return dst; } #endif memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = IPPROTO_UDP; fl6.flowi6_oif = oif; fl6.daddr = key->u.ipv6.dst; fl6.saddr = key->u.ipv6.src; fl6.fl6_sport = sport; fl6.fl6_dport = dport; fl6.flowlabel = ip6_make_flowinfo(dsfield, key->label); dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6, NULL); if (IS_ERR(dst)) { netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr); return ERR_PTR(-ENETUNREACH); } if (dst_dev(dst) == dev) { /* is this necessary? */ netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr); dst_release(dst); return ERR_PTR(-ELOOP); } #ifdef CONFIG_DST_CACHE if (dst_cache) dst_cache_set_ip6(dst_cache, dst, &fl6.saddr); #endif *saddr = fl6.saddr; return dst; } EXPORT_SYMBOL_GPL(udp_tunnel6_dst_lookup); MODULE_DESCRIPTION("IPv6 Foo over UDP tunnel driver"); MODULE_LICENSE("GPL"); |
| 1 1 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *
 * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
 * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org)
 */
/*
 * Timer handling for ROSE sockets.
 *
 * Three timers are managed per socket:
 *   - sk->sk_timer      the heartbeat, handled by rose_heartbeat_expiry()
 *   - rose->timer       shared by T1/T2/T3/HB, handled by rose_timer_expiry()
 *   - rose->idletimer   the idle timeout, handled by rose_idletimer_expiry()
 *
 * Every expiry handler ends with sock_put(sk).
 * NOTE(review): this presumably balances a reference taken by
 * sk_reset_timer() when the timer was armed - confirm against
 * net/core/sock.c.
 */
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <net/rose.h>

/* Expiry handlers; defined at the bottom of the file. */
static void rose_heartbeat_expiry(struct timer_list *t);
static void rose_timer_expiry(struct timer_list *);
static void rose_idletimer_expiry(struct timer_list *);

/*
 * (Re)arm the heartbeat timer sk->sk_timer.
 *
 * Any pending instance is stopped first, then the timer is pointed at
 * rose_heartbeat_expiry() and scheduled 5 * HZ jiffies from now.
 */
void rose_start_heartbeat(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);

	sk->sk_timer.function = rose_heartbeat_expiry;
	sk->sk_timer.expires = jiffies + 5 * HZ;

	sk_reset_timer(sk, &sk->sk_timer, sk->sk_timer.expires);
}

/*
 * (Re)arm rose->timer to fire rose->t1 jiffies from now.
 *
 * The T1, T2, T3 and HB starters all use the same rose->timer, so starting
 * one of them cancels whichever was pending before.
 */
void rose_start_t1timer(struct sock *sk)
{
	struct rose_sock *rose = rose_sk(sk);

	sk_stop_timer(sk, &rose->timer);

	rose->timer.function = rose_timer_expiry;
	rose->timer.expires = jiffies + rose->t1;

	sk_reset_timer(sk, &rose->timer, rose->timer.expires);
}

/* Same as rose_start_t1timer(), but with the rose->t2 interval. */
void rose_start_t2timer(struct sock *sk)
{
	struct rose_sock *rose = rose_sk(sk);

	sk_stop_timer(sk, &rose->timer);

	rose->timer.function = rose_timer_expiry;
	rose->timer.expires = jiffies + rose->t2;

	sk_reset_timer(sk, &rose->timer, rose->timer.expires);
}

/* Same as rose_start_t1timer(), but with the rose->t3 interval. */
void rose_start_t3timer(struct sock *sk)
{
	struct rose_sock *rose = rose_sk(sk);

	sk_stop_timer(sk, &rose->timer);

	rose->timer.function = rose_timer_expiry;
	rose->timer.expires = jiffies + rose->t3;

	sk_reset_timer(sk, &rose->timer, rose->timer.expires);
}

/* Same as rose_start_t1timer(), but with the rose->hb interval. */
void rose_start_hbtimer(struct sock *sk)
{
	struct rose_sock *rose = rose_sk(sk);

	sk_stop_timer(sk, &rose->timer);

	rose->timer.function = rose_timer_expiry;
	rose->timer.expires = jiffies + rose->hb;

	sk_reset_timer(sk, &rose->timer, rose->timer.expires);
}

/*
 * Stop the idle timer and, only if rose->idle is positive, re-arm it to fire
 * rose->idle jiffies from now. With rose->idle <= 0 the timer stays stopped.
 */
void rose_start_idletimer(struct sock *sk)
{
	struct rose_sock *rose = rose_sk(sk);

	sk_stop_timer(sk, &rose->idletimer);

	if (rose->idle > 0) {
		rose->idletimer.function = rose_idletimer_expiry;
		rose->idletimer.expires = jiffies + rose->idle;

		sk_reset_timer(sk, &rose->idletimer, rose->idletimer.expires);
	}
}

/* Stop the heartbeat timer (sk->sk_timer). */
void rose_stop_heartbeat(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}

/* Stop the shared T1/T2/T3/HB timer (rose->timer). */
void rose_stop_timer(struct sock *sk)
{
	sk_stop_timer(sk, &rose_sk(sk)->timer);
}

/* Stop the idle timer (rose->idletimer). */
void rose_stop_idletimer(struct sock *sk)
{
	sk_stop_timer(sk, &rose_sk(sk)->idletimer);
}

/*
 * Heartbeat timer handler.
 *
 * Runs with the socket bh-locked. If the socket is currently owned by a
 * user context the work is postponed by re-arming the timer HZ/20 jiffies
 * ahead. Otherwise it acts on rose->state:
 *
 *   ROSE_STATE_0: destroy the socket when SOCK_DESTROY is set, or when it is
 *                 a TCP_LISTEN socket flagged SOCK_DEAD. This path unlocks,
 *                 destroys, drops the reference and returns without
 *                 restarting the heartbeat.
 *   ROSE_STATE_3: once receive memory in use falls below half of sk_rcvbuf
 *                 while ROSE_COND_OWN_RX_BUSY is set, clear the busy and
 *                 ack-pending conditions, set vl = vr, send ROSE_RR and stop
 *                 rose->timer.
 *
 * In every non-destroying case the heartbeat is restarted.
 */
static void rose_heartbeat_expiry(struct timer_list *t)
{
	struct sock *sk = timer_container_of(sk, t, sk_timer);
	struct rose_sock *rose = rose_sk(sk);

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Socket busy: retry shortly instead of touching its state. */
		sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ/20);
		goto out;
	}
	switch (rose->state) {
	case ROSE_STATE_0:
		/* Magic here: If we listen() and a new link dies before it
		   is accepted() it isn't 'dead' so doesn't get removed. */
		if (sock_flag(sk, SOCK_DESTROY) ||
		    (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) {
			/* Unlock before destroying; the heartbeat is not re-armed. */
			bh_unlock_sock(sk);
			rose_destroy_socket(sk);
			sock_put(sk);
			return;
		}
		break;

	case ROSE_STATE_3:
		/*
		 * Check for the state of the receive buffer.
		 */
		if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf / 2) &&
		    (rose->condition & ROSE_COND_OWN_RX_BUSY)) {
			rose->condition &= ~ROSE_COND_OWN_RX_BUSY;
			rose->condition &= ~ROSE_COND_ACK_PENDING;
			rose->vl = rose->vr;
			rose_write_internal(sk, ROSE_RR);
			rose_stop_timer(sk);	/* HB */
			break;
		}
		break;
	}

	rose_start_heartbeat(sk);
out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/*
 * Handler for the shared rose->timer; which logical timer expired is
 * inferred from rose->state.
 *
 *   ROSE_STATE_1 / ROSE_STATE_4: send ROSE_CLEAR_REQUEST, move to
 *                 ROSE_STATE_2 and start T3.
 *   ROSE_STATE_2: release rose->neighbour via rose_neigh_put() and
 *                 disconnect the socket with ETIMEDOUT.
 *   ROSE_STATE_3: if an acknowledgement is pending, clear the flag and send
 *                 an enquiry response.
 *
 * As in the heartbeat handler, a socket owned by user context causes the
 * timer to be retried HZ/20 jiffies later.
 */
static void rose_timer_expiry(struct timer_list *t)
{
	struct rose_sock *rose = timer_container_of(rose, t, timer);
	struct sock *sk = &rose->sock;

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		sk_reset_timer(sk, &rose->timer, jiffies + HZ/20);
		goto out;
	}
	switch (rose->state) {
	case ROSE_STATE_1:	/* T1 */
	case ROSE_STATE_4:	/* T2 */
		rose_write_internal(sk, ROSE_CLEAR_REQUEST);
		rose->state = ROSE_STATE_2;
		rose_start_t3timer(sk);
		break;

	case ROSE_STATE_2:	/* T3 */
		rose_neigh_put(rose->neighbour);
		rose_disconnect(sk, ETIMEDOUT, -1, -1);
		break;

	case ROSE_STATE_3:	/* HB */
		if (rose->condition & ROSE_COND_ACK_PENDING) {
			rose->condition &= ~ROSE_COND_ACK_PENDING;
			rose_enquiry_response(sk);
		}
		break;
	}
out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/*
 * Idle timer handler.
 *
 * Unless the socket is owned by user context (in which case the timer is
 * retried HZ/20 jiffies later), this clears the socket queues, sends
 * ROSE_CLEAR_REQUEST, enters ROSE_STATE_2 and starts T3. The socket is then
 * marked TCP_CLOSE with sk_err = 0 and SEND_SHUTDOWN. If it is not already
 * SOCK_DEAD, sk_state_change() is called before the SOCK_DEAD flag is set.
 */
static void rose_idletimer_expiry(struct timer_list *t)
{
	struct rose_sock *rose = timer_container_of(rose, t, idletimer);
	struct sock *sk = &rose->sock;

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		sk_reset_timer(sk, &rose->idletimer, jiffies + HZ/20);
		goto out;
	}
	rose_clear_queues(sk);

	rose_write_internal(sk, ROSE_CLEAR_REQUEST);
	rose_sk(sk)->state = ROSE_STATE_2;

	rose_start_t3timer(sk);

	sk->sk_state = TCP_CLOSE;
	sk->sk_err = 0;
	sk->sk_shutdown |= SEND_SHUTDOWN;

	if (!sock_flag(sk, SOCK_DEAD)) {
		/* Notify before flagging dead. */
		sk->sk_state_change(sk);
		sock_set_flag(sk, SOCK_DEAD);
	}
out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
|
| 6 6 6 6 1 5 6 31 31 9 9 8 8 8 26 10 10 16 15 2 16 16 16 16 16 7 7 7 9 9 9 8 8 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 |
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */
/*
 * WireGuard network device: net_device_ops (open/stop/xmit), rtnl link
 * creation and destruction, and the power-management, VM-fork and network
 * namespace notifiers that act on every registered device.
 */

#include "queueing.h"
#include "socket.h"
#include "timers.h"
#include "device.h"
#include "ratelimiter.h"
#include "peer.h"
#include "messages.h"

#include <linux/module.h>
#include <linux/rtnetlink.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/if_arp.h>
#include <linux/icmp.h>
#include <linux/suspend.h>
#include <net/dst_metadata.h>
#include <net/gso.h>
#include <net/icmp.h>
#include <net/rtnetlink.h>
#include <net/ip_tunnels.h>
#include <net/addrconf.h>

/*
 * All registered wg devices, linked through wg_device.device_list.
 * Every traversal and modification in this file happens under rtnl_lock()
 * (wg_newlink() relies on its caller for that - NOTE(review): confirm).
 */
static LIST_HEAD(device_list);

/*
 * ndo_open handler.
 *
 * Turns off SEND_REDIRECTS on the IPv4 in_device (and on the "all" devconf of
 * the device's namespace), sets the IPv6 addr_gen_mode to
 * IN6_ADDR_GEN_MODE_NONE, then - under device_update_lock - initialises the
 * socket on wg->incoming_port. On success every peer has its staged packets
 * sent, plus a keepalive when a persistent keepalive interval is configured.
 *
 * Return: the value returned by wg_socket_init(); a negative value means no
 * peer was touched.
 */
static int wg_open(struct net_device *dev)
{
	struct in_device *dev_v4 = __in_dev_get_rtnl(dev);
	struct inet6_dev *dev_v6 = __in6_dev_get(dev);
	struct wg_device *wg = netdev_priv(dev);
	struct wg_peer *peer;
	int ret;

	if (dev_v4) {
		/* At some point we might put this check near the ip_rt_send_
		 * redirect call of ip_forward in net/ipv4/ip_forward.c, similar
		 * to the current secpath check.
		 */
		IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false);
		IPV4_DEVCONF_ALL(dev_net(dev), SEND_REDIRECTS) = false;
	}
	if (dev_v6)
		dev_v6->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_NONE;

	mutex_lock(&wg->device_update_lock);
	ret = wg_socket_init(wg, wg->incoming_port);
	if (ret < 0)
		goto out;
	list_for_each_entry(peer, &wg->peer_list, peer_list) {
		wg_packet_send_staged_packets(peer);
		if (peer->persistent_keepalive_interval)
			wg_packet_send_keepalive(peer);
	}
out:
	mutex_unlock(&wg->device_update_lock);
	return ret;
}

/*
 * PM notifier: on PM_HIBERNATION_PREPARE or PM_SUSPEND_PREPARE, cancel each
 * peer's timer_zero_key_material timer and clear its handshake and keypairs,
 * for every device on device_list. Does nothing when CONFIG_PM_AUTOSLEEP or
 * CONFIG_PM_USERSPACE_AUTOSLEEP is enabled. Always returns 0.
 */
static int wg_pm_notification(struct notifier_block *nb, unsigned long action, void *data)
{
	struct wg_device *wg;
	struct wg_peer *peer;

	/* If the machine is constantly suspending and resuming, as part of
	 * its normal operation rather than as a somewhat rare event, then we
	 * don't actually want to clear keys.
	 */
	if (IS_ENABLED(CONFIG_PM_AUTOSLEEP) ||
	    IS_ENABLED(CONFIG_PM_USERSPACE_AUTOSLEEP))
		return 0;

	if (action != PM_HIBERNATION_PREPARE && action != PM_SUSPEND_PREPARE)
		return 0;

	/* Lock order: rtnl first, then the per-device update lock. */
	rtnl_lock();
	list_for_each_entry(wg, &device_list, device_list) {
		mutex_lock(&wg->device_update_lock);
		list_for_each_entry(peer, &wg->peer_list, peer_list) {
			timer_delete(&peer->timer_zero_key_material);
			wg_noise_handshake_clear(&peer->handshake);
			wg_noise_keypairs_clear(&peer->keypairs);
		}
		mutex_unlock(&wg->device_update_lock);
	}
	rtnl_unlock();
	rcu_barrier();
	return 0;
}

static struct notifier_block pm_notifier = { .notifier_call = wg_pm_notification };

/*
 * Notifier registered with register_random_vmfork_notifier(): expires the
 * current keypairs of every peer of every device. The action and data
 * arguments are not inspected. Always returns 0.
 */
static int wg_vm_notification(struct notifier_block *nb, unsigned long action, void *data)
{
	struct wg_device *wg;
	struct wg_peer *peer;

	rtnl_lock();
	list_for_each_entry(wg, &device_list, device_list) {
		mutex_lock(&wg->device_update_lock);
		list_for_each_entry(peer, &wg->peer_list, peer_list)
			wg_noise_expire_current_peer_keypairs(peer);
		mutex_unlock(&wg->device_update_lock);
	}
	rtnl_unlock();
	return 0;
}

static struct notifier_block vm_notifier = { .notifier_call = wg_vm_notification };

/*
 * ndo_stop handler.
 *
 * Under device_update_lock, every peer has its staged packets purged, its
 * timers stopped, and its handshake, keypairs and last-sent-handshake state
 * cleared. Afterwards the handshake queue ring is drained (freeing each
 * skb), handshake_queue_len is zeroed and the sockets are replaced with NULL
 * via wg_socket_reinit(). Always returns 0.
 */
static int wg_stop(struct net_device *dev)
{
	struct wg_device *wg = netdev_priv(dev);
	struct wg_peer *peer;
	struct sk_buff *skb;

	mutex_lock(&wg->device_update_lock);
	list_for_each_entry(peer, &wg->peer_list, peer_list) {
		wg_packet_purge_staged_packets(peer);
		wg_timers_stop(peer);
		wg_noise_handshake_clear(&peer->handshake);
		wg_noise_keypairs_clear(&peer->keypairs);
		wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake);
	}
	mutex_unlock(&wg->device_update_lock);
	while ((skb = ptr_ring_consume(&wg->handshake_queue.ring)) != NULL)
		kfree_skb(skb);
	atomic_set(&wg->handshake_queue_len, 0);
	wg_socket_reinit(wg, NULL, NULL);
	return 0;
}

/*
 * ndo_start_xmit handler.
 *
 * Steps:
 *  1. Reject packets that fail wg_check_packet_protocol() (-EPROTONOSUPPORT).
 *  2. Look up the destination peer in the allowed-IPs table (-ENOKEY if
 *     none; an ICMP/ICMPv6 unreachable is sent back).
 *  3. Require the peer endpoint family to be AF_INET or AF_INET6
 *     (-EDESTADDRREQ otherwise).
 *  4. Segment GSO packets, then move every resulting skb onto a local list,
 *     recording the MTU in its control block and dropping its dst.
 *  5. Under the staged queue lock, drop the oldest staged packets while the
 *     queue holds more than MAX_STAGED_PACKETS, append the new ones, and
 *     kick wg_packet_send_staged_packets().
 *
 * The error labels fall through each other: err_peer drops the peer
 * reference, err_icmp sends the unreachable message, err counts a tx_error,
 * frees the skb and returns the negative errno stored in ret.
 *
 * Return: NETDEV_TX_OK on success, otherwise the negative errno above.
 */
static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct wg_device *wg = netdev_priv(dev);
	struct sk_buff_head packets;
	struct wg_peer *peer;
	struct sk_buff *next;
	sa_family_t family;
	u32 mtu;
	int ret;

	if (unlikely(!wg_check_packet_protocol(skb))) {
		ret = -EPROTONOSUPPORT;
		net_dbg_ratelimited("%s: Invalid IP packet\n", dev->name);
		goto err;
	}

	peer = wg_allowedips_lookup_dst(&wg->peer_allowedips, skb);
	if (unlikely(!peer)) {
		ret = -ENOKEY;
		if (skb->protocol == htons(ETH_P_IP))
			net_dbg_ratelimited("%s: No peer has allowed IPs matching %pI4\n",
					    dev->name, &ip_hdr(skb)->daddr);
		else if (skb->protocol == htons(ETH_P_IPV6))
			net_dbg_ratelimited("%s: No peer has allowed IPs matching %pI6\n",
					    dev->name, &ipv6_hdr(skb)->daddr);
		goto err_icmp;
	}

	family = READ_ONCE(peer->endpoint.addr.sa_family);
	if (unlikely(family != AF_INET && family != AF_INET6)) {
		ret = -EDESTADDRREQ;
		net_dbg_ratelimited("%s: No valid endpoint has been configured or discovered for peer %llu\n",
				    dev->name, peer->internal_id);
		goto err_peer;
	}

	/* Prefer the route's MTU when the skb carries a valid dst. */
	mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	__skb_queue_head_init(&packets);
	if (!skb_is_gso(skb)) {
		skb_mark_not_on_list(skb);
	} else {
		struct sk_buff *segs = skb_gso_segment(skb, 0);

		if (IS_ERR(segs)) {
			/* skb is still the original here, so err: frees it. */
			ret = PTR_ERR(segs);
			goto err_peer;
		}
		dev_kfree_skb(skb);
		skb = segs;
	}

	skb_list_walk_safe(skb, skb, next) {
		skb_mark_not_on_list(skb);

		/* A segment whose share check fails is skipped. */
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (unlikely(!skb))
			continue;

		/* We only need to keep the original dst around for icmp,
		 * so at this point we're in a position to drop it.
		 */
		skb_dst_drop(skb);

		PACKET_CB(skb)->mtu = mtu;

		__skb_queue_tail(&packets, skb);
	}

	spin_lock_bh(&peer->staged_packet_queue.lock);
	/* If the queue is getting too big, we start removing the oldest packets
	 * until it's small again. We do this before adding the new packet, so
	 * we don't remove GSO segments that are in excess.
	 */
	while (skb_queue_len(&peer->staged_packet_queue) > MAX_STAGED_PACKETS) {
		dev_kfree_skb(__skb_dequeue(&peer->staged_packet_queue));
		DEV_STATS_INC(dev, tx_dropped);
	}
	skb_queue_splice_tail(&packets, &peer->staged_packet_queue);
	spin_unlock_bh(&peer->staged_packet_queue.lock);

	wg_packet_send_staged_packets(peer);

	wg_peer_put(peer);
	return NETDEV_TX_OK;

err_peer:
	wg_peer_put(peer);
err_icmp:
	if (skb->protocol == htons(ETH_P_IP))
		icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
	else if (skb->protocol == htons(ETH_P_IPV6))
		icmpv6_ndo_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
err:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
	return ret;
}

static const struct net_device_ops netdev_ops = {
	.ndo_open		= wg_open,
	.ndo_stop		= wg_stop,
	.ndo_start_xmit		= wg_xmit,
};

/*
 * priv_destructor, installed by wg_newlink() after a successful
 * register_netdevice().
 *
 * Unlinks the device from device_list, then - under device_update_lock -
 * clears creating_net and the port, drops the sockets, removes all peers,
 * destroys the three workqueues and frees the three packet queues. After an
 * rcu_barrier() it uninitialises the ratelimiter, wipes the static identity
 * with memzero_explicit(), frees both hashtables and finally frees the
 * net_device itself.
 */
static void wg_destruct(struct net_device *dev)
{
	struct wg_device *wg = netdev_priv(dev);

	rtnl_lock();
	list_del(&wg->device_list);
	rtnl_unlock();
	mutex_lock(&wg->device_update_lock);
	rcu_assign_pointer(wg->creating_net, NULL);
	wg->incoming_port = 0;
	wg_socket_reinit(wg, NULL, NULL);
	/* The final references are cleared in the below calls to destroy_workqueue. */
	wg_peer_remove_all(wg);
	destroy_workqueue(wg->handshake_receive_wq);
	destroy_workqueue(wg->handshake_send_wq);
	destroy_workqueue(wg->packet_crypt_wq);
	wg_packet_queue_free(&wg->handshake_queue, true);
	wg_packet_queue_free(&wg->decrypt_queue, false);
	wg_packet_queue_free(&wg->encrypt_queue, false);
	rcu_barrier(); /* Wait for all the peers to be actually freed. */
	wg_ratelimiter_uninit();
	memzero_explicit(&wg->static_identity, sizeof(wg->static_identity));
	kvfree(wg->index_hashtable);
	kvfree(wg->peer_hashtable);
	mutex_unlock(&wg->device_update_lock);

	pr_debug("%s: Interface destroyed\n", dev->name);
	free_netdev(dev);
}

static const struct device_type device_type = { .name = KBUILD_MODNAME };

/*
 * rtnl_link_ops.setup callback: fill in the static net_device attributes.
 *
 * "overhead" is MESSAGE_MINIMUM_LENGTH plus a UDP header plus the larger of
 * the IPv4 and IPv6 headers; it is subtracted from ETH_DATA_LEN for the
 * default MTU and from the rounded-down INT_MAX for max_mtu.
 */
static void wg_setup(struct net_device *dev)
{
	struct wg_device *wg = netdev_priv(dev);
	enum { WG_NETDEV_FEATURES = NETIF_F_HW_CSUM | NETIF_F_RXCSUM |
				    NETIF_F_SG | NETIF_F_GSO |
				    NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA };
	const int overhead = MESSAGE_MINIMUM_LENGTH + sizeof(struct udphdr) +
			     max(sizeof(struct ipv6hdr), sizeof(struct iphdr));

	dev->netdev_ops = &netdev_ops;
	dev->header_ops = &ip_tunnel_header_ops;
	dev->hard_header_len = 0;
	dev->addr_len = 0;
	dev->needed_headroom = DATA_PACKET_HEAD_ROOM;
	dev->needed_tailroom = noise_encrypted_len(MESSAGE_PADDING_MULTIPLE);
	dev->type = ARPHRD_NONE;
	dev->flags = IFF_POINTOPOINT | IFF_NOARP;
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->lltx = true;
	dev->features |= WG_NETDEV_FEATURES;
	dev->hw_features |= WG_NETDEV_FEATURES;
	dev->hw_enc_features |= WG_NETDEV_FEATURES;
	dev->mtu = ETH_DATA_LEN - overhead;
	dev->max_mtu = round_down(INT_MAX, MESSAGE_PADDING_MULTIPLE) - overhead;
	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;

	SET_NETDEV_DEVTYPE(dev, &device_type);

	/* We need to keep the dst around in case of icmp replies. */
	netif_keep_dst(dev);

	netif_set_tso_max_size(dev, GSO_MAX_SIZE);

	wg->dev = dev;
}

/*
 * rtnl_link_ops.newlink callback: allocate per-device state and register the
 * net_device.
 *
 * Resources are acquired in this order: peer hashtable, index hashtable,
 * handshake-receive / handshake-send / packet-crypt workqueues, encrypt /
 * decrypt / handshake packet queues, ratelimiter, and finally the netdevice
 * registration. The error labels release them in exactly the reverse order,
 * each label falling through to the next.
 *
 * Return: the register_netdevice() result on success; -ENOMEM when an
 * allocation fails; otherwise the negative value returned by the failing
 * initialiser.
 */
static int wg_newlink(struct net_device *dev,
		      struct rtnl_newlink_params *params,
		      struct netlink_ext_ack *extack)
{
	struct net *link_net = rtnl_newlink_link_net(params);
	struct wg_device *wg = netdev_priv(dev);
	int ret = -ENOMEM;

	rcu_assign_pointer(wg->creating_net, link_net);
	init_rwsem(&wg->static_identity.lock);
	mutex_init(&wg->socket_update_lock);
	mutex_init(&wg->device_update_lock);
	wg_allowedips_init(&wg->peer_allowedips);
	wg_cookie_checker_init(&wg->cookie_checker, wg);
	INIT_LIST_HEAD(&wg->peer_list);
	wg->device_update_gen = 1;

	wg->peer_hashtable = wg_pubkey_hashtable_alloc();
	if (!wg->peer_hashtable)
		return ret;

	wg->index_hashtable = wg_index_hashtable_alloc();
	if (!wg->index_hashtable)
		goto err_free_peer_hashtable;

	wg->handshake_receive_wq = alloc_workqueue("wg-kex-%s",
			WQ_CPU_INTENSIVE | WQ_FREEZABLE | WQ_PERCPU, 0,
			dev->name);
	if (!wg->handshake_receive_wq)
		goto err_free_index_hashtable;

	wg->handshake_send_wq = alloc_workqueue("wg-kex-%s",
			WQ_UNBOUND | WQ_FREEZABLE, 0, dev->name);
	if (!wg->handshake_send_wq)
		goto err_destroy_handshake_receive;

	wg->packet_crypt_wq = alloc_workqueue("wg-crypt-%s",
			WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_PERCPU, 0,
			dev->name);
	if (!wg->packet_crypt_wq)
		goto err_destroy_handshake_send;

	/* From here on ret carries the failing call's own error code. */
	ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker,
				   MAX_QUEUED_PACKETS);
	if (ret < 0)
		goto err_destroy_packet_crypt;

	ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker,
				   MAX_QUEUED_PACKETS);
	if (ret < 0)
		goto err_free_encrypt_queue;

	ret = wg_packet_queue_init(&wg->handshake_queue, wg_packet_handshake_receive_worker,
				   MAX_QUEUED_INCOMING_HANDSHAKES);
	if (ret < 0)
		goto err_free_decrypt_queue;

	ret = wg_ratelimiter_init();
	if (ret < 0)
		goto err_free_handshake_queue;

	netif_threaded_enable(dev);
	ret = register_netdevice(dev);
	if (ret < 0)
		goto err_uninit_ratelimiter;

	list_add(&wg->device_list, &device_list);

	/* We wait until the end to assign priv_destructor, so that
	 * register_netdevice doesn't call it for us if it fails.
	 */
	dev->priv_destructor = wg_destruct;

	pr_debug("%s: Interface created\n", dev->name);
	return ret;

err_uninit_ratelimiter:
	wg_ratelimiter_uninit();
err_free_handshake_queue:
	wg_packet_queue_free(&wg->handshake_queue, false);
err_free_decrypt_queue:
	wg_packet_queue_free(&wg->decrypt_queue, false);
err_free_encrypt_queue:
	wg_packet_queue_free(&wg->encrypt_queue, false);
err_destroy_packet_crypt:
	destroy_workqueue(wg->packet_crypt_wq);
err_destroy_handshake_send:
	destroy_workqueue(wg->handshake_send_wq);
err_destroy_handshake_receive:
	destroy_workqueue(wg->handshake_receive_wq);
err_free_index_hashtable:
	kvfree(wg->index_hashtable);
err_free_peer_hashtable:
	kvfree(wg->peer_hashtable);
	return ret;
}

static struct rtnl_link_ops link_ops __read_mostly = {
	.kind			= KBUILD_MODNAME,
	.priv_size		= sizeof(struct wg_device),
	.setup			= wg_setup,
	.newlink		= wg_newlink,
};

/*
 * pernet pre_exit hook: for every device whose creating_net is the exiting
 * namespace, turn the carrier off, clear creating_net, drop the sockets and
 * clear each peer's endpoint source. Devices created in other namespaces are
 * left alone.
 */
static void wg_netns_pre_exit(struct net *net)
{
	struct wg_device *wg;
	struct wg_peer *peer;

	rtnl_lock();
	list_for_each_entry(wg, &device_list, device_list) {
		if (rcu_access_pointer(wg->creating_net) == net) {
			pr_debug("%s: Creating namespace exiting\n", wg->dev->name);
			netif_carrier_off(wg->dev);
			mutex_lock(&wg->device_update_lock);
			rcu_assign_pointer(wg->creating_net, NULL);
			wg_socket_reinit(wg, NULL, NULL);
			list_for_each_entry(peer, &wg->peer_list, peer_list)
				wg_socket_clear_peer_endpoint_src(peer);
			mutex_unlock(&wg->device_update_lock);
		}
	}
	rtnl_unlock();
}

static struct pernet_operations pernet_ops = {
	.pre_exit = wg_netns_pre_exit
};

/*
 * Register, in order, the PM notifier, the VM-fork notifier, the pernet
 * operations and the rtnl link ops. A failure unregisters everything that
 * was registered before it, in reverse order.
 *
 * Return: 0 on success, otherwise the error from the failing registration.
 */
int __init wg_device_init(void)
{
	int ret;

	ret = register_pm_notifier(&pm_notifier);
	if (ret)
		return ret;

	ret = register_random_vmfork_notifier(&vm_notifier);
	if (ret)
		goto error_pm;

	ret = register_pernet_device(&pernet_ops);
	if (ret)
		goto error_vm;

	ret = rtnl_link_register(&link_ops);
	if (ret)
		goto error_pernet;

	return 0;

error_pernet:
	unregister_pernet_device(&pernet_ops);
error_vm:
	unregister_random_vmfork_notifier(&vm_notifier);
error_pm:
	unregister_pm_notifier(&pm_notifier);
	return ret;
}

/*
 * Undo wg_device_init() in reverse registration order, then wait with
 * rcu_barrier().
 */
void wg_device_uninit(void)
{
	rtnl_link_unregister(&link_ops);
	unregister_pernet_device(&pernet_ops);
	unregister_random_vmfork_notifier(&vm_notifier);
	unregister_pm_notifier(&pm_notifier);
	rcu_barrier();
}
|
| 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_RT_H #define _LINUX_SCHED_RT_H #include <linux/sched.h> struct task_struct; static inline bool rt_prio(int prio) { return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO); } static inline bool rt_or_dl_prio(int prio) { return unlikely(prio < MAX_RT_PRIO); } /* * Returns true if a task has a priority that belongs to RT class. PI-boosted * tasks will return true. Use rt_policy() to ignore PI-boosted tasks. */ static inline bool rt_task(struct task_struct *p) { return rt_prio(p->prio); } /* * Returns true if a task has a priority that belongs to RT or DL classes. * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore * PI-boosted tasks. */ static inline bool rt_or_dl_task(struct task_struct *p) { return rt_or_dl_prio(p->prio); } /* * Returns true if a task has a policy that belongs to RT or DL classes. * PI-boosted tasks will return false. */ static inline bool rt_or_dl_task_policy(struct task_struct *tsk) { int policy = tsk->policy; if (policy == SCHED_FIFO || policy == SCHED_RR) return true; if (policy == SCHED_DEADLINE) return true; return false; } #ifdef CONFIG_RT_MUTEXES extern void rt_mutex_pre_schedule(void); extern void rt_mutex_schedule(void); extern void rt_mutex_post_schedule(void); /* * Must hold either p->pi_lock or task_rq(p)->lock. 
*/ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) { return p->pi_top_task; } extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); extern void rt_mutex_adjust_pi(struct task_struct *p); #else static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; } # define rt_mutex_adjust_pi(p) do { } while (0) #endif extern void normalize_rt_tasks(void); /* * default timeslice is 100 msecs (used only for SCHED_RR tasks). * Timeslices get refilled after they expire. */ #define RR_TIMESLICE (100 * HZ / 1000) #endif /* _LINUX_SCHED_RT_H */ |
| 2 10 251 6 69 8 76 20 65 6 1 68 204 205 205 67 6 74 57 165 27 58 12 56 2 258 82 78 7 3 2 70 62 8 70 1 264 1 13 5 6 7 3 2 3 1 3 3 8 903 9 13 48 930 6917 907 4278 6907 6943 7 7 6 6 7 213 2 3 1 1 3 205 15 39 70 9 61 16 54 70 39 31 70 258 8 256 255 10 245 6 249 182 3 1 6 69 39 49 1 65 9 56 65 9 56 6 59 193 5 189 903 899 4 901 1 1279 887 148 146 4 3 140 143 11 136 131 17 17 2 11 134 5 3 2 7 1 1 7 142 1 2 1 2 2 3 3 3 1 2 1 6 3 3 2 1 2 3 2 1 5 6 1688 12 180 1 78 13 1445 9 8 4 143 142 2 6 11 12 4431 4432 1938 1749 913 25 819 963 906 7 4 6 6934 1 7 6939 6944 6953 9 8 1 9 9 5 4 11 5 5 5 1 3 4 11 3 3 3 23 1 7 14 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 
374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 
874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 
1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 
1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 
2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 1993 by Theodore Ts'o. 
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkpg.h>
#include <linux/init.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/suspend.h>
#include <linux/freezer.h>
#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/splice.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/ioprio.h>
#include <linux/blk-cgroup.h>
#include <linux/sched/mm.h>
#include <linux/statfs.h>
#include <linux/uaccess.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <uapi/linux/loop.h>

/* Possible states of device */
enum {
	Lo_unbound,
	Lo_bound,
	Lo_rundown,
	Lo_deleting,
};

/*
 * Per-device state of one loop device.
 *
 * Only fields whose use is visible in the helpers that directly follow in
 * this file are described; the remaining fields are left undocumented
 * rather than guessed at.
 */
struct loop_device {
	int		lo_number;
	loff_t		lo_offset;	/* subtracted from the backing size by lo_calculate_size(); must be aligned to lo_min_dio_size for direct I/O */
	loff_t		lo_sizelimit;	/* when > 0, upper bound applied by lo_calculate_size() */
	int		lo_flags;
	char		lo_file_name[LO_NAME_SIZE];

	struct file	*lo_backing_file;	/* its f_mode is tested for FMODE_CAN_ODIRECT by lo_can_use_dio() */
	unsigned int	lo_min_dio_size;	/* compared against the queue's logical block size by lo_can_use_dio() */
	struct block_device *lo_device;

	gfp_t		old_gfp_mask;

	spinlock_t		lo_lock;
	int			lo_state;	/* NOTE(review): presumably one of the Lo_* values above - confirm */
	spinlock_t              lo_work_lock;
	struct workqueue_struct *workqueue;
	struct work_struct      rootcg_work;
	struct list_head        rootcg_cmd_list;
	struct list_head        idle_worker_list;
	struct rb_root          worker_tree;
	struct timer_list       timer;
	bool			sysfs_inited;

	struct request_queue	*lo_queue;	/* queried with queue_logical_block_size() by lo_can_use_dio() */
	struct blk_mq_tag_set	tag_set;
	struct gendisk		*lo_disk;
	struct mutex		lo_mutex;	/* taken by loop_global_lock_killable(), released by loop_global_unlock() */
	bool			idr_visible;
};

/* Per-request command state. */
struct loop_cmd {
	struct list_head list_entry;
	bool use_aio; /* use AIO interface to handle I/O */
	atomic_t ref; /* only for aio */
	long ret;
	struct kiocb iocb;
	struct bio_vec *bvec;
	struct cgroup_subsys_state *blkcg_css;
	struct cgroup_subsys_state *memcg_css;
};

/* 60 * HZ jiffies. NOTE(review): presumably the idle-worker reap delay - confirm at the use site. */
#define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
#define LOOP_DEFAULT_HW_Q_DEPTH 128

static DEFINE_IDR(loop_index_idr);
static DEFINE_MUTEX(loop_ctl_mutex); static DEFINE_MUTEX(loop_validate_mutex); /** * loop_global_lock_killable() - take locks for safe loop_validate_file() test * * @lo: struct loop_device * @global: true if @lo is about to bind another "struct loop_device", false otherwise * * Returns 0 on success, -EINTR otherwise. * * Since loop_validate_file() traverses on other "struct loop_device" if * is_loop_device() is true, we need a global lock for serializing concurrent * loop_configure()/loop_change_fd()/__loop_clr_fd() calls. */ static int loop_global_lock_killable(struct loop_device *lo, bool global) { int err; if (global) { err = mutex_lock_killable(&loop_validate_mutex); if (err) return err; } err = mutex_lock_killable(&lo->lo_mutex); if (err && global) mutex_unlock(&loop_validate_mutex); return err; } /** * loop_global_unlock() - release locks taken by loop_global_lock_killable() * * @lo: struct loop_device * @global: true if @lo was about to bind another "struct loop_device", false otherwise */ static void loop_global_unlock(struct loop_device *lo, bool global) { mutex_unlock(&lo->lo_mutex); if (global) mutex_unlock(&loop_validate_mutex); } static int max_part; static int part_shift; static loff_t lo_calculate_size(struct loop_device *lo, struct file *file) { loff_t loopsize; int ret; if (S_ISBLK(file_inode(file)->i_mode)) { loopsize = i_size_read(file->f_mapping->host); } else { struct kstat stat; /* * Get the accurate file size. This provides better results than * cached inode data, particularly for network filesystems where * metadata may be stale. 
*/ ret = vfs_getattr_nosec(&file->f_path, &stat, STATX_SIZE, 0); if (ret) return 0; loopsize = stat.size; } if (lo->lo_offset > 0) loopsize -= lo->lo_offset; /* offset is beyond i_size, weird but possible */ if (loopsize < 0) return 0; if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize) loopsize = lo->lo_sizelimit; /* * Unfortunately, if we want to do I/O on the device, * the number of 512-byte sectors has to fit into a sector_t. */ return loopsize >> 9; } /* * We support direct I/O only if lo_offset is aligned with the logical I/O size * of backing device, and the logical block size of loop is bigger than that of * the backing device. */ static bool lo_can_use_dio(struct loop_device *lo) { if (!(lo->lo_backing_file->f_mode & FMODE_CAN_ODIRECT)) return false; if (queue_logical_block_size(lo->lo_queue) < lo->lo_min_dio_size) return false; if (lo->lo_offset & (lo->lo_min_dio_size - 1)) return false; return true; } /* * Direct I/O can be enabled either by using an O_DIRECT file descriptor, or by * passing in the LO_FLAGS_DIRECT_IO flag from userspace. It will be silently * disabled when the device block size is too small or the offset is unaligned. * * loop_get_status will always report the effective LO_FLAGS_DIRECT_IO flag and * not the originally passed in one. 
*/
static inline void loop_update_dio(struct loop_device *lo)
{
	/* Caller must hold lo_mutex. */
	lockdep_assert_held(&lo->lo_mutex);
	/* Warn once if the device is bound and its queue is not frozen. */
	WARN_ON_ONCE(lo->lo_state == Lo_bound &&
		     lo->lo_queue->mq_freeze_depth == 0);

	/* Drop the direct I/O flag when the current setup cannot honour it. */
	if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !lo_can_use_dio(lo))
		lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
}

/**
 * loop_set_size() - sets device size and notifies userspace
 * @lo: struct loop_device to set the size for
 * @size: new size of the loop device
 *
 * Callers must validate that the size passed into this function fits into
 * a sector_t, eg using loop_validate_size()
 */
static void loop_set_size(struct loop_device *lo, loff_t size)
{
	/* Send KOBJ_CHANGE ourselves when set_capacity_and_notify() returns false. */
	if (!set_capacity_and_notify(lo->lo_disk, size))
		kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
}

/*
 * Zero the queue limits that correspond to the fallocate @mode bits:
 * write-zeroes for FALLOC_FL_ZERO_RANGE, discard for FALLOC_FL_PUNCH_HOLE.
 */
static void loop_clear_limits(struct loop_device *lo, int mode)
{
	struct queue_limits lim = queue_limits_start_update(lo->lo_queue);

	if (mode & FALLOC_FL_ZERO_RANGE)
		lim.max_write_zeroes_sectors = 0;

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		lim.max_hw_discard_sectors = 0;
		lim.discard_granularity = 0;
	}

	/*
	 * XXX: this updates the queue limits without freezing the queue, which
	 * is against the locking protocol and dangerous.  But we can't just
	 * freeze the queue as we're inside the ->queue_rq method here.  So this
	 * should move out into a workqueue unless we get the file operations to
	 * advertise if they support specific fallocate operations.
	 */
	queue_limits_commit_update(lo->lo_queue, &lim);
}

/*
 * Run ->fallocate() with @mode (plus FALLOC_FL_KEEP_SIZE) on the backing
 * file for the byte range of @rq starting at @pos.
 *
 * Returns -EOPNOTSUPP without calling the filesystem when the loop device
 * advertises no discard sectors, -EIO for any failure other than -EINVAL or
 * -EOPNOTSUPP, and the ->fallocate() result otherwise.
 */
static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos,
			int mode)
{
	/*
	 * We use fallocate to manipulate the space mappings used by the image
	 * a.k.a. discard/zerorange.
	 */
	struct file *file = lo->lo_backing_file;
	int ret;

	mode |= FALLOC_FL_KEEP_SIZE;

	if (!bdev_max_discard_sectors(lo->lo_device))
		return -EOPNOTSUPP;

	ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
	if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
		return -EIO;

	/*
	 * We initially configure the limits in a hope that fallocate is
	 * supported and clear them here if that turns out not to be true.
	 */
	if (unlikely(ret == -EOPNOTSUPP))
		loop_clear_limits(lo, mode);

	return ret;
}

/* fsync the backing file; any error other than -EINVAL is reported as -EIO. */
static int lo_req_flush(struct loop_device *lo, struct request *rq)
{
	int ret = vfs_fsync(lo->lo_backing_file, 0);
	if (unlikely(ret && ret != -EINVAL))
		ret = -EIO;

	return ret;
}

/*
 * Complete @rq based on cmd->ret.  Errors, fully transferred requests and
 * all non-READ operations end the request directly; short reads are either
 * advanced and requeued or, when no data arrived, zero-filled and failed.
 */
static void lo_complete_rq(struct request *rq)
{
	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
	blk_status_t ret = BLK_STS_OK;

	if (cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) ||
	    req_op(rq) != REQ_OP_READ) {
		if (cmd->ret < 0)
			ret = errno_to_blk_status(cmd->ret);
		goto end_io;
	}

	/*
	 * Short READ - if we got some data, advance our request and
	 * retry it. If we got no data, end the rest with EIO.
	 */
	if (cmd->ret) {
		blk_update_request(rq, BLK_STS_OK, cmd->ret);
		cmd->ret = 0;
		blk_mq_requeue_request(rq, true);
	} else {
		struct bio *bio = rq->bio;

		while (bio) {
			zero_fill_bio(bio);
			bio = bio->bi_next;
		}

		ret = BLK_STS_IOERR;
		/* The early-exit path above jumps into this branch. */
end_io:
		blk_mq_end_request(rq, ret);
	}
}

/*
 * Drop one reference on @cmd.  Whoever drops the last one frees the bvec
 * array, ends the write accounting for writes and completes the request.
 */
static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);

	if (!atomic_dec_and_test(&cmd->ref))
		return;
	kfree(cmd->bvec);
	cmd->bvec = NULL;
	if (req_op(rq) == REQ_OP_WRITE)
		kiocb_end_write(&cmd->iocb);
	if (likely(!blk_should_fake_timeout(rq->q)))
		blk_mq_complete_request(rq);
}

/* ki_complete callback: record the result and drop a reference. */
static void lo_rw_aio_complete(struct kiocb *iocb, long ret)
{
	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);

	cmd->ret = ret;
	lo_rw_aio_do_completion(cmd);
}

/*
 * Submit the data of @cmd's request to the backing file at byte offset @pos
 * through ->read_iter()/->write_iter().  @rw is ITER_SOURCE for writes and
 * ITER_DEST for reads.
 *
 * Returns -EIO if the bvec array cannot be allocated, otherwise always
 * -EIOCBQUEUED: the result is delivered through lo_rw_aio_complete().
 */
static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
		     loff_t pos, int rw)
{
	struct iov_iter iter;
	struct req_iterator rq_iter;
	struct bio_vec *bvec;
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct bio *bio = rq->bio;
	struct file *file = lo->lo_backing_file;
	struct bio_vec tmp;
	unsigned int offset;
	unsigned int nr_bvec;
	int ret;

	nr_bvec = blk_rq_nr_bvec(rq);

	/* More than one bio: flatten all segments into a single array. */
	if (rq->bio != rq->biotail) {

		bvec = kmalloc_objs(struct bio_vec, nr_bvec, GFP_NOIO);
		if (!bvec)
			return -EIO;
		cmd->bvec = bvec;

		/*
		 * The bios of the request may be started from the middle of
		 * the 'bvec' because of bio splitting, so we can't directly
		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
		 * API will take care of all details for us.
		 */
		rq_for_each_bvec(tmp, rq, rq_iter) {
			*bvec = tmp;
			bvec++;
		}
		/* Rewind to the start of the array filled above. */
		bvec = cmd->bvec;
		offset = 0;
	} else {
		/*
		 * Same here, this bio may be started from the middle of the
		 * 'bvec' because of bio splitting, so offset from the bvec
		 * must be passed to iov iterator
		 */
		offset = bio->bi_iter.bi_bvec_done;
		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
	}
	/*
	 * Two references: one dropped below once submission has returned,
	 * one dropped by lo_rw_aio_complete().
	 */
	atomic_set(&cmd->ref, 2);

	iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
	iter.iov_offset = offset;

	cmd->iocb.ki_pos = pos;
	cmd->iocb.ki_filp = file;
	cmd->iocb.ki_ioprio = req_get_ioprio(rq);
	if (cmd->use_aio) {
		cmd->iocb.ki_complete = lo_rw_aio_complete;
		cmd->iocb.ki_flags = IOCB_DIRECT;
	} else {
		cmd->iocb.ki_complete = NULL;
		cmd->iocb.ki_flags = 0;
	}

	if (rw == ITER_SOURCE) {
		kiocb_start_write(&cmd->iocb);
		ret = file->f_op->write_iter(&cmd->iocb, &iter);
	} else
		ret = file->f_op->read_iter(&cmd->iocb, &iter);

	lo_rw_aio_do_completion(cmd);

	/* Not queued: deliver the result ourselves. */
	if (ret != -EIOCBQUEUED)
		lo_rw_aio_complete(&cmd->iocb, ret);
	return -EIOCBQUEUED;
}

/*
 * Dispatch @rq to the matching backing-file operation.  The byte position
 * is the request's sector position scaled by 512 plus lo_offset.
 */
static int do_req_filebacked(struct loop_device *lo, struct request *rq)
{
	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
	loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;

	switch (req_op(rq)) {
	case REQ_OP_FLUSH:
		return lo_req_flush(lo, rq);
	case REQ_OP_WRITE_ZEROES:
		/*
		 * If the caller doesn't want deallocation, call zeroout to
		 * write zeroes the range.  Otherwise, punch them out.
		 */
		return lo_fallocate(lo, rq, pos,
			(rq->cmd_flags & REQ_NOUNMAP) ?
				FALLOC_FL_ZERO_RANGE :
				FALLOC_FL_PUNCH_HOLE);
	case REQ_OP_DISCARD:
		return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
	case REQ_OP_WRITE:
		return lo_rw_aio(lo, cmd, pos, ITER_SOURCE);
	case REQ_OP_READ:
		return lo_rw_aio(lo, cmd, pos, ITER_DEST);
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}
}

/* Rescan partitions under the disk's open_mutex; failures are only logged. */
static void loop_reread_partitions(struct loop_device *lo)
{
	int rc;

	mutex_lock(&lo->lo_disk->open_mutex);
	rc = bdev_disk_changed(lo->lo_disk, false);
	mutex_unlock(&lo->lo_disk->open_mutex);
	if (rc)
		pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
			__func__, lo->lo_number, lo->lo_file_name, rc);
}

/*
 * Return the minimum direct I/O size for the current backing file: the
 * filesystem's reported dio offset alignment if available, else the logical
 * block size of the superblock's block device, else SECTOR_SIZE.
 */
static unsigned int loop_query_min_dio_size(struct loop_device *lo)
{
	struct file *file = lo->lo_backing_file;
	struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev;
	struct kstat st;

	/*
	 * Use the minimal dio alignment of the file system if provided.
	 */
	if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) &&
	    (st.result_mask & STATX_DIOALIGN))
		return st.dio_offset_align;

	/*
	 * In a perfect world this wouldn't be needed, but as of Linux 6.13 only
	 * a handful of file systems support the STATX_DIOALIGN flag.
	 */
	if (sb_bdev)
		return bdev_logical_block_size(sb_bdev);
	return SECTOR_SIZE;
}

/* Non-zero when @file is a block device with the loop major number. */
static inline int is_loop_device(struct file *file)
{
	struct inode *i = file->f_mapping->host;

	return i && S_ISBLK(i->i_mode) && imajor(i) == LOOP_MAJOR;
}

/*
 * Check that @file may back the loop device @bdev.  Follows chains of loop
 * devices backed by loop devices: returns -EBADF if the chain leads back to
 * @bdev, -EINVAL if a device in the chain is not bound or if @file is
 * neither a regular file nor a block device, 0 otherwise.
 */
static int loop_validate_file(struct file *file, struct block_device *bdev)
{
	struct inode	*inode = file->f_mapping->host;
	struct file	*f = file;

	/* Avoid recursion */
	while (is_loop_device(f)) {
		struct loop_device *l;

		/* Walking other devices requires the global validate mutex. */
		lockdep_assert_held(&loop_validate_mutex);
		if (f->f_mapping->host->i_rdev == bdev->bd_dev)
			return -EBADF;

		l = I_BDEV(f->f_mapping->host)->bd_disk->private_data;
		if (l->lo_state != Lo_bound)
			return -EINVAL;
		/* Order wrt setting lo->lo_backing_file in loop_configure(). */
		rmb();
		f = l->lo_backing_file;
	}
	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
		return -EINVAL;
	return 0;
}

/*
 * Install @file as the backing file of @lo: save the mapping's gfp mask and
 * clear __GFP_IO and __GFP_FS from it, request direct I/O if the file was
 * opened with O_DIRECT, and cache the minimum direct I/O size.
 */
static void loop_assign_backing_file(struct loop_device *lo, struct file *file)
{
	lo->lo_backing_file = file;
	lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping);
	mapping_set_gfp_mask(file->f_mapping,
			lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS));
	if (lo->lo_backing_file->f_flags & O_DIRECT)
		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
	lo->lo_min_dio_size = loop_query_min_dio_size(lo);
}

/*
 * A backing file needs ->read_iter, and ->write_iter too when it was opened
 * for writing.  Returns 0 if usable, -EINVAL otherwise.
 */
static int loop_check_backing_file(struct file *file)
{
	if (!file->f_op->read_iter)
		return -EINVAL;

	if ((file->f_mode & FMODE_WRITE) && !file->f_op->write_iter)
		return -EINVAL;

	return 0;
}

/*
 * loop_change_fd switched the backing store of a loopback device to
 * a new file. This is useful for operating system installers to free up
 * the original file and in High Availability environments to switch to
 * an alternative location for the content in case of server meltdown.
 * This can only work if the loop device is used read-only, and if the
 * new backing store is the same size and type as the old backing store.
*/
static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
			  unsigned int arg)
{
	struct file *file = fget(arg);
	struct file *old_file;
	unsigned int memflags;
	int error;
	bool partscan;
	bool is_loop;

	if (!file)
		return -EBADF;

	error = loop_check_backing_file(file);
	if (error) {
		fput(file);
		return error;
	}

	/* suppress uevents while reconfiguring the device */
	dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1);

	is_loop = is_loop_device(file);
	error = loop_global_lock_killable(lo, is_loop);
	if (error)
		goto out_putf;
	error = -ENXIO;
	if (lo->lo_state != Lo_bound)
		goto out_err;

	/* the loop device has to be read-only */
	error = -EINVAL;
	if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
		goto out_err;

	error = loop_validate_file(file, bdev);
	if (error)
		goto out_err;

	old_file = lo->lo_backing_file;

	error = -EINVAL;

	/* size of the new backing store needs to be the same */
	if (lo_calculate_size(lo, file) != lo_calculate_size(lo, old_file))
		goto out_err;

	/*
	 * We might switch to direct I/O mode for the loop device, so write
	 * back all dirty data in the page cache now so that the individual
	 * I/O operations don't have to do that.
	 */
	vfs_fsync(file, 0);

	/* and ... switch */
	disk_force_media_change(lo->lo_disk);
	memflags = blk_mq_freeze_queue(lo->lo_queue);
	/* Give the old mapping its original gfp mask back before replacing it. */
	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
	loop_assign_backing_file(lo, file);
	loop_update_dio(lo);
	blk_mq_unfreeze_queue(lo->lo_queue, memflags);
	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
	loop_global_unlock(lo, is_loop);

	/*
	 * Flush loop_validate_file() before fput(), for l->lo_backing_file
	 * might be pointing at old_file which might be the last reference.
	 */
	if (!is_loop) {
		mutex_lock(&loop_validate_mutex);
		mutex_unlock(&loop_validate_mutex);
	}
	/*
	 * We must drop file reference outside of lo_mutex as dropping
	 * the file ref can take open_mutex which creates circular locking
	 * dependency.
	 */
	fput(old_file);
	dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0);
	if (partscan)
		loop_reread_partitions(lo);

	error = 0;
done:
	/* Reached on success and, via the error labels below, on failure. */
	kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
	return error;

out_err:
	loop_global_unlock(lo, is_loop);
out_putf:
	fput(file);
	dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0);
	goto done;
}

/* loop sysfs attributes */

/* Resolve the loop device behind @dev and let @callback format into @page. */
static ssize_t loop_attr_show(struct device *dev, char *page,
			      ssize_t (*callback)(struct loop_device *, char *))
{
	struct gendisk *disk = dev_to_disk(dev);
	struct loop_device *lo = disk->private_data;

	return callback(lo, page);
}

/*
 * Declare a read-only (0444) device attribute named _name whose show method
 * forwards to loop_attr__name_show() through loop_attr_show().
 */
#define LOOP_ATTR_RO(_name)						\
static ssize_t loop_attr_##_name##_show(struct loop_device *, char *);	\
static ssize_t loop_attr_do_show_##_name(struct device *d,		\
				struct device_attribute *attr, char *b)	\
{									\
	return loop_attr_show(d, b, loop_attr_##_name##_show);		\
}									\
static struct device_attribute loop_attr_##_name =			\
	__ATTR(_name, 0444, loop_attr_do_show_##_name, NULL);

/*
 * Emit the path of the backing file followed by a newline.  The path is
 * looked up under lo_lock.  Returns the number of bytes written, or the
 * PTR_ERR() value of the lookup (0 when there is no backing file).
 */
static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
{
	ssize_t ret;
	char *p = NULL;

	spin_lock_irq(&lo->lo_lock);
	if (lo->lo_backing_file)
		p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1);
	spin_unlock_irq(&lo->lo_lock);

	if (IS_ERR_OR_NULL(p))
		ret = PTR_ERR(p);
	else {
		ret = strlen(p);
		/* The path may start in the middle of buf; move it to the front. */
		memmove(buf, p, ret);
		buf[ret++] = '\n';
		buf[ret] = 0;
	}

	return ret;
}

/* Emit lo_offset as an unsigned decimal number. */
static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
{
	return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_offset);
}

/* Emit lo_sizelimit as an unsigned decimal number. */
static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
{
	return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
}

/* Emit "1" if LO_FLAGS_AUTOCLEAR is set, "0" otherwise. */
static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
{
	int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);

	return sysfs_emit(buf, "%s\n", autoclear ? "1" : "0");
}

/* Emit "1" if LO_FLAGS_PARTSCAN is set, "0" otherwise. */
static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf)
{
	int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN);

	return sysfs_emit(buf, "%s\n", partscan ? "1" : "0");
}

/* Emit "1" if LO_FLAGS_DIRECT_IO is set, "0" otherwise. */
static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf)
{
	int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO);

	return sysfs_emit(buf, "%s\n", dio ? "1" : "0");
}

LOOP_ATTR_RO(backing_file);
LOOP_ATTR_RO(offset);
LOOP_ATTR_RO(sizelimit);
LOOP_ATTR_RO(autoclear);
LOOP_ATTR_RO(partscan);
LOOP_ATTR_RO(dio);

/* Attributes published in the "loop" sysfs group of the disk device. */
static struct attribute *loop_attrs[] = {
	&loop_attr_backing_file.attr,
	&loop_attr_offset.attr,
	&loop_attr_sizelimit.attr,
	&loop_attr_autoclear.attr,
	&loop_attr_partscan.attr,
	&loop_attr_dio.attr,
	NULL,
};

static struct attribute_group loop_attribute_group = {
	.name = "loop",
	.attrs= loop_attrs,
};

/* Create the sysfs group and remember whether that succeeded. */
static void loop_sysfs_init(struct loop_device *lo)
{
	lo->sysfs_inited = !sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
						&loop_attribute_group);
}

/* Remove the sysfs group if loop_sysfs_init() created it. */
static void loop_sysfs_exit(struct loop_device *lo)
{
	if (lo->sysfs_inited)
		sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
				   &loop_attribute_group);
}

/*
 * Derive discard parameters from the backing file.  @granularity and
 * @max_discard_sectors are written only when the backing store is a block
 * device, or has ->fallocate and vfs_statfs() succeeds; otherwise both keep
 * the caller's values.
 */
static void loop_get_discard_config(struct loop_device *lo,
				    u32 *granularity, u32 *max_discard_sectors)
{
	struct file *file = lo->lo_backing_file;
	struct inode *inode = file->f_mapping->host;
	struct kstatfs sbuf;

	/*
	 * If the backing device is a block device, mirror its zeroing
	 * capability. Set the discard sectors to the block device's zeroing
	 * capabilities because loop discards result in blkdev_issue_zeroout(),
	 * not blkdev_issue_discard(). This maintains consistent behavior with
	 * file-backed loop devices: discarded regions read back as zero.
	 */
	if (S_ISBLK(inode->i_mode)) {
		struct block_device *bdev = I_BDEV(inode);

		*max_discard_sectors = bdev_write_zeroes_sectors(bdev);
		*granularity = bdev_discard_granularity(bdev);

	/*
	 * We use punch hole to reclaim the free space used by the
	 * image a.k.a. discard.
	 */
	} else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) {
		*max_discard_sectors = UINT_MAX >> 9;
		*granularity = sbuf.f_bsize;
	}
}

/* Work context that handles the commands of one blkcg_css on a device. */
struct loop_worker {
	/* Node in lo->worker_tree. */
	struct rb_node rb_node;
	struct work_struct work;
	/* Commands queued for this worker. */
	struct list_head cmd_list;
	/* Link in lo->idle_worker_list; empty while not idle. */
	struct list_head idle_list;
	struct loop_device *lo;
	/* Tree key; a reference is held while the worker exists. */
	struct cgroup_subsys_state *blkcg_css;
	/* Compared against the idle timeout in loop_free_idle_workers(). */
	unsigned long last_ran_at;
};

static void loop_workfn(struct work_struct *work);

/* True when a command for @css should use the device's root work item. */
#ifdef CONFIG_BLK_CGROUP
static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
{
	return !css || css == blkcg_root_css;
}
#else
static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
{
	return !css;
}
#endif

/*
 * Queue @cmd on the worker matching its blkcg_css, creating that worker on
 * first use, or on the root work item when no per-cgroup worker applies or
 * none can be allocated.  Runs entirely under lo_work_lock.
 */
static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
{
	struct rb_node **node, *parent = NULL;
	struct loop_worker *cur_worker, *worker = NULL;
	struct work_struct *work;
	struct list_head *cmd_list;

	spin_lock_irq(&lo->lo_work_lock);

	if (queue_on_root_worker(cmd->blkcg_css))
		goto queue_work;

	/* Workers are ordered by the numeric value of their blkcg_css pointer. */
	node = &lo->worker_tree.rb_node;

	while (*node) {
		parent = *node;
		cur_worker = container_of(*node, struct loop_worker, rb_node);
		if (cur_worker->blkcg_css == cmd->blkcg_css) {
			worker = cur_worker;
			break;
		} else if ((long)cur_worker->blkcg_css < (long)cmd->blkcg_css) {
			node = &(*node)->rb_left;
		} else {
			node = &(*node)->rb_right;
		}
	}
	if (worker)
		goto queue_work;

	worker = kzalloc_obj(struct loop_worker, GFP_NOWAIT);
	/*
	 * In the event we cannot allocate a worker, just queue on the
	 * rootcg worker and issue the I/O as the rootcg
	 */
	if (!worker) {
		cmd->blkcg_css = NULL;
		if (cmd->memcg_css)
			css_put(cmd->memcg_css);
		cmd->memcg_css = NULL;
		goto queue_work;
	}

	worker->blkcg_css = cmd->blkcg_css;
	css_get(worker->blkcg_css);
	INIT_WORK(&worker->work, loop_workfn);
	INIT_LIST_HEAD(&worker->cmd_list);
	INIT_LIST_HEAD(&worker->idle_list);
	worker->lo = lo;
	/* Link at the position found by the search above. */
	rb_link_node(&worker->rb_node, parent, node);
	rb_insert_color(&worker->rb_node, &lo->worker_tree);
queue_work:
	if (worker) {
		/*
		 * We need to remove from the idle list here while
		 * holding the lock so that the idle timer doesn't
		 * free the worker
		 */
		if (!list_empty(&worker->idle_list))
			list_del_init(&worker->idle_list);
		work = &worker->work;
		cmd_list = &worker->cmd_list;
	} else {
		work = &lo->rootcg_work;
		cmd_list = &lo->rootcg_cmd_list;
	}
	list_add_tail(&cmd->list_entry, cmd_list);
	queue_work(lo->workqueue, work);
	spin_unlock_irq(&lo->lo_work_lock);
}

/* Arm the idle-worker timer for LOOP_IDLE_WORKER_TIMEOUT from now. */
static void loop_set_timer(struct loop_device *lo)
{
	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
}

/*
 * Free workers on the idle list.  With @delete_all false, stop at the first
 * worker whose idle timeout has not expired yet.  Re-arms the timer if idle
 * workers remain.
 */
static void loop_free_idle_workers(struct loop_device *lo, bool delete_all)
{
	struct loop_worker *pos, *worker;

	spin_lock_irq(&lo->lo_work_lock);
	/* Note: 'worker' is the cursor here, 'pos' the saved next entry. */
	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
				idle_list) {
		if (!delete_all &&
		    time_is_after_jiffies(worker->last_ran_at +
					  LOOP_IDLE_WORKER_TIMEOUT))
			break;
		list_del(&worker->idle_list);
		rb_erase(&worker->rb_node, &lo->worker_tree);
		css_put(worker->blkcg_css);
		kfree(worker);
	}
	if (!list_empty(&lo->idle_worker_list))
		loop_set_timer(lo);
	spin_unlock_irq(&lo->lo_work_lock);
}

/* Timer callback: free only workers whose idle timeout has expired. */
static void loop_free_idle_workers_timer(struct timer_list *timer)
{
	struct loop_device *lo = container_of(timer, struct loop_device, timer);

	return loop_free_idle_workers(lo, false);
}

/**
 * loop_set_status_from_info - configure device from loop_info
 * @lo: struct loop_device to configure
 * @info: struct loop_info64 to configure the device with
 *
 * Configures the loop device parameters according to the passed
 * in loop_info64 configuration.
 *
 * Returns 0 on success, -EINVAL for an oversized key size or any encryption
 * type other than LO_CRYPT_NONE, and -EOVERFLOW when the offset or size
 * limit exceeds LLONG_MAX.  @lo is modified only on success.
 */
static int
loop_set_status_from_info(struct loop_device *lo,
			  const struct loop_info64 *info)
{
	if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
		return -EINVAL;

	switch (info->lo_encrypt_type) {
	case LO_CRYPT_NONE:
		break;
	case LO_CRYPT_XOR:
		pr_warn("support for the xor transformation has been removed.\n");
		return -EINVAL;
	case LO_CRYPT_CRYPTOAPI:
		pr_warn("support for cryptoloop has been removed.  Use dm-crypt instead.\n");
		return -EINVAL;
	default:
		return -EINVAL;
	}

	/* Avoid assigning overflow values */
	if (info->lo_offset > LLONG_MAX || info->lo_sizelimit > LLONG_MAX)
		return -EOVERFLOW;

	lo->lo_offset = info->lo_offset;
	lo->lo_sizelimit = info->lo_sizelimit;

	memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
	/* Force NUL termination of the copied name. */
	lo->lo_file_name[LO_NAME_SIZE-1] = 0;
	return 0;
}

/* Block size used when the caller does not request a specific one. */
static unsigned int loop_default_blocksize(struct loop_device *lo)
{
	/* In case of direct I/O, match underlying minimum I/O size */
	if (lo->lo_flags & LO_FLAGS_DIRECT_IO)
		return lo->lo_min_dio_size;
	return SECTOR_SIZE;
}

/*
 * Fill @lim for the current backing file: block sizes (@bsize, or the
 * default when @bsize is 0), the write-cache and rotational features, and
 * the discard/write-zeroes limits.  The caller commits the limits.
 */
static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
		unsigned int bsize)
{
	struct file *file = lo->lo_backing_file;
	struct inode *inode = file->f_mapping->host;
	struct block_device *backing_bdev = NULL;
	u32 granularity = 0, max_discard_sectors = 0;

	/* The backing block device, or the one below the file's filesystem. */
	if (S_ISBLK(inode->i_mode))
		backing_bdev = I_BDEV(inode);
	else if (inode->i_sb->s_bdev)
		backing_bdev = inode->i_sb->s_bdev;

	if (!bsize)
		bsize = loop_default_blocksize(lo);

	loop_get_discard_config(lo, &granularity, &max_discard_sectors);

	lim->logical_block_size = bsize;
	lim->physical_block_size = bsize;
	lim->io_min = bsize;
	lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
	if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY))
		lim->features |= BLK_FEAT_WRITE_CACHE;
	if (backing_bdev && bdev_rot(backing_bdev))
		lim->features |= BLK_FEAT_ROTATIONAL;
	lim->max_hw_discard_sectors = max_discard_sectors;
	lim->max_write_zeroes_sectors = max_discard_sectors;
	if (max_discard_sectors)
		lim->discard_granularity = granularity;
	else
		lim->discard_granularity = 0;
}

/*
 * Bind the file referenced by config->fd to the unbound loop device @lo and
 * move it to Lo_bound.  A module reference is taken for the lifetime of the
 * binding and dropped again on every error path.
 *
 * Returns 0 on success or a negative errno: -EBADF for a bad descriptor,
 * -EBUSY when @lo is not unbound, -EINVAL for unsupported flags, -ENOMEM
 * when the workqueue cannot be allocated, or an error from a helper.
 */
static int loop_configure(struct loop_device *lo, blk_mode_t mode,
			  struct block_device *bdev,
			  const struct loop_config *config)
{
	struct file *file = fget(config->fd);
	struct queue_limits lim;
	int error;
	loff_t size;
	bool partscan;
	bool is_loop;

	if (!file)
		return -EBADF;

	error = loop_check_backing_file(file);
	if (error) {
		fput(file);
		return error;
	}

	is_loop = is_loop_device(file);

	/* This is safe, since we have a reference from open(). */
	__module_get(THIS_MODULE);

	/*
	 * If we don't hold exclusive handle for the device, upgrade to it
	 * here to avoid changing device under exclusive owner.
	 */
	if (!(mode & BLK_OPEN_EXCL)) {
		error = bd_prepare_to_claim(bdev, loop_configure, NULL);
		if (error)
			goto out_putf;
	}

	error = loop_global_lock_killable(lo, is_loop);
	if (error)
		goto out_bdev;

	error = -EBUSY;
	if (lo->lo_state != Lo_unbound)
		goto out_unlock;

	error = loop_validate_file(file, bdev);
	if (error)
		goto out_unlock;

	if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) {
		error = -EINVAL;
		goto out_unlock;
	}

	error = loop_set_status_from_info(lo, &config->info);
	if (error)
		goto out_unlock;
	lo->lo_flags = config->info.lo_flags;

	/* Read-only unless file, open mode and file operations all allow writes. */
	if (!(file->f_mode & FMODE_WRITE) || !(mode & BLK_OPEN_WRITE) ||
	    !file->f_op->write_iter)
		lo->lo_flags |= LO_FLAGS_READ_ONLY;

	/* An existing workqueue is reused. */
	if (!lo->workqueue) {
		lo->workqueue = alloc_workqueue("loop%d",
						WQ_UNBOUND | WQ_FREEZABLE,
						0, lo->lo_number);
		if (!lo->workqueue) {
			error = -ENOMEM;
			goto out_unlock;
		}
	}

	/* suppress uevents while reconfiguring the device */
	dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1);

	disk_force_media_change(lo->lo_disk);
	set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0);

	lo->lo_device = bdev;
	loop_assign_backing_file(lo, file);

	lim = queue_limits_start_update(lo->lo_queue);
	loop_update_limits(lo, &lim, config->block_size);
	/* No need to freeze the queue as the device isn't bound yet. */
	error = queue_limits_commit_update(lo->lo_queue, &lim);
	if (error)
		goto out_unlock;

	/*
	 * We might switch to direct I/O mode for the loop device, so write
	 * back all dirty data in the page cache now so that the individual
	 * I/O operations don't have to do that.
	 */
	vfs_fsync(file, 0);

	loop_update_dio(lo);
	loop_sysfs_init(lo);

	size = lo_calculate_size(lo, file);
	loop_set_size(lo, size);

	/* Order wrt reading lo_state in loop_validate_file(). */
	wmb();

	WRITE_ONCE(lo->lo_state, Lo_bound);
	if (part_shift)
		lo->lo_flags |= LO_FLAGS_PARTSCAN;
	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
	if (partscan)
		clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);

	dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0);
	kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);

	loop_global_unlock(lo, is_loop);
	/* The partition scan runs after the locks have been dropped. */
	if (partscan)
		loop_reread_partitions(lo);

	if (!(mode & BLK_OPEN_EXCL))
		bd_abort_claiming(bdev, loop_configure);

	return 0;

out_unlock:
	loop_global_unlock(lo, is_loop);
out_bdev:
	if (!(mode & BLK_OPEN_EXCL))
		bd_abort_claiming(bdev, loop_configure);
out_putf:
	fput(file);
	/* This is safe: open() is still holding a reference. */
	module_put(THIS_MODULE);
	return error;
}

/*
 * Detach the backing file from @lo and return the device to Lo_unbound:
 * reset offset, size limit, name and block sizes, invalidate the disk,
 * remove the sysfs group, restore the backing mapping's gfp mask, drop the
 * module reference and finally release the file.
 */
static void __loop_clr_fd(struct loop_device *lo)
{
	struct queue_limits lim;
	struct file *filp;
	gfp_t gfp = lo->old_gfp_mask;

	/* Detach the file under lo_lock, which the sysfs show path also takes. */
	spin_lock_irq(&lo->lo_lock);
	filp = lo->lo_backing_file;
	lo->lo_backing_file = NULL;
	spin_unlock_irq(&lo->lo_lock);

	lo->lo_device = NULL;
	lo->lo_offset = 0;
	lo->lo_sizelimit = 0;
	memset(lo->lo_file_name, 0, LO_NAME_SIZE);

	/*
	 * Reset the block size to the default.
	 *
	 * No queue freezing needed because this is called from the final
	 * ->release call only, so there can't be any outstanding I/O.
	 */
	lim = queue_limits_start_update(lo->lo_queue);
	lim.logical_block_size = SECTOR_SIZE;
	lim.physical_block_size = SECTOR_SIZE;
	lim.io_min = SECTOR_SIZE;
	queue_limits_commit_update(lo->lo_queue, &lim);

	invalidate_disk(lo->lo_disk);
	loop_sysfs_exit(lo);
	/* let user-space know about this change */
	kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
	mapping_set_gfp_mask(filp->f_mapping, gfp);
	/* This is safe: open() is still holding a reference. */
	module_put(THIS_MODULE);

	disk_force_media_change(lo->lo_disk);

	if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
		int err;

		/*
		 * open_mutex has been held already in release path, so don't
		 * acquire it if this function is called in such case.
		 *
		 * If the reread partition isn't from release path, lo_refcnt
		 * must be at least one and it can only become zero when the
		 * current holder is released.
		 */
		err = bdev_disk_changed(lo->lo_disk, false);
		if (err)
			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
				__func__, lo->lo_number, err);
		/* Device is gone, no point in returning error */
	}

	/*
	 * lo->lo_state is set to Lo_unbound here after above partscan has
	 * finished. There cannot be anybody else entering __loop_clr_fd() as
	 * Lo_rundown state protects us from all the other places trying to
	 * change the 'lo' device.
	 */
	lo->lo_flags = 0;
	if (!part_shift)
		set_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
	mutex_lock(&lo->lo_mutex);
	WRITE_ONCE(lo->lo_state, Lo_unbound);
	mutex_unlock(&lo->lo_mutex);

	/*
	 * Need not hold lo_mutex to fput backing file. Calling fput holding
	 * lo_mutex triggers a circular lock dependency possibility warning as
	 * fput can take open_mutex which is usually taken before lo_mutex.
	 */
	fput(filp);
}

/*
 * Request removal of the backing file.  The device is only flagged
 * LO_FLAGS_AUTOCLEAR here (and moved to Lo_rundown when the caller is the
 * sole opener); this function does not call __loop_clr_fd() itself.
 * Returns -ENXIO if the device is not bound.
 */
static int loop_clr_fd(struct loop_device *lo)
{
	int err;

	/*
	 * Since lo_ioctl() is called without locks held, it is possible that
	 * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel.
	 *
	 * Therefore, use global lock when setting Lo_rundown state in order to
	 * make sure that loop_validate_file() will fail if the "struct file"
	 * which loop_configure()/loop_change_fd() found via fget() was this
	 * loop device.
	 */
	err = loop_global_lock_killable(lo, true);
	if (err)
		return err;
	if (lo->lo_state != Lo_bound) {
		loop_global_unlock(lo, true);
		return -ENXIO;
	}
	/*
	 * Mark the device for removing the backing device on last close.
	 * If we are the only opener, also switch the state to rundown here to
	 * prevent new openers from coming in.
	 */

	lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
	if (disk_openers(lo->lo_disk) == 1)
		WRITE_ONCE(lo->lo_state, Lo_rundown);
	loop_global_unlock(lo, true);

	return 0;
}

/*
 * Apply @info to a bound device with the queue frozen.  When the offset or
 * size limit changes, the loop block device is synced and invalidated first
 * and the capacity is recomputed afterwards.  A partition rescan is started
 * when LO_FLAGS_PARTSCAN is newly requested.
 */
static int
loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
{
	int err;
	bool partscan = false;
	bool size_changed = false;
	unsigned int memflags;

	err = mutex_lock_killable(&lo->lo_mutex);
	if (err)
		return err;
	if (lo->lo_state != Lo_bound) {
		err = -ENXIO;
		goto out_unlock;
	}

	if (lo->lo_offset != info->lo_offset ||
	    lo->lo_sizelimit != info->lo_sizelimit) {
		size_changed = true;
		sync_blockdev(lo->lo_device);
		invalidate_bdev(lo->lo_device);
	}

	/* I/O needs to be drained before changing lo_offset or lo_sizelimit */
	memflags = blk_mq_freeze_queue(lo->lo_queue);

	err = loop_set_status_from_info(lo, info);
	if (err)
		goto out_unfreeze;

	/* Rescan only if partition scanning is being switched on. */
	partscan = !(lo->lo_flags & LO_FLAGS_PARTSCAN) &&
		(info->lo_flags & LO_FLAGS_PARTSCAN);

	lo->lo_flags &= ~LOOP_SET_STATUS_CLEARABLE_FLAGS;
	lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS);

	/* update the direct I/O flag if lo_offset changed */
	loop_update_dio(lo);

out_unfreeze:
	blk_mq_unfreeze_queue(lo->lo_queue, memflags);
	if (partscan)
		clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
	if (!err && size_changed) {
		loff_t new_size = lo_calculate_size(lo, lo->lo_backing_file);
		loop_set_size(lo, new_size);
	}
out_unlock:
	mutex_unlock(&lo->lo_mutex);
	/* Done after dropping lo_mutex. */
	if (partscan)
		loop_reread_partitions(lo);

	return err;
}

/*
 * Fill @info from a bound device.  The device, inode and rdevice fields
 * come from a vfs_getattr() call on the backing file made after lo_mutex
 * has been dropped.  Returns -ENXIO if not bound, or the getattr error.
 */
static int
loop_get_status(struct loop_device *lo, struct loop_info64 *info)
{
	struct path path;
	struct kstat stat;
	int ret;

	ret = mutex_lock_killable(&lo->lo_mutex);
	if (ret)
		return ret;
	if (lo->lo_state != Lo_bound) {
		mutex_unlock(&lo->lo_mutex);
		return -ENXIO;
	}

	memset(info, 0, sizeof(*info));
	info->lo_number = lo->lo_number;
	info->lo_offset = lo->lo_offset;
	info->lo_sizelimit = lo->lo_sizelimit;
	info->lo_flags = lo->lo_flags;
	memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);

	/* Drop lo_mutex while we call into the filesystem. */
	path = lo->lo_backing_file->f_path;
	/* Hold our own reference on the path across the unlocked getattr. */
	path_get(&path);
	mutex_unlock(&lo->lo_mutex);
	ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
	if (!ret) {
		info->lo_device = huge_encode_dev(stat.dev);
		info->lo_inode = stat.ino;
		info->lo_rdevice = huge_encode_dev(stat.rdev);
	}
	path_put(&path);
	return ret;
}

/* Widen a legacy struct loop_info into loop_info64; the size limit is 0. */
static void
loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
{
	memset(info64, 0, sizeof(*info64));
	info64->lo_number = info->lo_number;
	info64->lo_device = info->lo_device;
	info64->lo_inode = info->lo_inode;
	info64->lo_rdevice = info->lo_rdevice;
	info64->lo_offset = info->lo_offset;
	info64->lo_sizelimit = 0;
	info64->lo_flags = info->lo_flags;
	memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
}

/*
 * Narrow loop_info64 into the legacy struct loop_info.  Returns -EOVERFLOW
 * if the device, rdevice, inode or offset value does not survive the
 * conversion, 0 otherwise.
 */
static int
loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
{
	memset(info, 0, sizeof(*info));
	info->lo_number = info64->lo_number;
	info->lo_device = info64->lo_device;
	info->lo_inode = info64->lo_inode;
	info->lo_rdevice = info64->lo_rdevice;
	info->lo_offset = info64->lo_offset;
	info->lo_flags = info64->lo_flags;
	memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);

	/* error in case values were truncated */
	if (info->lo_device != info64->lo_device ||
	    info->lo_rdevice != info64->lo_rdevice ||
	    info->lo_inode != info64->lo_inode ||
	    info->lo_offset != info64->lo_offset)
		return -EOVERFLOW;

	return 0;
}

/* Copy a legacy struct loop_info from userspace and apply it. */
static int
loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
{
	struct loop_info info;
	struct loop_info64 info64;

	if (copy_from_user(&info, arg, sizeof (struct loop_info)))
		return -EFAULT;
	loop_info64_from_old(&info, &info64);
	return loop_set_status(lo, &info64);
}

/* Copy a struct loop_info64 from userspace and apply it. */
static int
loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg)
{
	struct loop_info64 info64;

	if (copy_from_user(&info64, arg, sizeof (struct loop_info64)))
		return -EFAULT;
	return loop_set_status(lo, &info64);
}

/* Report the status to userspace in the legacy struct loop_info layout. */
static int
loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
	struct loop_info info;
	struct loop_info64 info64;
	int err;

	if (!arg)
		return -EINVAL;
	err = loop_get_status(lo, &info64);
	if (!err)
		err = loop_info64_to_old(&info64, &info);
	if (!err && copy_to_user(arg, &info, sizeof(info)))
		err = -EFAULT;

	return err;
}

/* Report the status to userspace as struct loop_info64. */
static int
loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
	struct loop_info64 info64;
	int err;

	if (!arg)
		return -EINVAL;
	err = loop_get_status(lo, &info64);
	if (!err && copy_to_user(arg, &info64, sizeof(info64)))
		err = -EFAULT;

	return err;
}

/*
 * Recompute the device size from the backing file.  Called with lo_mutex
 * held by lo_simple_ioctl().  Returns -ENXIO if the device is not bound.
 */
static int loop_set_capacity(struct loop_device *lo)
{
	loff_t size;

	if (unlikely(lo->lo_state != Lo_bound))
		return -ENXIO;

	size = lo_calculate_size(lo, lo->lo_backing_file);
	loop_set_size(lo, size);

	return 0;
}

/*
 * Switch direct I/O on (@arg non-zero) or off.  Called with lo_mutex held
 * by lo_simple_ioctl().  Returns -ENXIO if not bound, -EINVAL if direct I/O
 * was requested but cannot be used, 0 otherwise (including no change).
 */
static int loop_set_dio(struct loop_device *lo, unsigned long arg)
{
	bool use_dio = !!arg;
	unsigned int memflags;

	if (lo->lo_state != Lo_bound)
		return -ENXIO;
	if (use_dio == !!(lo->lo_flags & LO_FLAGS_DIRECT_IO))
		return 0;

	if (use_dio) {
		if (!lo_can_use_dio(lo))
			return -EINVAL;
		/* flush dirty pages before starting to use direct I/O */
		vfs_fsync(lo->lo_backing_file, 0);
	}

	/* Flip the flag while the queue is frozen. */
	memflags = blk_mq_freeze_queue(lo->lo_queue);
	if (use_dio)
		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
	else
		lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
	blk_mq_unfreeze_queue(lo->lo_queue, memflags);
	return 0;
}

/*
 * Change the logical block size of a bound device to @arg.  Nothing is done
 * when the size is already in effect.  Returns -ENXIO if the device is not
 * bound, or the error from committing the new queue limits.
 */
static int loop_set_block_size(struct loop_device *lo, blk_mode_t mode,
			       struct block_device *bdev, unsigned long arg)
{
	struct queue_limits lim;
	unsigned int memflags;
	int err = 0;

	/*
	 * If we don't hold exclusive handle for the device, upgrade to it
	 * here to avoid changing device under exclusive owner.
	 */
	if (!(mode & BLK_OPEN_EXCL)) {
		err = bd_prepare_to_claim(bdev, loop_set_block_size, NULL);
		if (err)
			return err;
	}

	err = mutex_lock_killable(&lo->lo_mutex);
	if (err)
		goto abort_claim;

	if (lo->lo_state != Lo_bound) {
		err = -ENXIO;
		goto unlock;
	}

	/* err is still 0 here: an unchanged size is a successful no-op. */
	if (lo->lo_queue->limits.logical_block_size == arg)
		goto unlock;

	sync_blockdev(lo->lo_device);
	invalidate_bdev(lo->lo_device);

	lim = queue_limits_start_update(lo->lo_queue);
	loop_update_limits(lo, &lim, arg);

	memflags = blk_mq_freeze_queue(lo->lo_queue);
	err = queue_limits_commit_update(lo->lo_queue, &lim);
	loop_update_dio(lo);
	blk_mq_unfreeze_queue(lo->lo_queue, memflags);

unlock:
	mutex_unlock(&lo->lo_mutex);
abort_claim:
	if (!(mode & BLK_OPEN_EXCL))
		bd_abort_claiming(bdev, loop_set_block_size);
	return err;
}

/*
 * Handle the ioctls that only need lo_mutex: LOOP_SET_CAPACITY and
 * LOOP_SET_DIRECT_IO.  Any other @cmd yields -EINVAL.
 */
static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
			   unsigned long arg)
{
	int err;

	err = mutex_lock_killable(&lo->lo_mutex);
	if (err)
		return err;
	switch (cmd) {
	case LOOP_SET_CAPACITY:
		err = loop_set_capacity(lo);
		break;
	case LOOP_SET_DIRECT_IO:
		err = loop_set_dio(lo, arg);
		break;
	default:
		err = -EINVAL;
	}
	mutex_unlock(&lo->lo_mutex);
	return err;
}

static int lo_ioctl(struct block_device *bdev, blk_mode_t mode,
	unsigned int cmd, unsigned long arg)
{
	struct loop_device *lo = bdev->bd_disk->private_data;
	void __user *argp = (void __user *) arg;
	int err;

	switch (cmd) {
	case LOOP_SET_FD: {
		/*
		 * Legacy case - pass in a zeroed out struct loop_config with
		 * only the file descriptor set , which corresponds with the
		 * default parameters we'd have used otherwise.
*/ struct loop_config config; memset(&config, 0, sizeof(config)); config.fd = arg; return loop_configure(lo, mode, bdev, &config); } case LOOP_CONFIGURE: { struct loop_config config; if (copy_from_user(&config, argp, sizeof(config))) return -EFAULT; return loop_configure(lo, mode, bdev, &config); } case LOOP_CHANGE_FD: return loop_change_fd(lo, bdev, arg); case LOOP_CLR_FD: return loop_clr_fd(lo); case LOOP_SET_STATUS: err = -EPERM; if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_status_old(lo, argp); break; case LOOP_GET_STATUS: return loop_get_status_old(lo, argp); case LOOP_SET_STATUS64: err = -EPERM; if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_status64(lo, argp); break; case LOOP_GET_STATUS64: return loop_get_status64(lo, argp); case LOOP_SET_BLOCK_SIZE: if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) return -EPERM; return loop_set_block_size(lo, mode, bdev, arg); case LOOP_SET_CAPACITY: case LOOP_SET_DIRECT_IO: if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) return -EPERM; fallthrough; default: err = lo_simple_ioctl(lo, cmd, arg); break; } return err; } #ifdef CONFIG_COMPAT struct compat_loop_info { compat_int_t lo_number; /* ioctl r/o */ compat_dev_t lo_device; /* ioctl r/o */ compat_ulong_t lo_inode; /* ioctl r/o */ compat_dev_t lo_rdevice; /* ioctl r/o */ compat_int_t lo_offset; compat_int_t lo_encrypt_type; /* obsolete, ignored */ compat_int_t lo_encrypt_key_size; /* ioctl w/o */ compat_int_t lo_flags; /* ioctl r/o */ char lo_name[LO_NAME_SIZE]; unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */ compat_ulong_t lo_init[2]; char reserved[4]; }; /* * Transfer 32-bit compatibility structure in userspace to 64-bit loop info * - noinlined to reduce stack space usage in main part of driver */ static noinline int loop_info64_from_compat(const struct compat_loop_info __user *arg, struct loop_info64 *info64) { struct compat_loop_info info; if (copy_from_user(&info, arg, sizeof(info))) return 
-EFAULT; memset(info64, 0, sizeof(*info64)); info64->lo_number = info.lo_number; info64->lo_device = info.lo_device; info64->lo_inode = info.lo_inode; info64->lo_rdevice = info.lo_rdevice; info64->lo_offset = info.lo_offset; info64->lo_sizelimit = 0; info64->lo_flags = info.lo_flags; memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE); return 0; } /* * Transfer 64-bit loop info to 32-bit compatibility structure in userspace * - noinlined to reduce stack space usage in main part of driver */ static noinline int loop_info64_to_compat(const struct loop_info64 *info64, struct compat_loop_info __user *arg) { struct compat_loop_info info; memset(&info, 0, sizeof(info)); info.lo_number = info64->lo_number; info.lo_device = info64->lo_device; info.lo_inode = info64->lo_inode; info.lo_rdevice = info64->lo_rdevice; info.lo_offset = info64->lo_offset; info.lo_flags = info64->lo_flags; memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE); /* error in case values were truncated */ if (info.lo_device != info64->lo_device || info.lo_rdevice != info64->lo_rdevice || info.lo_inode != info64->lo_inode || info.lo_offset != info64->lo_offset) return -EOVERFLOW; if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; return 0; } static int loop_set_status_compat(struct loop_device *lo, const struct compat_loop_info __user *arg) { struct loop_info64 info64; int ret; ret = loop_info64_from_compat(arg, &info64); if (ret < 0) return ret; return loop_set_status(lo, &info64); } static int loop_get_status_compat(struct loop_device *lo, struct compat_loop_info __user *arg) { struct loop_info64 info64; int err; if (!arg) return -EINVAL; err = loop_get_status(lo, &info64); if (!err) err = loop_info64_to_compat(&info64, arg); return err; } static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; int err; switch(cmd) { case LOOP_SET_STATUS: err = loop_set_status_compat(lo, (const 
struct compat_loop_info __user *)arg); break; case LOOP_GET_STATUS: err = loop_get_status_compat(lo, (struct compat_loop_info __user *)arg); break; case LOOP_SET_CAPACITY: case LOOP_CLR_FD: case LOOP_GET_STATUS64: case LOOP_SET_STATUS64: case LOOP_CONFIGURE: arg = (unsigned long) compat_ptr(arg); fallthrough; case LOOP_SET_FD: case LOOP_CHANGE_FD: case LOOP_SET_BLOCK_SIZE: case LOOP_SET_DIRECT_IO: err = lo_ioctl(bdev, mode, cmd, arg); break; default: err = -ENOIOCTLCMD; break; } return err; } #endif static int lo_open(struct gendisk *disk, blk_mode_t mode) { struct loop_device *lo = disk->private_data; int err; err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; if (lo->lo_state == Lo_deleting || lo->lo_state == Lo_rundown) err = -ENXIO; mutex_unlock(&lo->lo_mutex); return err; } static void lo_release(struct gendisk *disk) { struct loop_device *lo = disk->private_data; bool need_clear = false; if (disk_openers(disk) > 0) return; /* * Clear the backing device information if this is the last close of * a device that's been marked for auto clear, or on which LOOP_CLR_FD * has been called. */ mutex_lock(&lo->lo_mutex); if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) WRITE_ONCE(lo->lo_state, Lo_rundown); need_clear = (lo->lo_state == Lo_rundown); mutex_unlock(&lo->lo_mutex); if (need_clear) __loop_clr_fd(lo); } static void lo_free_disk(struct gendisk *disk) { struct loop_device *lo = disk->private_data; if (lo->workqueue) destroy_workqueue(lo->workqueue); loop_free_idle_workers(lo, true); timer_shutdown_sync(&lo->timer); mutex_destroy(&lo->lo_mutex); kfree(lo); } static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, .open = lo_open, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = lo_compat_ioctl, #endif .free_disk = lo_free_disk, }; /* * And now the modules code and kernel interface. */ /* * If max_loop is specified, create that many devices upfront. * This also becomes a hard limit. 
If max_loop is not specified, * the default isn't a hard limit (as before commit 85c50197716c * changed the default value from 0 for max_loop=0 reasons), just * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module * init time. Loop devices can be requested on-demand with the * /dev/loop-control interface, or be instantiated by accessing * a 'dead' device node. */ static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD static bool max_loop_specified; static int max_loop_param_set_int(const char *val, const struct kernel_param *kp) { int ret; ret = param_set_int(val, kp); if (ret < 0) return ret; max_loop_specified = true; return 0; } static const struct kernel_param_ops max_loop_param_ops = { .set = max_loop_param_set_int, .get = param_get_int, }; module_param_cb(max_loop, &max_loop_param_ops, &max_loop, 0444); MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); #else module_param(max_loop, int, 0444); MODULE_PARM_DESC(max_loop, "Initial number of loop devices"); #endif module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH; static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p) { int qd, ret; ret = kstrtoint(s, 0, &qd); if (ret < 0) return ret; if (qd < 1) return -EINVAL; hw_queue_depth = qd; return 0; } static const struct kernel_param_ops loop_hw_qdepth_param_ops = { .set = loop_set_hw_queue_depth, .get = param_get_int, }; device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. 
Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH)); MODULE_DESCRIPTION("Loopback device support"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); struct loop_device *lo = rq->q->queuedata; blk_mq_start_request(rq); if (data_race(READ_ONCE(lo->lo_state)) != Lo_bound) return BLK_STS_IOERR; switch (req_op(rq)) { case REQ_OP_FLUSH: case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: cmd->use_aio = false; break; default: cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; break; } /* always use the first bio's css */ cmd->blkcg_css = NULL; cmd->memcg_css = NULL; #ifdef CONFIG_BLK_CGROUP if (rq->bio) { cmd->blkcg_css = bio_blkcg_css(rq->bio); #ifdef CONFIG_MEMCG if (cmd->blkcg_css) { cmd->memcg_css = cgroup_get_e_css(cmd->blkcg_css->cgroup, &memory_cgrp_subsys); } #endif } #endif loop_queue_work(lo, cmd); return BLK_STS_OK; } static void loop_handle_cmd(struct loop_cmd *cmd) { struct cgroup_subsys_state *cmd_blkcg_css = cmd->blkcg_css; struct cgroup_subsys_state *cmd_memcg_css = cmd->memcg_css; struct request *rq = blk_mq_rq_from_pdu(cmd); const bool write = op_is_write(req_op(rq)); struct loop_device *lo = rq->q->queuedata; int ret = 0; struct mem_cgroup *old_memcg = NULL; if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) { ret = -EIO; goto failed; } /* We can block in this context, so ignore REQ_NOWAIT. */ if (rq->cmd_flags & REQ_NOWAIT) rq->cmd_flags &= ~REQ_NOWAIT; if (cmd_blkcg_css) kthread_associate_blkcg(cmd_blkcg_css); if (cmd_memcg_css) old_memcg = set_active_memcg( mem_cgroup_from_css(cmd_memcg_css)); /* * do_req_filebacked() may call blk_mq_complete_request() synchronously * or asynchronously if using aio. Hence, do not touch 'cmd' after * do_req_filebacked() has returned unless we are sure that 'cmd' has * not yet been completed. 
*/ ret = do_req_filebacked(lo, rq); if (cmd_blkcg_css) kthread_associate_blkcg(NULL); if (cmd_memcg_css) { set_active_memcg(old_memcg); css_put(cmd_memcg_css); } failed: /* complete non-aio request */ if (ret != -EIOCBQUEUED) { if (ret == -EOPNOTSUPP) cmd->ret = ret; else cmd->ret = ret ? -EIO : 0; if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } } static void loop_process_work(struct loop_worker *worker, struct list_head *cmd_list, struct loop_device *lo) { int orig_flags = current->flags; struct loop_cmd *cmd; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; spin_lock_irq(&lo->lo_work_lock); while (!list_empty(cmd_list)) { cmd = container_of( cmd_list->next, struct loop_cmd, list_entry); list_del(cmd_list->next); spin_unlock_irq(&lo->lo_work_lock); loop_handle_cmd(cmd); cond_resched(); spin_lock_irq(&lo->lo_work_lock); } /* * We only add to the idle list if there are no pending cmds * *and* the worker will not run again which ensures that it * is safe to free any worker on the idle list */ if (worker && !work_pending(&worker->work)) { worker->last_ran_at = jiffies; list_add_tail(&worker->idle_list, &lo->idle_worker_list); loop_set_timer(lo); } spin_unlock_irq(&lo->lo_work_lock); current->flags = orig_flags; } static void loop_workfn(struct work_struct *work) { struct loop_worker *worker = container_of(work, struct loop_worker, work); loop_process_work(worker, &worker->cmd_list, worker->lo); } static void loop_rootcg_workfn(struct work_struct *work) { struct loop_device *lo = container_of(work, struct loop_device, rootcg_work); loop_process_work(NULL, &lo->rootcg_cmd_list, lo); } static const struct blk_mq_ops loop_mq_ops = { .queue_rq = loop_queue_rq, .complete = lo_complete_rq, }; static int loop_add(int i) { struct queue_limits lim = { /* * Random number picked from the historic block max_sectors cap. 
*/ .max_hw_sectors = 2560u, }; struct loop_device *lo; struct gendisk *disk; int err; err = -ENOMEM; lo = kzalloc_obj(*lo); if (!lo) goto out; lo->worker_tree = RB_ROOT; INIT_LIST_HEAD(&lo->idle_worker_list); timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE); WRITE_ONCE(lo->lo_state, Lo_unbound); err = mutex_lock_killable(&loop_ctl_mutex); if (err) goto out_free_dev; /* allocate id, if @id >= 0, we're requesting that specific id */ if (i >= 0) { err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL); if (err == -ENOSPC) err = -EEXIST; } else { err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL); } mutex_unlock(&loop_ctl_mutex); if (err < 0) goto out_free_dev; i = err; lo->tag_set.ops = &loop_mq_ops; lo->tag_set.nr_hw_queues = 1; lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); if (err) goto out_free_idr; disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; } lo->lo_queue = lo->lo_disk->queue; /* * Disable partition scanning by default. The in-kernel partition * scanning can be requested individually per-device during its * setup. Userspace can always add and remove partitions from all * devices. The needed partition minors are allocated from the * extended minor space, the main loop device numbers will continue * to match the loop minors, regardless of the number of partitions * used. * * If max_part is given, partition scanning is globally enabled for * all loop devices. The minors for the main loop devices will be * multiples of max_part. 
* * Note: Global-for-all-devices, set-only-at-init, read-only module * parameteters like 'max_loop' and 'max_part' make things needlessly * complicated, are too static, inflexible and may surprise * userspace tools. Parameters like this in general should be avoided. */ if (!part_shift) set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); spin_lock_init(&lo->lo_work_lock); INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); INIT_LIST_HEAD(&lo->rootcg_cmd_list); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; disk->minors = 1 << part_shift; disk->fops = &lo_fops; disk->private_data = lo; disk->queue = lo->lo_queue; disk->events = DISK_EVENT_MEDIA_CHANGE; disk->event_flags = DISK_EVENT_FLAG_UEVENT; sprintf(disk->disk_name, "loop%d", i); /* Make this loop device reachable from pathname. */ err = add_disk(disk); if (err) goto out_cleanup_disk; /* Show this loop device. */ mutex_lock(&loop_ctl_mutex); lo->idr_visible = true; mutex_unlock(&loop_ctl_mutex); return i; out_cleanup_disk: put_disk(disk); out_cleanup_tags: blk_mq_free_tag_set(&lo->tag_set); out_free_idr: mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, i); mutex_unlock(&loop_ctl_mutex); out_free_dev: kfree(lo); out: return err; } static void loop_remove(struct loop_device *lo) { /* Make this loop device unreachable from pathname. 
*/ del_gendisk(lo->lo_disk); blk_mq_free_tag_set(&lo->tag_set); mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); mutex_unlock(&loop_ctl_mutex); put_disk(lo->lo_disk); } #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD static void loop_probe(dev_t dev) { int idx = MINOR(dev) >> part_shift; if (max_loop_specified && max_loop && idx >= max_loop) return; loop_add(idx); } #else #define loop_probe NULL #endif /* !CONFIG_BLOCK_LEGACY_AUTOLOAD */ static int loop_control_remove(int idx) { struct loop_device *lo; int ret; if (idx < 0) { pr_warn_once("deleting an unspecified loop device is not supported.\n"); return -EINVAL; } /* Hide this loop device for serialization. */ ret = mutex_lock_killable(&loop_ctl_mutex); if (ret) return ret; lo = idr_find(&loop_index_idr, idx); if (!lo || !lo->idr_visible) ret = -ENODEV; else lo->idr_visible = false; mutex_unlock(&loop_ctl_mutex); if (ret) return ret; /* Check whether this loop device can be removed. */ ret = mutex_lock_killable(&lo->lo_mutex); if (ret) goto mark_visible; if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) { mutex_unlock(&lo->lo_mutex); ret = -EBUSY; goto mark_visible; } /* Mark this loop device as no more bound, but not quite unbound yet */ WRITE_ONCE(lo->lo_state, Lo_deleting); mutex_unlock(&lo->lo_mutex); loop_remove(lo); return 0; mark_visible: /* Show this loop device again. */ mutex_lock(&loop_ctl_mutex); lo->idr_visible = true; mutex_unlock(&loop_ctl_mutex); return ret; } static int loop_control_get_free(int idx) { struct loop_device *lo; int id, ret; ret = mutex_lock_killable(&loop_ctl_mutex); if (ret) return ret; idr_for_each_entry(&loop_index_idr, lo, id) { /* * Hitting a race results in creating a new loop device * which is harmless. 
*/ if (lo->idr_visible && data_race(READ_ONCE(lo->lo_state)) == Lo_unbound) goto found; } mutex_unlock(&loop_ctl_mutex); return loop_add(-1); found: mutex_unlock(&loop_ctl_mutex); return id; } static long loop_control_ioctl(struct file *file, unsigned int cmd, unsigned long parm) { switch (cmd) { case LOOP_CTL_ADD: return loop_add(parm); case LOOP_CTL_REMOVE: return loop_control_remove(parm); case LOOP_CTL_GET_FREE: return loop_control_get_free(parm); default: return -ENOSYS; } } static const struct file_operations loop_ctl_fops = { .open = nonseekable_open, .unlocked_ioctl = loop_control_ioctl, .compat_ioctl = loop_control_ioctl, .owner = THIS_MODULE, .llseek = noop_llseek, }; static struct miscdevice loop_misc = { .minor = LOOP_CTRL_MINOR, .name = "loop-control", .fops = &loop_ctl_fops, }; MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR); MODULE_ALIAS("devname:loop-control"); static int __init loop_init(void) { int i; int err; part_shift = 0; if (max_part > 0) { part_shift = fls(max_part); /* * Adjust max_part according to part_shift as it is exported * to user space so that user can decide correct minor number * if [s]he want to create more devices. * * Note that -1 is required because partition 0 is reserved * for the whole disk. 
*/ max_part = (1UL << part_shift) - 1; } if ((1UL << part_shift) > DISK_MAX_PARTS) { err = -EINVAL; goto err_out; } if (max_loop > 1UL << (MINORBITS - part_shift)) { err = -EINVAL; goto err_out; } err = misc_register(&loop_misc); if (err < 0) goto err_out; if (__register_blkdev(LOOP_MAJOR, "loop", loop_probe)) { err = -EIO; goto misc_out; } /* pre-create number of devices given by config or max_loop */ for (i = 0; i < max_loop; i++) loop_add(i); printk(KERN_INFO "loop: module loaded\n"); return 0; misc_out: misc_deregister(&loop_misc); err_out: return err; } static void __exit loop_exit(void) { struct loop_device *lo; int id; unregister_blkdev(LOOP_MAJOR, "loop"); misc_deregister(&loop_misc); /* * There is no need to use loop_ctl_mutex here, for nobody else can * access loop_index_idr when this module is unloading (unless forced * module unloading is requested). If this is not a clean unloading, * we have no means to avoid kernel crash. */ idr_for_each_entry(&loop_index_idr, lo, id) loop_remove(lo); idr_destroy(&loop_index_idr); } module_init(loop_init); module_exit(loop_exit); #ifndef MODULE static int __init max_loop_setup(char *str) { max_loop = simple_strtol(str, NULL, 0); #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD max_loop_specified = true; #endif return 1; } __setup("max_loop=", max_loop_setup); #endif |
| 6 7 7 7 7 7 7 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 
521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * DES & Triple DES EDE Cipher Algorithms. 
* * Copyright (c) 2005 Dag Arne Osvik <da@osvik.no> */ #include <crypto/des.h> #include <crypto/internal/des.h> #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/crypto.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/fips.h> #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/types.h> #include <linux/unaligned.h> #define ROL(x, r) ((x) = rol32((x), (r))) #define ROR(x, r) ((x) = ror32((x), (r))) /* Lookup tables for key expansion */ static const u8 pc1[256] = { 0x00, 0x00, 0x40, 0x04, 0x10, 0x10, 0x50, 0x14, 0x04, 0x40, 0x44, 0x44, 0x14, 0x50, 0x54, 0x54, 0x02, 0x02, 0x42, 0x06, 0x12, 0x12, 0x52, 0x16, 0x06, 0x42, 0x46, 0x46, 0x16, 0x52, 0x56, 0x56, 0x80, 0x08, 0xc0, 0x0c, 0x90, 0x18, 0xd0, 0x1c, 0x84, 0x48, 0xc4, 0x4c, 0x94, 0x58, 0xd4, 0x5c, 0x82, 0x0a, 0xc2, 0x0e, 0x92, 0x1a, 0xd2, 0x1e, 0x86, 0x4a, 0xc6, 0x4e, 0x96, 0x5a, 0xd6, 0x5e, 0x20, 0x20, 0x60, 0x24, 0x30, 0x30, 0x70, 0x34, 0x24, 0x60, 0x64, 0x64, 0x34, 0x70, 0x74, 0x74, 0x22, 0x22, 0x62, 0x26, 0x32, 0x32, 0x72, 0x36, 0x26, 0x62, 0x66, 0x66, 0x36, 0x72, 0x76, 0x76, 0xa0, 0x28, 0xe0, 0x2c, 0xb0, 0x38, 0xf0, 0x3c, 0xa4, 0x68, 0xe4, 0x6c, 0xb4, 0x78, 0xf4, 0x7c, 0xa2, 0x2a, 0xe2, 0x2e, 0xb2, 0x3a, 0xf2, 0x3e, 0xa6, 0x6a, 0xe6, 0x6e, 0xb6, 0x7a, 0xf6, 0x7e, 0x08, 0x80, 0x48, 0x84, 0x18, 0x90, 0x58, 0x94, 0x0c, 0xc0, 0x4c, 0xc4, 0x1c, 0xd0, 0x5c, 0xd4, 0x0a, 0x82, 0x4a, 0x86, 0x1a, 0x92, 0x5a, 0x96, 0x0e, 0xc2, 0x4e, 0xc6, 0x1e, 0xd2, 0x5e, 0xd6, 0x88, 0x88, 0xc8, 0x8c, 0x98, 0x98, 0xd8, 0x9c, 0x8c, 0xc8, 0xcc, 0xcc, 0x9c, 0xd8, 0xdc, 0xdc, 0x8a, 0x8a, 0xca, 0x8e, 0x9a, 0x9a, 0xda, 0x9e, 0x8e, 0xca, 0xce, 0xce, 0x9e, 0xda, 0xde, 0xde, 0x28, 0xa0, 0x68, 0xa4, 0x38, 0xb0, 0x78, 0xb4, 0x2c, 0xe0, 0x6c, 0xe4, 0x3c, 0xf0, 0x7c, 0xf4, 0x2a, 0xa2, 0x6a, 0xa6, 0x3a, 0xb2, 0x7a, 0xb6, 0x2e, 0xe2, 0x6e, 0xe6, 0x3e, 0xf2, 0x7e, 0xf6, 0xa8, 0xa8, 0xe8, 0xac, 0xb8, 0xb8, 0xf8, 0xbc, 0xac, 0xe8, 0xec, 0xec, 0xbc, 0xf8, 0xfc, 0xfc, 0xaa, 0xaa, 
0xea, 0xae, 0xba, 0xba, 0xfa, 0xbe, 0xae, 0xea, 0xee, 0xee, 0xbe, 0xfa, 0xfe, 0xfe }; static const u8 rs[256] = { 0x00, 0x00, 0x80, 0x80, 0x02, 0x02, 0x82, 0x82, 0x04, 0x04, 0x84, 0x84, 0x06, 0x06, 0x86, 0x86, 0x08, 0x08, 0x88, 0x88, 0x0a, 0x0a, 0x8a, 0x8a, 0x0c, 0x0c, 0x8c, 0x8c, 0x0e, 0x0e, 0x8e, 0x8e, 0x10, 0x10, 0x90, 0x90, 0x12, 0x12, 0x92, 0x92, 0x14, 0x14, 0x94, 0x94, 0x16, 0x16, 0x96, 0x96, 0x18, 0x18, 0x98, 0x98, 0x1a, 0x1a, 0x9a, 0x9a, 0x1c, 0x1c, 0x9c, 0x9c, 0x1e, 0x1e, 0x9e, 0x9e, 0x20, 0x20, 0xa0, 0xa0, 0x22, 0x22, 0xa2, 0xa2, 0x24, 0x24, 0xa4, 0xa4, 0x26, 0x26, 0xa6, 0xa6, 0x28, 0x28, 0xa8, 0xa8, 0x2a, 0x2a, 0xaa, 0xaa, 0x2c, 0x2c, 0xac, 0xac, 0x2e, 0x2e, 0xae, 0xae, 0x30, 0x30, 0xb0, 0xb0, 0x32, 0x32, 0xb2, 0xb2, 0x34, 0x34, 0xb4, 0xb4, 0x36, 0x36, 0xb6, 0xb6, 0x38, 0x38, 0xb8, 0xb8, 0x3a, 0x3a, 0xba, 0xba, 0x3c, 0x3c, 0xbc, 0xbc, 0x3e, 0x3e, 0xbe, 0xbe, 0x40, 0x40, 0xc0, 0xc0, 0x42, 0x42, 0xc2, 0xc2, 0x44, 0x44, 0xc4, 0xc4, 0x46, 0x46, 0xc6, 0xc6, 0x48, 0x48, 0xc8, 0xc8, 0x4a, 0x4a, 0xca, 0xca, 0x4c, 0x4c, 0xcc, 0xcc, 0x4e, 0x4e, 0xce, 0xce, 0x50, 0x50, 0xd0, 0xd0, 0x52, 0x52, 0xd2, 0xd2, 0x54, 0x54, 0xd4, 0xd4, 0x56, 0x56, 0xd6, 0xd6, 0x58, 0x58, 0xd8, 0xd8, 0x5a, 0x5a, 0xda, 0xda, 0x5c, 0x5c, 0xdc, 0xdc, 0x5e, 0x5e, 0xde, 0xde, 0x60, 0x60, 0xe0, 0xe0, 0x62, 0x62, 0xe2, 0xe2, 0x64, 0x64, 0xe4, 0xe4, 0x66, 0x66, 0xe6, 0xe6, 0x68, 0x68, 0xe8, 0xe8, 0x6a, 0x6a, 0xea, 0xea, 0x6c, 0x6c, 0xec, 0xec, 0x6e, 0x6e, 0xee, 0xee, 0x70, 0x70, 0xf0, 0xf0, 0x72, 0x72, 0xf2, 0xf2, 0x74, 0x74, 0xf4, 0xf4, 0x76, 0x76, 0xf6, 0xf6, 0x78, 0x78, 0xf8, 0xf8, 0x7a, 0x7a, 0xfa, 0xfa, 0x7c, 0x7c, 0xfc, 0xfc, 0x7e, 0x7e, 0xfe, 0xfe }; static const u32 pc2[1024] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00040000, 0x00000000, 0x04000000, 0x00100000, 0x00400000, 0x00000008, 0x00000800, 0x40000000, 0x00440000, 0x00000008, 0x04000800, 0x40100000, 0x00000400, 0x00000020, 0x08000000, 0x00000100, 0x00040400, 0x00000020, 0x0c000000, 0x00100100, 0x00400400, 0x00000028, 
0x08000800, 0x40000100, 0x00440400, 0x00000028, 0x0c000800, 0x40100100, 0x80000000, 0x00000010, 0x00000000, 0x00800000, 0x80040000, 0x00000010, 0x04000000, 0x00900000, 0x80400000, 0x00000018, 0x00000800, 0x40800000, 0x80440000, 0x00000018, 0x04000800, 0x40900000, 0x80000400, 0x00000030, 0x08000000, 0x00800100, 0x80040400, 0x00000030, 0x0c000000, 0x00900100, 0x80400400, 0x00000038, 0x08000800, 0x40800100, 0x80440400, 0x00000038, 0x0c000800, 0x40900100, 0x10000000, 0x00000000, 0x00200000, 0x00001000, 0x10040000, 0x00000000, 0x04200000, 0x00101000, 0x10400000, 0x00000008, 0x00200800, 0x40001000, 0x10440000, 0x00000008, 0x04200800, 0x40101000, 0x10000400, 0x00000020, 0x08200000, 0x00001100, 0x10040400, 0x00000020, 0x0c200000, 0x00101100, 0x10400400, 0x00000028, 0x08200800, 0x40001100, 0x10440400, 0x00000028, 0x0c200800, 0x40101100, 0x90000000, 0x00000010, 0x00200000, 0x00801000, 0x90040000, 0x00000010, 0x04200000, 0x00901000, 0x90400000, 0x00000018, 0x00200800, 0x40801000, 0x90440000, 0x00000018, 0x04200800, 0x40901000, 0x90000400, 0x00000030, 0x08200000, 0x00801100, 0x90040400, 0x00000030, 0x0c200000, 0x00901100, 0x90400400, 0x00000038, 0x08200800, 0x40801100, 0x90440400, 0x00000038, 0x0c200800, 0x40901100, 0x00000200, 0x00080000, 0x00000000, 0x00000004, 0x00040200, 0x00080000, 0x04000000, 0x00100004, 0x00400200, 0x00080008, 0x00000800, 0x40000004, 0x00440200, 0x00080008, 0x04000800, 0x40100004, 0x00000600, 0x00080020, 0x08000000, 0x00000104, 0x00040600, 0x00080020, 0x0c000000, 0x00100104, 0x00400600, 0x00080028, 0x08000800, 0x40000104, 0x00440600, 0x00080028, 0x0c000800, 0x40100104, 0x80000200, 0x00080010, 0x00000000, 0x00800004, 0x80040200, 0x00080010, 0x04000000, 0x00900004, 0x80400200, 0x00080018, 0x00000800, 0x40800004, 0x80440200, 0x00080018, 0x04000800, 0x40900004, 0x80000600, 0x00080030, 0x08000000, 0x00800104, 0x80040600, 0x00080030, 0x0c000000, 0x00900104, 0x80400600, 0x00080038, 0x08000800, 0x40800104, 0x80440600, 0x00080038, 0x0c000800, 0x40900104, 
0x10000200, 0x00080000, 0x00200000, 0x00001004, 0x10040200, 0x00080000, 0x04200000, 0x00101004, 0x10400200, 0x00080008, 0x00200800, 0x40001004, 0x10440200, 0x00080008, 0x04200800, 0x40101004, 0x10000600, 0x00080020, 0x08200000, 0x00001104, 0x10040600, 0x00080020, 0x0c200000, 0x00101104, 0x10400600, 0x00080028, 0x08200800, 0x40001104, 0x10440600, 0x00080028, 0x0c200800, 0x40101104, 0x90000200, 0x00080010, 0x00200000, 0x00801004, 0x90040200, 0x00080010, 0x04200000, 0x00901004, 0x90400200, 0x00080018, 0x00200800, 0x40801004, 0x90440200, 0x00080018, 0x04200800, 0x40901004, 0x90000600, 0x00080030, 0x08200000, 0x00801104, 0x90040600, 0x00080030, 0x0c200000, 0x00901104, 0x90400600, 0x00080038, 0x08200800, 0x40801104, 0x90440600, 0x00080038, 0x0c200800, 0x40901104, 0x00000002, 0x00002000, 0x20000000, 0x00000001, 0x00040002, 0x00002000, 0x24000000, 0x00100001, 0x00400002, 0x00002008, 0x20000800, 0x40000001, 0x00440002, 0x00002008, 0x24000800, 0x40100001, 0x00000402, 0x00002020, 0x28000000, 0x00000101, 0x00040402, 0x00002020, 0x2c000000, 0x00100101, 0x00400402, 0x00002028, 0x28000800, 0x40000101, 0x00440402, 0x00002028, 0x2c000800, 0x40100101, 0x80000002, 0x00002010, 0x20000000, 0x00800001, 0x80040002, 0x00002010, 0x24000000, 0x00900001, 0x80400002, 0x00002018, 0x20000800, 0x40800001, 0x80440002, 0x00002018, 0x24000800, 0x40900001, 0x80000402, 0x00002030, 0x28000000, 0x00800101, 0x80040402, 0x00002030, 0x2c000000, 0x00900101, 0x80400402, 0x00002038, 0x28000800, 0x40800101, 0x80440402, 0x00002038, 0x2c000800, 0x40900101, 0x10000002, 0x00002000, 0x20200000, 0x00001001, 0x10040002, 0x00002000, 0x24200000, 0x00101001, 0x10400002, 0x00002008, 0x20200800, 0x40001001, 0x10440002, 0x00002008, 0x24200800, 0x40101001, 0x10000402, 0x00002020, 0x28200000, 0x00001101, 0x10040402, 0x00002020, 0x2c200000, 0x00101101, 0x10400402, 0x00002028, 0x28200800, 0x40001101, 0x10440402, 0x00002028, 0x2c200800, 0x40101101, 0x90000002, 0x00002010, 0x20200000, 0x00801001, 0x90040002, 0x00002010, 
0x24200000, 0x00901001, 0x90400002, 0x00002018, 0x20200800, 0x40801001, 0x90440002, 0x00002018, 0x24200800, 0x40901001, 0x90000402, 0x00002030, 0x28200000, 0x00801101, 0x90040402, 0x00002030, 0x2c200000, 0x00901101, 0x90400402, 0x00002038, 0x28200800, 0x40801101, 0x90440402, 0x00002038, 0x2c200800, 0x40901101, 0x00000202, 0x00082000, 0x20000000, 0x00000005, 0x00040202, 0x00082000, 0x24000000, 0x00100005, 0x00400202, 0x00082008, 0x20000800, 0x40000005, 0x00440202, 0x00082008, 0x24000800, 0x40100005, 0x00000602, 0x00082020, 0x28000000, 0x00000105, 0x00040602, 0x00082020, 0x2c000000, 0x00100105, 0x00400602, 0x00082028, 0x28000800, 0x40000105, 0x00440602, 0x00082028, 0x2c000800, 0x40100105, 0x80000202, 0x00082010, 0x20000000, 0x00800005, 0x80040202, 0x00082010, 0x24000000, 0x00900005, 0x80400202, 0x00082018, 0x20000800, 0x40800005, 0x80440202, 0x00082018, 0x24000800, 0x40900005, 0x80000602, 0x00082030, 0x28000000, 0x00800105, 0x80040602, 0x00082030, 0x2c000000, 0x00900105, 0x80400602, 0x00082038, 0x28000800, 0x40800105, 0x80440602, 0x00082038, 0x2c000800, 0x40900105, 0x10000202, 0x00082000, 0x20200000, 0x00001005, 0x10040202, 0x00082000, 0x24200000, 0x00101005, 0x10400202, 0x00082008, 0x20200800, 0x40001005, 0x10440202, 0x00082008, 0x24200800, 0x40101005, 0x10000602, 0x00082020, 0x28200000, 0x00001105, 0x10040602, 0x00082020, 0x2c200000, 0x00101105, 0x10400602, 0x00082028, 0x28200800, 0x40001105, 0x10440602, 0x00082028, 0x2c200800, 0x40101105, 0x90000202, 0x00082010, 0x20200000, 0x00801005, 0x90040202, 0x00082010, 0x24200000, 0x00901005, 0x90400202, 0x00082018, 0x20200800, 0x40801005, 0x90440202, 0x00082018, 0x24200800, 0x40901005, 0x90000602, 0x00082030, 0x28200000, 0x00801105, 0x90040602, 0x00082030, 0x2c200000, 0x00901105, 0x90400602, 0x00082038, 0x28200800, 0x40801105, 0x90440602, 0x00082038, 0x2c200800, 0x40901105, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000008, 0x00080000, 0x10000000, 0x02000000, 0x00000000, 0x00000080, 0x00001000, 
0x02000000, 0x00000008, 0x00080080, 0x10001000, 0x00004000, 0x00000000, 0x00000040, 0x00040000, 0x00004000, 0x00000008, 0x00080040, 0x10040000, 0x02004000, 0x00000000, 0x000000c0, 0x00041000, 0x02004000, 0x00000008, 0x000800c0, 0x10041000, 0x00020000, 0x00008000, 0x08000000, 0x00200000, 0x00020000, 0x00008008, 0x08080000, 0x10200000, 0x02020000, 0x00008000, 0x08000080, 0x00201000, 0x02020000, 0x00008008, 0x08080080, 0x10201000, 0x00024000, 0x00008000, 0x08000040, 0x00240000, 0x00024000, 0x00008008, 0x08080040, 0x10240000, 0x02024000, 0x00008000, 0x080000c0, 0x00241000, 0x02024000, 0x00008008, 0x080800c0, 0x10241000, 0x00000000, 0x01000000, 0x00002000, 0x00000020, 0x00000000, 0x01000008, 0x00082000, 0x10000020, 0x02000000, 0x01000000, 0x00002080, 0x00001020, 0x02000000, 0x01000008, 0x00082080, 0x10001020, 0x00004000, 0x01000000, 0x00002040, 0x00040020, 0x00004000, 0x01000008, 0x00082040, 0x10040020, 0x02004000, 0x01000000, 0x000020c0, 0x00041020, 0x02004000, 0x01000008, 0x000820c0, 0x10041020, 0x00020000, 0x01008000, 0x08002000, 0x00200020, 0x00020000, 0x01008008, 0x08082000, 0x10200020, 0x02020000, 0x01008000, 0x08002080, 0x00201020, 0x02020000, 0x01008008, 0x08082080, 0x10201020, 0x00024000, 0x01008000, 0x08002040, 0x00240020, 0x00024000, 0x01008008, 0x08082040, 0x10240020, 0x02024000, 0x01008000, 0x080020c0, 0x00241020, 0x02024000, 0x01008008, 0x080820c0, 0x10241020, 0x00000400, 0x04000000, 0x00100000, 0x00000004, 0x00000400, 0x04000008, 0x00180000, 0x10000004, 0x02000400, 0x04000000, 0x00100080, 0x00001004, 0x02000400, 0x04000008, 0x00180080, 0x10001004, 0x00004400, 0x04000000, 0x00100040, 0x00040004, 0x00004400, 0x04000008, 0x00180040, 0x10040004, 0x02004400, 0x04000000, 0x001000c0, 0x00041004, 0x02004400, 0x04000008, 0x001800c0, 0x10041004, 0x00020400, 0x04008000, 0x08100000, 0x00200004, 0x00020400, 0x04008008, 0x08180000, 0x10200004, 0x02020400, 0x04008000, 0x08100080, 0x00201004, 0x02020400, 0x04008008, 0x08180080, 0x10201004, 0x00024400, 0x04008000, 
0x08100040, 0x00240004, 0x00024400, 0x04008008, 0x08180040, 0x10240004, 0x02024400, 0x04008000, 0x081000c0, 0x00241004, 0x02024400, 0x04008008, 0x081800c0, 0x10241004, 0x00000400, 0x05000000, 0x00102000, 0x00000024, 0x00000400, 0x05000008, 0x00182000, 0x10000024, 0x02000400, 0x05000000, 0x00102080, 0x00001024, 0x02000400, 0x05000008, 0x00182080, 0x10001024, 0x00004400, 0x05000000, 0x00102040, 0x00040024, 0x00004400, 0x05000008, 0x00182040, 0x10040024, 0x02004400, 0x05000000, 0x001020c0, 0x00041024, 0x02004400, 0x05000008, 0x001820c0, 0x10041024, 0x00020400, 0x05008000, 0x08102000, 0x00200024, 0x00020400, 0x05008008, 0x08182000, 0x10200024, 0x02020400, 0x05008000, 0x08102080, 0x00201024, 0x02020400, 0x05008008, 0x08182080, 0x10201024, 0x00024400, 0x05008000, 0x08102040, 0x00240024, 0x00024400, 0x05008008, 0x08182040, 0x10240024, 0x02024400, 0x05008000, 0x081020c0, 0x00241024, 0x02024400, 0x05008008, 0x081820c0, 0x10241024, 0x00000800, 0x00010000, 0x20000000, 0x00000010, 0x00000800, 0x00010008, 0x20080000, 0x10000010, 0x02000800, 0x00010000, 0x20000080, 0x00001010, 0x02000800, 0x00010008, 0x20080080, 0x10001010, 0x00004800, 0x00010000, 0x20000040, 0x00040010, 0x00004800, 0x00010008, 0x20080040, 0x10040010, 0x02004800, 0x00010000, 0x200000c0, 0x00041010, 0x02004800, 0x00010008, 0x200800c0, 0x10041010, 0x00020800, 0x00018000, 0x28000000, 0x00200010, 0x00020800, 0x00018008, 0x28080000, 0x10200010, 0x02020800, 0x00018000, 0x28000080, 0x00201010, 0x02020800, 0x00018008, 0x28080080, 0x10201010, 0x00024800, 0x00018000, 0x28000040, 0x00240010, 0x00024800, 0x00018008, 0x28080040, 0x10240010, 0x02024800, 0x00018000, 0x280000c0, 0x00241010, 0x02024800, 0x00018008, 0x280800c0, 0x10241010, 0x00000800, 0x01010000, 0x20002000, 0x00000030, 0x00000800, 0x01010008, 0x20082000, 0x10000030, 0x02000800, 0x01010000, 0x20002080, 0x00001030, 0x02000800, 0x01010008, 0x20082080, 0x10001030, 0x00004800, 0x01010000, 0x20002040, 0x00040030, 0x00004800, 0x01010008, 0x20082040, 0x10040030, 
0x02004800, 0x01010000, 0x200020c0, 0x00041030, 0x02004800, 0x01010008, 0x200820c0, 0x10041030, 0x00020800, 0x01018000, 0x28002000, 0x00200030, 0x00020800, 0x01018008, 0x28082000, 0x10200030, 0x02020800, 0x01018000, 0x28002080, 0x00201030, 0x02020800, 0x01018008, 0x28082080, 0x10201030, 0x00024800, 0x01018000, 0x28002040, 0x00240030, 0x00024800, 0x01018008, 0x28082040, 0x10240030, 0x02024800, 0x01018000, 0x280020c0, 0x00241030, 0x02024800, 0x01018008, 0x280820c0, 0x10241030, 0x00000c00, 0x04010000, 0x20100000, 0x00000014, 0x00000c00, 0x04010008, 0x20180000, 0x10000014, 0x02000c00, 0x04010000, 0x20100080, 0x00001014, 0x02000c00, 0x04010008, 0x20180080, 0x10001014, 0x00004c00, 0x04010000, 0x20100040, 0x00040014, 0x00004c00, 0x04010008, 0x20180040, 0x10040014, 0x02004c00, 0x04010000, 0x201000c0, 0x00041014, 0x02004c00, 0x04010008, 0x201800c0, 0x10041014, 0x00020c00, 0x04018000, 0x28100000, 0x00200014, 0x00020c00, 0x04018008, 0x28180000, 0x10200014, 0x02020c00, 0x04018000, 0x28100080, 0x00201014, 0x02020c00, 0x04018008, 0x28180080, 0x10201014, 0x00024c00, 0x04018000, 0x28100040, 0x00240014, 0x00024c00, 0x04018008, 0x28180040, 0x10240014, 0x02024c00, 0x04018000, 0x281000c0, 0x00241014, 0x02024c00, 0x04018008, 0x281800c0, 0x10241014, 0x00000c00, 0x05010000, 0x20102000, 0x00000034, 0x00000c00, 0x05010008, 0x20182000, 0x10000034, 0x02000c00, 0x05010000, 0x20102080, 0x00001034, 0x02000c00, 0x05010008, 0x20182080, 0x10001034, 0x00004c00, 0x05010000, 0x20102040, 0x00040034, 0x00004c00, 0x05010008, 0x20182040, 0x10040034, 0x02004c00, 0x05010000, 0x201020c0, 0x00041034, 0x02004c00, 0x05010008, 0x201820c0, 0x10041034, 0x00020c00, 0x05018000, 0x28102000, 0x00200034, 0x00020c00, 0x05018008, 0x28182000, 0x10200034, 0x02020c00, 0x05018000, 0x28102080, 0x00201034, 0x02020c00, 0x05018008, 0x28182080, 0x10201034, 0x00024c00, 0x05018000, 0x28102040, 0x00240034, 0x00024c00, 0x05018008, 0x28182040, 0x10240034, 0x02024c00, 0x05018000, 0x281020c0, 0x00241034, 0x02024c00, 0x05018008, 
0x281820c0, 0x10241034 };

/*
 * S-box lookup tables
 *
 * Eight tables of 64 32-bit words each.  They are only read by the ROUND()
 * macro below, which indexes them with one byte of a value that has been
 * masked with 0x3f3f3f3f, so every index is in the range 0..63.
 */
static const u32 S1[64] = {
	0x01010400, 0x00000000, 0x00010000, 0x01010404,
	0x01010004, 0x00010404, 0x00000004, 0x00010000,
	0x00000400, 0x01010400, 0x01010404, 0x00000400,
	0x01000404, 0x01010004, 0x01000000, 0x00000004,
	0x00000404, 0x01000400, 0x01000400, 0x00010400,
	0x00010400, 0x01010000, 0x01010000, 0x01000404,
	0x00010004, 0x01000004, 0x01000004, 0x00010004,
	0x00000000, 0x00000404, 0x00010404, 0x01000000,
	0x00010000, 0x01010404, 0x00000004, 0x01010000,
	0x01010400, 0x01000000, 0x01000000, 0x00000400,
	0x01010004, 0x00010000, 0x00010400, 0x01000004,
	0x00000400, 0x00000004, 0x01000404, 0x00010404,
	0x01010404, 0x00010004, 0x01010000, 0x01000404,
	0x01000004, 0x00000404, 0x00010404, 0x01010400,
	0x00000404, 0x01000400, 0x01000400, 0x00000000,
	0x00010004, 0x00010400, 0x00000000, 0x01010004
};

static const u32 S2[64] = {
	0x80108020, 0x80008000, 0x00008000, 0x00108020,
	0x00100000, 0x00000020, 0x80100020, 0x80008020,
	0x80000020, 0x80108020, 0x80108000, 0x80000000,
	0x80008000, 0x00100000, 0x00000020, 0x80100020,
	0x00108000, 0x00100020, 0x80008020, 0x00000000,
	0x80000000, 0x00008000, 0x00108020, 0x80100000,
	0x00100020, 0x80000020, 0x00000000, 0x00108000,
	0x00008020, 0x80108000, 0x80100000, 0x00008020,
	0x00000000, 0x00108020, 0x80100020, 0x00100000,
	0x80008020, 0x80100000, 0x80108000, 0x00008000,
	0x80100000, 0x80008000, 0x00000020, 0x80108020,
	0x00108020, 0x00000020, 0x00008000, 0x80000000,
	0x00008020, 0x80108000, 0x00100000, 0x80000020,
	0x00100020, 0x80008020, 0x80000020, 0x00100020,
	0x00108000, 0x00000000, 0x80008000, 0x00008020,
	0x80000000, 0x80100020, 0x80108020, 0x00108000
};

static const u32 S3[64] = {
	0x00000208, 0x08020200, 0x00000000, 0x08020008,
	0x08000200, 0x00000000, 0x00020208, 0x08000200,
	0x00020008, 0x08000008, 0x08000008, 0x00020000,
	0x08020208, 0x00020008, 0x08020000, 0x00000208,
	0x08000000, 0x00000008, 0x08020200, 0x00000200,
	0x00020200, 0x08020000, 0x08020008, 0x00020208,
	0x08000208, 0x00020200, 0x00020000, 0x08000208,
	0x00000008, 0x08020208, 0x00000200, 0x08000000,
	0x08020200, 0x08000000, 0x00020008, 0x00000208,
	0x00020000, 0x08020200, 0x08000200, 0x00000000,
	0x00000200, 0x00020008, 0x08020208, 0x08000200,
	0x08000008, 0x00000200, 0x00000000, 0x08020008,
	0x08000208, 0x00020000, 0x08000000, 0x08020208,
	0x00000008, 0x00020208, 0x00020200, 0x08000008,
	0x08020000, 0x08000208, 0x00000208, 0x08020000,
	0x00020208, 0x00000008, 0x08020008, 0x00020200
};

static const u32 S4[64] = {
	0x00802001, 0x00002081, 0x00002081, 0x00000080,
	0x00802080, 0x00800081, 0x00800001, 0x00002001,
	0x00000000, 0x00802000, 0x00802000, 0x00802081,
	0x00000081, 0x00000000, 0x00800080, 0x00800001,
	0x00000001, 0x00002000, 0x00800000, 0x00802001,
	0x00000080, 0x00800000, 0x00002001, 0x00002080,
	0x00800081, 0x00000001, 0x00002080, 0x00800080,
	0x00002000, 0x00802080, 0x00802081, 0x00000081,
	0x00800080, 0x00800001, 0x00802000, 0x00802081,
	0x00000081, 0x00000000, 0x00000000, 0x00802000,
	0x00002080, 0x00800080, 0x00800081, 0x00000001,
	0x00802001, 0x00002081, 0x00002081, 0x00000080,
	0x00802081, 0x00000081, 0x00000001, 0x00002000,
	0x00800001, 0x00002001, 0x00802080, 0x00800081,
	0x00002001, 0x00002080, 0x00800000, 0x00802001,
	0x00000080, 0x00800000, 0x00002000, 0x00802080
};

static const u32 S5[64] = {
	0x00000100, 0x02080100, 0x02080000, 0x42000100,
	0x00080000, 0x00000100, 0x40000000, 0x02080000,
	0x40080100, 0x00080000, 0x02000100, 0x40080100,
	0x42000100, 0x42080000, 0x00080100, 0x40000000,
	0x02000000, 0x40080000, 0x40080000, 0x00000000,
	0x40000100, 0x42080100, 0x42080100, 0x02000100,
	0x42080000, 0x40000100, 0x00000000, 0x42000000,
	0x02080100, 0x02000000, 0x42000000, 0x00080100,
	0x00080000, 0x42000100, 0x00000100, 0x02000000,
	0x40000000, 0x02080000, 0x42000100, 0x40080100,
	0x02000100, 0x40000000, 0x42080000, 0x02080100,
	0x40080100, 0x00000100, 0x02000000, 0x42080000,
	0x42080100, 0x00080100, 0x42000000, 0x42080100,
	0x02080000, 0x00000000, 0x40080000, 0x42000000,
	0x00080100, 0x02000100, 0x40000100, 0x00080000,
	0x00000000, 0x40080000, 0x02080100, 0x40000100
};

static const u32 S6[64] = {
	0x20000010, 0x20400000, 0x00004000, 0x20404010,
	0x20400000, 0x00000010, 0x20404010, 0x00400000,
	0x20004000, 0x00404010, 0x00400000, 0x20000010,
	0x00400010, 0x20004000, 0x20000000, 0x00004010,
	0x00000000, 0x00400010, 0x20004010, 0x00004000,
	0x00404000, 0x20004010, 0x00000010, 0x20400010,
	0x20400010, 0x00000000, 0x00404010, 0x20404000,
	0x00004010, 0x00404000, 0x20404000, 0x20000000,
	0x20004000, 0x00000010, 0x20400010, 0x00404000,
	0x20404010, 0x00400000, 0x00004010, 0x20000010,
	0x00400000, 0x20004000, 0x20000000, 0x00004010,
	0x20000010, 0x20404010, 0x00404000, 0x20400000,
	0x00404010, 0x20404000, 0x00000000, 0x20400010,
	0x00000010, 0x00004000, 0x20400000, 0x00404010,
	0x00004000, 0x00400010, 0x20004010, 0x00000000,
	0x20404000, 0x20000000, 0x00400010, 0x20004010
};

static const u32 S7[64] = {
	0x00200000, 0x04200002, 0x04000802, 0x00000000,
	0x00000800, 0x04000802, 0x00200802, 0x04200800,
	0x04200802, 0x00200000, 0x00000000, 0x04000002,
	0x00000002, 0x04000000, 0x04200002, 0x00000802,
	0x04000800, 0x00200802, 0x00200002, 0x04000800,
	0x04000002, 0x04200000, 0x04200800, 0x00200002,
	0x04200000, 0x00000800, 0x00000802, 0x04200802,
	0x00200800, 0x00000002, 0x04000000, 0x00200800,
	0x04000000, 0x00200800, 0x00200000, 0x04000802,
	0x04000802, 0x04200002, 0x04200002, 0x00000002,
	0x00200002, 0x04000000, 0x04000800, 0x00200000,
	0x04200800, 0x00000802, 0x00200802, 0x04200800,
	0x00000802, 0x04000002, 0x04200802, 0x04200000,
	0x00200800, 0x00000000, 0x00000002, 0x04200802,
	0x00000000, 0x00200802, 0x04200000, 0x00000800,
	0x04000002, 0x04000800, 0x00000800, 0x00200002
};

static const u32 S8[64] = {
	0x10001040, 0x00001000, 0x00040000, 0x10041040,
	0x10000000, 0x10001040, 0x00000040, 0x10000000,
	0x00040040, 0x10040000, 0x10041040, 0x00041000,
	0x10041000, 0x00041040, 0x00001000, 0x00000040,
	0x10040000, 0x10000040, 0x10001000, 0x00001040,
	0x00041000, 0x00040040, 0x10040040, 0x10041000,
	0x00001040, 0x00000000, 0x00000000, 0x10040040,
	0x10000040, 0x10001000, 0x00041040, 0x00040000,
	0x00041040, 0x00040000, 0x10041000, 0x00001000,
	0x00000040, 0x10040040, 0x00001000, 0x00041040,
	0x10001000, 0x00000040, 0x10000040, 0x10040000,
	0x10040040, 0x10000000, 0x00040000, 0x10001040,
	0x00000000, 0x10041040, 0x00040040, 0x10000040,
	0x10040000, 0x10001000, 0x10001040, 0x00000000,
	0x10041040, 0x00041000, 0x00041000, 0x00001040,
	0x00001040, 0x00040040, 0x10000000, 0x10041000
};

/*
 * Encryption components: IP, FP, and round function
 *
 * All three are multi-statement macros that assign to their arguments, so
 * every argument must be a plain lvalue and is evaluated more than once.
 * ROL()/ROR() are defined outside this part of the file; presumably they
 * rotate their first argument in place by the given bit count -- confirm
 * against their definition.
 */

/*
 * IP(L, R, T): permute the block halves L and R in place, using T as scratch.
 *
 * Each "T = L; L ^= R; L &= mask; R ^= L; L ^= T;" group exchanges the bits
 * selected by mask between L and R; the rotations in between move the bits
 * that the next exchange operates on.
 */
#define IP(L, R, T) \
	ROL(R, 4); \
	T = L; \
	L ^= R; \
	L &= 0xf0f0f0f0; \
	R ^= L; \
	L ^= T; \
	ROL(R, 12); \
	T = L; \
	L ^= R; \
	L &= 0xffff0000; \
	R ^= L; \
	L ^= T; \
	ROR(R, 14); \
	T = L; \
	L ^= R; \
	L &= 0xcccccccc; \
	R ^= L; \
	L ^= T; \
	ROL(R, 6); \
	T = L; \
	L ^= R; \
	L &= 0xff00ff00; \
	R ^= L; \
	L ^= T; \
	ROR(R, 7); \
	T = L; \
	L ^= R; \
	L &= 0xaaaaaaaa; \
	R ^= L; \
	L ^= T; \
	ROL(L, 1);

/*
 * FP(L, R, T): the same masked exchanges as IP() applied in the reverse
 * order, with each rotation replaced by the opposite one.
 */
#define FP(L, R, T) \
	ROR(L, 1); \
	T = L; \
	L ^= R; \
	L &= 0xaaaaaaaa; \
	R ^= L; \
	L ^= T; \
	ROL(R, 7); \
	T = L; \
	L ^= R; \
	L &= 0xff00ff00; \
	R ^= L; \
	L ^= T; \
	ROR(R, 6); \
	T = L; \
	L ^= R; \
	L &= 0xcccccccc; \
	R ^= L; \
	L ^= T; \
	ROL(R, 14); \
	T = L; \
	L ^= R; \
	L &= 0xffff0000; \
	R ^= L; \
	L ^= T; \
	ROR(R, 12); \
	T = L; \
	L ^= R; \
	L &= 0xf0f0f0f0; \
	R ^= L; \
	L ^= T; \
	ROR(R, 4);

/*
 * ROUND(L, R, A, B, K, d): one round.
 *
 * Loads the two key words K[0] and K[1] into B and A, then advances the
 * key pointer K by d words (callers pass 2 or -2).  Both words are XORed
 * with R, A is additionally rotated by 4, and each is masked to 6 bits per
 * byte.  The four bytes of B index S8, S6, S4, S2 and the four bytes of A
 * index S7, S5, S3, S1; all eight table entries are XORed into L.
 * A and B are scratch and are clobbered; R is only read.
 */
#define ROUND(L, R, A, B, K, d) \
	B = K[0]; A = K[1]; K += d; \
	B ^= R; A ^= R; \
	B &= 0x3f3f3f3f; ROR(A, 4); \
	L ^= S8[0xff & B]; A &= 0x3f3f3f3f; \
	L ^= S6[0xff & (B >> 8)]; B >>= 16; \
	L ^= S7[0xff & A]; \
	L ^= S5[0xff & (A >> 8)]; A >>= 16; \
	L ^= S4[0xff & B]; \
	L ^= S2[0xff & (B >> 8)]; \
	L ^= S3[0xff & A]; \
	L ^= S1[0xff & (A >> 8)];

/*
 * PC2 lookup tables are organized as 2 consecutive sets of 4 interleaved
 * tables of 128 elements. One set is for C_i and the other for D_i, while
 * the 4 interleaved tables correspond to four 7-bit subsets of C_i or D_i.
 *
 * After PC1 each of the variables a,b,c,d contains a 7 bit subset of C_i
 * or D_i in bits 7-1 (bit 0 being the least significant).
*/

/*
 * T1..T4 read one of four consecutive words of the table that the local
 * variable `pt` points to, starting at word 2 * x.  They only work inside a
 * function that has a `pt` in scope (des_ekey() and dkey() below).
 */
#define T1(x) pt[2 * (x) + 0]
#define T2(x) pt[2 * (x) + 1]
#define T3(x) pt[2 * (x) + 2]
#define T4(x) pt[2 * (x) + 3]

/* OR together one lookup per argument; note the T4..T1 to d..a pairing. */
#define DES_PC2(a, b, c, d) (T4(d) | T3(c) | T2(b) | T1(a))

/*
 * Encryption key expansion
 *
 * RFC2451: Weak key checks SHOULD be performed.
 *
 * FIPS 74:
 *
 *   Keys having duals are keys which produce all zeros, all ones, or
 *   alternating zero-one patterns in the C and D registers after Permuted
 *   Choice 1 has operated on the key.
 *
 * @pe: output, 32 words; word 2 * i comes from the first table set and word
 *      2 * i + 1 from the second, for i = 0..15.
 * @k:  input key, bytes k[0]..k[7] are read.
 *
 * The words are produced for index 15 first and index 0 last, i.e. in the
 * opposite order to dkey().  pc1[], rs[] and pc2[] are tables defined
 * outside this part of the file.
 *
 * Returns zero if the weak-key test matches in both halves, nonzero
 * otherwise.  The expanded key is written to @pe in either case.
 */
static unsigned long des_ekey(u32 *pe, const u8 *k)
{
	/* K&R: long is at least 32 bits */
	unsigned long a, b, c, d, w;
	const u32 *pt = pc2;

	/* Build the four pc1[] indices for the first half from masked key bits */
	d = k[4]; d &= 0x0e; d <<= 4; d |= k[0] & 0x1e; d = pc1[d];
	c = k[5]; c &= 0x0e; c <<= 4; c |= k[1] & 0x1e; c = pc1[c];
	b = k[6]; b &= 0x0e; b <<= 4; b |= k[2] & 0x1e; b = pc1[b];
	a = k[7]; a &= 0x0e; a <<= 4; a |= k[3] & 0x1e; a = pc1[a];

	/*
	 * The order of the rs[] updates and the rotation of the DES_PC2()
	 * arguments are significant; do not reorder.
	 */
	pe[15 * 2 + 0] = DES_PC2(a, b, c, d); d = rs[d];
	pe[14 * 2 + 0] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b];
	pe[13 * 2 + 0] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d];
	pe[12 * 2 + 0] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b];
	pe[11 * 2 + 0] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d];
	pe[10 * 2 + 0] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b];
	pe[ 9 * 2 + 0] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d];
	pe[ 8 * 2 + 0] = DES_PC2(d, a, b, c); c = rs[c];
	pe[ 7 * 2 + 0] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a];
	pe[ 6 * 2 + 0] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c];
	pe[ 5 * 2 + 0] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a];
	pe[ 4 * 2 + 0] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c];
	pe[ 3 * 2 + 0] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a];
	pe[ 2 * 2 + 0] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c];
	pe[ 1 * 2 + 0] = DES_PC2(c, d, a, b); b = rs[b];
	pe[ 0 * 2 + 0] = DES_PC2(b, c, d, a);

	/* Check if first half is weak */
	w = (a ^ c) | (b ^ d) | (rs[a] ^ c) | (b ^ rs[d]);

	/* Skip to next table set */
	pt += 512;

	/* Second half: other key bits, and the odd-numbered pc1[] entries */
	d = k[0]; d &= 0xe0; d >>= 4; d |= k[4] & 0xf0; d = pc1[d + 1];
	c = k[1]; c &= 0xe0; c >>= 4; c |= k[5] & 0xf0; c = pc1[c + 1];
	b = k[2]; b &= 0xe0; b >>= 4; b |= k[6] & 0xf0; b = pc1[b + 1];
	a = k[3]; a &= 0xe0; a >>= 4; a |= k[7] & 0xf0; a = pc1[a + 1];

	/* Check if second half is weak */
	w |= (a ^ c) | (b ^ d) | (rs[a] ^ c) | (b ^ rs[d]);

	pe[15 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d];
	pe[14 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b];
	pe[13 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d];
	pe[12 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b];
	pe[11 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d];
	pe[10 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b];
	pe[ 9 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d];
	pe[ 8 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c];
	pe[ 7 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a];
	pe[ 6 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c];
	pe[ 5 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a];
	pe[ 4 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c];
	pe[ 3 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a];
	pe[ 2 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c];
	pe[ 1 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b];
	pe[ 0 * 2 + 1] = DES_PC2(b, c, d, a);

	/*
	 * Fixup: 2413 5768 -> 1357 2468
	 *
	 * For each pair of words: exchange their upper 16 bits, then apply
	 * ROL(.., 18) to the second word.  `d` is reused as the loop counter.
	 */
	for (d = 0; d < 16; ++d) {
		a = pe[2 * d];
		b = pe[2 * d + 1];
		c = a ^ b;
		c &= 0xffff0000;
		a ^= c;
		b ^= c;
		ROL(b, 18);
		pe[2 * d] = a;
		pe[2 * d + 1] = b;
	}

	/* Zero if weak key */
	return w;
}

/*
 * Expand a single-DES key into ctx->expkey.
 *
 * Returns -EINVAL if keylen is not DES_KEY_SIZE, -ENOKEY if des_ekey()
 * reports a weak key, and 0 otherwise.  In the -ENOKEY case ctx->expkey has
 * still been filled in, because the expansion runs before the result is
 * examined.
 */
int des_expand_key(struct des_ctx *ctx, const u8 *key, unsigned int keylen)
{
	if (keylen != DES_KEY_SIZE)
		return -EINVAL;

	return des_ekey(ctx->expkey, key) ? 0 : -ENOKEY;
}
EXPORT_SYMBOL_GPL(des_expand_key);

/*
 * Decryption key expansion
 *
 * No weak key checking is performed, as this is only used by triple DES
 *
 * Same computation as des_ekey(), except that the words are stored from
 * index 0 upwards instead of from index 15 downwards, and nothing is
 * returned.
 *
 * @pe: output, 32 words.
 * @k:  input key, bytes k[0]..k[7] are read.
 */
static void dkey(u32 *pe, const u8 *k)
{
	/* K&R: long is at least 32 bits */
	unsigned long a, b, c, d;
	const u32 *pt = pc2;

	d = k[4]; d &= 0x0e; d <<= 4; d |= k[0] & 0x1e; d = pc1[d];
	c = k[5]; c &= 0x0e; c <<= 4; c |= k[1] & 0x1e; c = pc1[c];
	b = k[6]; b &= 0x0e; b <<= 4; b |= k[2] & 0x1e; b = pc1[b];
	a = k[7]; a &= 0x0e; a <<= 4; a |= k[3] & 0x1e; a = pc1[a];

	pe[ 0 * 2] = DES_PC2(a, b, c, d); d = rs[d];
	pe[ 1 * 2] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b];
	pe[ 2 * 2] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d];
	pe[ 3 * 2] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b];
	pe[ 4 * 2] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d];
	pe[ 5 * 2] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b];
	pe[ 6 * 2] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d];
	pe[ 7 * 2] = DES_PC2(d, a, b, c); c = rs[c];
	pe[ 8 * 2] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a];
	pe[ 9 * 2] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c];
	pe[10 * 2] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a];
	pe[11 * 2] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c];
	pe[12 * 2] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a];
	pe[13 * 2] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c];
	pe[14 * 2] = DES_PC2(c, d, a, b); b = rs[b];
	pe[15 * 2] = DES_PC2(b, c, d, a);

	/* Skip to next table set */
	pt += 512;

	d = k[0]; d &= 0xe0; d >>= 4; d |= k[4] & 0xf0; d = pc1[d + 1];
	c = k[1]; c &= 0xe0; c >>= 4; c |= k[5] & 0xf0; c = pc1[c + 1];
	b = k[2]; b &= 0xe0; b >>= 4; b |= k[6] & 0xf0; b = pc1[b + 1];
	a = k[3]; a &= 0xe0; a >>= 4; a |= k[7] & 0xf0; a = pc1[a + 1];

	pe[ 0 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d];
	pe[ 1 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b];
	pe[ 2 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d];
	pe[ 3 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b];
	pe[ 4 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d];
	pe[ 5 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b];
	pe[ 6 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d];
	pe[ 7 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c];
	pe[ 8 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a];
	pe[ 9 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c];
	pe[10 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a];
	pe[11 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c];
	pe[12 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a];
	pe[13 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c];
	pe[14 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b];
	pe[15 * 2 + 1] = DES_PC2(b, c, d, a);

	/*
	 * Fixup: 2413 5768 -> 1357 2468
	 *
	 * Same per-pair fixup as in des_ekey().
	 */
	for (d = 0; d < 16; ++d) {
		a = pe[2 * d];
		b = pe[2 * d + 1];
		c = a ^ b;
		c &= 0xffff0000;
		a ^= c;
		b ^= c;
		ROL(b, 18);
		pe[2 * d] = a;
		pe[2 * d + 1] = b;
	}
}

/*
 * Encrypt one 8-byte block from src into dst.
 *
 * The block is read as two little-endian 32-bit words (alignment of src and
 * dst is not assumed), run through IP(), 16 ROUND()s that walk ctx->expkey
 * forwards two words at a time, and FP().  Note that FP() and the stores
 * take the halves in swapped order (R first).
 */
void des_encrypt(const struct des_ctx *ctx, u8 *dst, const u8 *src)
{
	const u32 *K = ctx->expkey;
	u32 L, R, A, B;
	int i;

	L = get_unaligned_le32(src);
	R = get_unaligned_le32(src + 4);

	IP(L, R, A);
	for (i = 0; i < 8; i++) {
		ROUND(L, R, A, B, K, 2);
		ROUND(R, L, A, B, K, 2);
	}
	FP(R, L, A);

	put_unaligned_le32(R, dst);
	put_unaligned_le32(L, dst + 4);
}
EXPORT_SYMBOL_GPL(des_encrypt);

/*
 * Decrypt one 8-byte block from src into dst.
 *
 * Identical to des_encrypt() except that the key pointer starts at the last
 * word pair of ctx->expkey and steps backwards (d = -2).
 */
void des_decrypt(const struct des_ctx *ctx, u8 *dst, const u8 *src)
{
	const u32 *K = ctx->expkey + DES_EXPKEY_WORDS - 2;
	u32 L, R, A, B;
	int i;

	L = get_unaligned_le32(src);
	R = get_unaligned_le32(src + 4);

	IP(L, R, A);
	for (i = 0; i < 8; i++) {
		ROUND(L, R, A, B, K, -2);
		ROUND(R, L, A, B, K, -2);
	}
	FP(R, L, A);

	put_unaligned_le32(R, dst);
	put_unaligned_le32(L, dst + 4);
}
EXPORT_SYMBOL_GPL(des_decrypt);

/*
 * Expand a triple-DES key into ctx->expkey.
 *
 * The key is treated as three consecutive DES_KEY_SIZE sub-keys, expanded
 * with des_ekey(), dkey() and des_ekey() respectively into three
 * consecutive DES_EXPKEY_WORDS-sized regions.
 *
 * Returns -EINVAL if keylen is not DES3_EDE_KEY_SIZE.  Any error from
 * des3_ede_verify_key() other than -ENOKEY is returned without expanding
 * the key.  Otherwise the key is expanded and the verify result (0 or
 * -ENOKEY) is returned; the return values of des_ekey() are ignored here.
 * NOTE(review): what des3_ede_verify_key() checks is not visible in this
 * part of the file.
 */
int des3_ede_expand_key(struct des3_ede_ctx *ctx, const u8 *key,
			unsigned int keylen)
{
	u32 *pe = ctx->expkey;
	int err;

	if (keylen != DES3_EDE_KEY_SIZE)
		return -EINVAL;

	err = des3_ede_verify_key(key, keylen, true);
	if (err && err != -ENOKEY)
		return err;

	des_ekey(pe, key); pe += DES_EXPKEY_WORDS; key += DES_KEY_SIZE;
	dkey(pe, key); pe += DES_EXPKEY_WORDS; key += DES_KEY_SIZE;
	des_ekey(pe, key);

	return err;
}
EXPORT_SYMBOL_GPL(des3_ede_expand_key);

void
des3_ede_encrypt(const struct des3_ede_ctx *dctx, u8 *dst, const u8 *src) { const u32 *K = dctx->expkey; u32 L, R, A, B; int i; L = get_unaligned_le32(src); R = get_unaligned_le32(src + 4); IP(L, R, A); for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, 2); ROUND(R, L, A, B, K, 2); } for (i = 0; i < 8; i++) { ROUND(R, L, A, B, K, 2); ROUND(L, R, A, B, K, 2); } for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, 2); ROUND(R, L, A, B, K, 2); } FP(R, L, A); put_unaligned_le32(R, dst); put_unaligned_le32(L, dst + 4); } EXPORT_SYMBOL_GPL(des3_ede_encrypt); void des3_ede_decrypt(const struct des3_ede_ctx *dctx, u8 *dst, const u8 *src) { const u32 *K = dctx->expkey + DES3_EDE_EXPKEY_WORDS - 2; u32 L, R, A, B; int i; L = get_unaligned_le32(src); R = get_unaligned_le32(src + 4); IP(L, R, A); for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, -2); ROUND(R, L, A, B, K, -2); } for (i = 0; i < 8; i++) { ROUND(R, L, A, B, K, -2); ROUND(L, R, A, B, K, -2); } for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, -2); ROUND(R, L, A, B, K, -2); } FP(R, L, A); put_unaligned_le32(R, dst); put_unaligned_le32(L, dst + 4); } EXPORT_SYMBOL_GPL(des3_ede_decrypt); MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms"); MODULE_LICENSE("GPL"); |
| 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 
1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 
1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 
1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 
2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 
2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 
3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 
3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 | // SPDX-License-Identifier: GPL-2.0 // // Register map access API // // Copyright 2011 Wolfson Microelectronics plc // // Author: Mark Brown <broonie@opensource.wolfsonmicro.com> #include <linux/device.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/err.h> #include <linux/property.h> #include <linux/rbtree.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/log2.h> #include <linux/hwspinlock.h> #include <linux/unaligned.h> #define CREATE_TRACE_POINTS #include "trace.h" #include "internal.h" /* * Sometimes for failures during very early init the trace * infrastructure isn't available early enough to be used. For this * sort of problem defining LOG_DEVICE will add printks for basic * register I/O on a specific device. 
*/

/*
 * LOG_DEVICE is explicitly undefined here, so unless this line is edited
 * the #else stub below (always false) is the one that gets compiled.
 */
#undef LOG_DEVICE

#ifdef LOG_DEVICE
/* True when the map has a device whose name equals LOG_DEVICE. */
static inline bool regmap_should_log(struct regmap *map)
{
	return (map->dev && strcmp(dev_name(map->dev), LOG_DEVICE) == 0);
}
#else
static inline bool regmap_should_log(struct regmap *map) { return false; }
#endif


/* Forward declarations; the definitions are not in this part of the file. */
static int _regmap_update_bits(struct regmap *map, unsigned int reg,
			       unsigned int mask, unsigned int val,
			       bool *change, bool force_write);

static int _regmap_bus_reg_read(void *context, unsigned int reg,
				unsigned int *val);
static int _regmap_bus_read(void *context, unsigned int reg,
			    unsigned int *val);
static int _regmap_bus_formatted_write(void *context, unsigned int reg,
				       unsigned int val);
static int _regmap_bus_reg_write(void *context, unsigned int reg,
				 unsigned int val);
static int _regmap_bus_raw_write(void *context, unsigned int reg,
				 unsigned int val);

/*
 * Return true if @reg lies in at least one of the @nranges entries of
 * @ranges, as decided by regmap_reg_in_range(). An empty array gives false.
 */
bool regmap_reg_in_ranges(unsigned int reg,
			  const struct regmap_range *ranges,
			  unsigned int nranges)
{
	const struct regmap_range *r;
	int i;

	for (i = 0, r = ranges; i < nranges; i++, r++)
		if (regmap_reg_in_range(reg, r))
			return true;
	return false;
}
EXPORT_SYMBOL_GPL(regmap_reg_in_ranges);

/*
 * Evaluate @reg against an access table. The "no" ranges win over the
 * "yes" ranges; a table without "yes" ranges accepts everything that is
 * not explicitly excluded. @map is not used by this function.
 */
bool regmap_check_range_table(struct regmap *map, unsigned int reg,
			      const struct regmap_access_table *table)
{
	/* Check "no ranges" first */
	if (regmap_reg_in_ranges(reg, table->no_ranges, table->n_no_ranges))
		return false;

	/* In case zero "yes ranges" are supplied, any reg is OK */
	if (!table->n_yes_ranges)
		return true;

	return regmap_reg_in_ranges(reg, table->yes_ranges,
				    table->n_yes_ranges);
}
EXPORT_SYMBOL_GPL(regmap_check_range_table);

/*
 * Can @reg be written? Checked in order: the max_register bound, the
 * writeable_reg callback, the wr_table. With none of the latter two
 * configured the register is considered writeable.
 */
bool regmap_writeable(struct regmap *map, unsigned int reg)
{
	if (map->max_register_is_set && reg > map->max_register)
		return false;

	if (map->writeable_reg)
		return map->writeable_reg(map->dev, reg);

	if (map->wr_table)
		return regmap_check_range_table(map, reg, map->wr_table);

	return true;
}

/*
 * Return true if regcache_read() succeeds for @reg. Always false when the
 * map has no cache (REGCACHE_NONE or no cache_ops) or @reg is beyond
 * max_register. The map lock is held around the cache read; the value
 * read is discarded.
 */
bool regmap_cached(struct regmap *map, unsigned int reg)
{
	int ret;
	unsigned int val;

	if (map->cache_type == REGCACHE_NONE)
		return false;

	if (!map->cache_ops)
		return false;

	if (map->max_register_is_set && reg > map->max_register)
		return false;

	map->lock(map->lock_arg);
	ret = regcache_read(map, reg, &val);
	map->unlock(map->lock_arg);
	if (ret)
		return false;

	return true;
}

/*
 * Can @reg be read? False when the map has no reg_read operation, when
 * @reg is beyond max_register, or when the map uses a format_write
 * formatter. Otherwise the readable_reg callback, then the rd_table,
 * decide; with neither configured the register is readable.
 */
bool regmap_readable(struct regmap *map, unsigned int reg)
{
	if (!map->reg_read)
		return false;

	if (map->max_register_is_set && reg > map->max_register)
		return false;

	if (map->format.format_write)
		return false;

	if (map->readable_reg)
		return map->readable_reg(map->dev, reg);

	if (map->rd_table)
		return regmap_check_range_table(map, reg, map->rd_table);

	return true;
}

/*
 * Is @reg volatile? Unreadable registers are reported as non-volatile,
 * except on format_write maps where the readability test is skipped.
 * Without a volatile_reg callback or volatile_table, a register is
 * volatile exactly when the map has no cache_ops.
 */
bool regmap_volatile(struct regmap *map, unsigned int reg)
{
	if (!map->format.format_write && !regmap_readable(map, reg))
		return false;

	if (map->volatile_reg)
		return map->volatile_reg(map->dev, reg);

	if (map->volatile_table)
		return regmap_check_range_table(map, reg, map->volatile_table);

	if (map->cache_ops)
		return false;
	else
		return true;
}

/*
 * Is @reg precious? Only readable registers can be; the precious_reg
 * callback, then the precious_table, decide. Default is false.
 */
bool regmap_precious(struct regmap *map, unsigned int reg)
{
	if (!regmap_readable(map, reg))
		return false;

	if (map->precious_reg)
		return map->precious_reg(map->dev, reg);

	if (map->precious_table)
		return regmap_check_range_table(map, reg, map->precious_table);

	return false;
}

/*
 * Ask the writeable_noinc_reg callback, then the wr_noinc_table, about
 * @reg. Note that the result is true when neither is configured.
 */
bool regmap_writeable_noinc(struct regmap *map, unsigned int reg)
{
	if (map->writeable_noinc_reg)
		return map->writeable_noinc_reg(map->dev, reg);

	if (map->wr_noinc_table)
		return regmap_check_range_table(map, reg, map->wr_noinc_table);

	return true;
}

/*
 * Ask the readable_noinc_reg callback, then the rd_noinc_table, about
 * @reg. Note that the result is true when neither is configured.
 */
bool regmap_readable_noinc(struct regmap *map, unsigned int reg)
{
	if (map->readable_noinc_reg)
		return map->readable_noinc_reg(map->dev, reg);

	if (map->rd_noinc_table)
		return regmap_check_range_table(map, reg, map->rd_noinc_table);

	return true;
}

/*
 * Return true only if every one of the @num registers starting at @reg
 * (register i is at reg + regmap_get_offset(map, i)) is volatile.
 * @num == 0 gives true.
 */
static bool regmap_volatile_range(struct regmap *map, unsigned int reg,
	size_t num)
{
	unsigned int i;

	for (i = 0; i < num; i++)
		if (!regmap_volatile(map, reg + regmap_get_offset(map, i)))
			return false;

	return true;
}

/*
 * Combined "format_write" packers: each one packs @reg and @val together
 * into map->work_buf, most significant bits first, with @reg in the upper
 * bits. None of them masks its inputs, so bits of @val above the value
 * width would be OR-ed into the register bits.
 */

/* 4 bytes: reg in the upper 12 bits, val in the lower 20. */
static void regmap_format_12_20_write(struct regmap *map,
				     unsigned int reg, unsigned int val)
{
	u8 *out = map->work_buf;

	out[0] = reg >> 4;
	out[1] = (reg << 4) | (val >> 16);
	out[2] = val >> 8;
	out[3] = val;
}

/* 1 byte: reg shifted up by 6, val in the lower 6 bits. */
static void regmap_format_2_6_write(struct regmap *map,
				     unsigned int reg, unsigned int val)
{
	u8 *out = map->work_buf;

	*out = (reg << 6) | val;
}

/* Big-endian 16-bit word: reg shifted up by 12, val in the lower 12 bits. */
static void regmap_format_4_12_write(struct regmap *map,
				     unsigned int reg, unsigned int val)
{
	__be16 *out = map->work_buf;
	*out = cpu_to_be16((reg << 12) | val);
}

/* Big-endian 16-bit word: reg shifted up by 9, val in the lower 9 bits. */
static void regmap_format_7_9_write(struct regmap *map,
				    unsigned int reg, unsigned int val)
{
	__be16 *out = map->work_buf;
	*out = cpu_to_be16((reg << 9) | val);
}

/* 3 bytes: reg shifted up by 1 in byte 0 together with bit 16 of val. */
static void regmap_format_7_17_write(struct regmap *map,
				    unsigned int reg, unsigned int val)
{
	u8 *out = map->work_buf;

	out[2] = val;
	out[1] = val >> 8;
	out[0] = (val >> 16) | (reg << 1);
}

/* 3 bytes: upper reg bits in byte 0, low 2 reg bits atop val in byte 1. */
static void regmap_format_10_14_write(struct regmap *map,
				    unsigned int reg, unsigned int val)
{
	u8 *out = map->work_buf;

	out[2] = val;
	out[1] = (val >> 8) | (reg << 6);
	out[0] = reg >> 2;
}

/*
 * Separate register/value formatters: store (val << shift) into @buf with
 * the width and byte order given in the function name. The be/le variants
 * use the put_unaligned_*() helpers; the native variants memcpy() a local
 * of the right width.
 */
static void regmap_format_8(void *buf, unsigned int val, unsigned int shift)
{
	u8 *b = buf;

	b[0] = val << shift;
}

static void regmap_format_16_be(void *buf, unsigned int val, unsigned int shift)
{
	put_unaligned_be16(val << shift, buf);
}

static void regmap_format_16_le(void *buf, unsigned int val, unsigned int shift)
{
	put_unaligned_le16(val << shift, buf);
}

static void regmap_format_16_native(void *buf, unsigned int val,
				    unsigned int shift)
{
	u16 v = val << shift;

	memcpy(buf, &v, sizeof(v));
}

static void regmap_format_24_be(void *buf, unsigned int val, unsigned int shift)
{
	put_unaligned_be24(val << shift, buf);
}

static void regmap_format_32_be(void *buf, unsigned int val, unsigned int shift)
{
	put_unaligned_be32(val << shift, buf);
}

static void regmap_format_32_le(void *buf, unsigned int val, unsigned int shift)
{
	put_unaligned_le32(val << shift, buf);
}

static void regmap_format_32_native(void *buf, unsigned int val,
				    unsigned int shift)
{
	u32 v = val << shift;

	memcpy(buf, &v, sizeof(v));
}

/*
 * Parsers, the inverse of the formatters above. The plain variants return
 * the value read from @buf; the *_inplace variants read the value in the
 * named byte order and write it back to the same buffer in CPU order.
 */

/* In-place parse that leaves the buffer untouched. */
static void regmap_parse_inplace_noop(void *buf)
{
}

static unsigned int regmap_parse_8(const void *buf)
{
	const u8 *b = buf;

	return b[0];
}

static unsigned int regmap_parse_16_be(const void *buf)
{
	return get_unaligned_be16(buf);
}

static unsigned int regmap_parse_16_le(const void *buf)
{
	return get_unaligned_le16(buf);
}

static void regmap_parse_16_be_inplace(void *buf)
{
	u16 v = get_unaligned_be16(buf);

	memcpy(buf, &v, sizeof(v));
}

static void regmap_parse_16_le_inplace(void *buf)
{
	u16 v = get_unaligned_le16(buf);

	memcpy(buf, &v, sizeof(v));
}

static unsigned int regmap_parse_16_native(const void *buf)
{
	u16 v;

	memcpy(&v, buf, sizeof(v));
	return v;
}

static unsigned int regmap_parse_24_be(const void *buf)
{
	return get_unaligned_be24(buf);
}

static unsigned int regmap_parse_32_be(const void *buf)
{
	return get_unaligned_be32(buf);
}

static unsigned int regmap_parse_32_le(const void *buf)
{
	return get_unaligned_le32(buf);
}

static void regmap_parse_32_be_inplace(void *buf)
{
	u32 v = get_unaligned_be32(buf);

	memcpy(buf, &v, sizeof(v));
}

static void regmap_parse_32_le_inplace(void *buf)
{
	u32 v = get_unaligned_le32(buf);

	memcpy(buf, &v, sizeof(v));
}

static unsigned int regmap_parse_32_native(const void *buf)
{
	u32 v;

	memcpy(&v, buf, sizeof(v));
	return v;
}

/*
 * Lock/unlock callbacks. Each takes the struct regmap as an opaque
 * pointer so that it can be stored in map->lock / map->unlock.
 *
 * The hwspinlock variants pass UINT_MAX as the timeout and do not look
 * at the return value of the hwspin_lock_timeout*() call.
 */
static void regmap_lock_hwlock(void *__map)
{
	struct regmap *map = __map;

	hwspin_lock_timeout(map->hwlock, UINT_MAX);
}

static void regmap_lock_hwlock_irq(void *__map)
{
	struct regmap *map = __map;

	hwspin_lock_timeout_irq(map->hwlock, UINT_MAX);
}

/* The saved IRQ flags are parked in map->spinlock_flags for the unlock. */
static void regmap_lock_hwlock_irqsave(void *__map)
{
	struct regmap *map = __map;
	unsigned long flags = 0;

	hwspin_lock_timeout_irqsave(map->hwlock, UINT_MAX,
				    &flags);
	map->spinlock_flags = flags;
}

static void regmap_unlock_hwlock(void *__map)
{
	struct regmap *map = __map;

	hwspin_unlock(map->hwlock);
}

static void regmap_unlock_hwlock_irq(void *__map)
{
	struct regmap *map = __map;

	hwspin_unlock_irq(map->hwlock);
}

static void regmap_unlock_hwlock_irqrestore(void *__map)
{
	struct regmap *map = __map;

	hwspin_unlock_irqrestore(map->hwlock, &map->spinlock_flags);
}

/* Shared no-op used for both lock and unlock when locking is disabled. */
static void regmap_lock_unlock_none(void *__map)
{

}

static void regmap_lock_mutex(void *__map)
{
	struct regmap *map = __map;
	mutex_lock(&map->mutex);
}

static void regmap_unlock_mutex(void *__map)
{
	struct regmap *map = __map;
	mutex_unlock(&map->mutex);
}

/* IRQ flags are saved in map->spinlock_flags until the matching unlock. */
static void regmap_lock_spinlock(void *__map)
__acquires(&map->spinlock)
{
	struct regmap *map = __map;
	unsigned long flags;

	spin_lock_irqsave(&map->spinlock, flags);
	map->spinlock_flags = flags;
}

static void regmap_unlock_spinlock(void *__map)
__releases(&map->spinlock)
{
	struct regmap *map = __map;
	spin_unlock_irqrestore(&map->spinlock, map->spinlock_flags);
}

/* IRQ flags are saved in map->raw_spinlock_flags until the unlock. */
static void regmap_lock_raw_spinlock(void *__map)
__acquires(&map->raw_spinlock)
{
	struct regmap *map = __map;
	unsigned long flags;

	raw_spin_lock_irqsave(&map->raw_spinlock, flags);
	map->raw_spinlock_flags = flags;
}

static void regmap_unlock_raw_spinlock(void *__map)
__releases(&map->raw_spinlock)
{
	struct regmap *map = __map;
	raw_spin_unlock_irqrestore(&map->raw_spinlock, map->raw_spinlock_flags);
}

/* devres release callback for the entry added by regmap_attach_dev(). */
static void dev_get_regmap_release(struct device *dev, void *res)
{
	/*
	 * We don't actually have anything to do here; the goal here
	 * is not to manage the regmap but to provide a simple way to
	 * get the regmap back given a struct device.
	 */
}

/*
 * Insert @data into map->range_tree, ordered by [range_min, range_max].
 * Returns false, without inserting, if @data overlaps a node that is
 * already in the tree; true on success.
 */
static bool _regmap_range_add(struct regmap *map,
			      struct regmap_range_node *data)
{
	struct rb_root *root = &map->range_tree;
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
		struct regmap_range_node *this =
			rb_entry(*new, struct regmap_range_node, node);

		parent = *new;
		if (data->range_max < this->range_min)
			new = &((*new)->rb_left);
		else if (data->range_min > this->range_max)
			new = &((*new)->rb_right);
		else
			return false;
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);

	return true;
}

/*
 * Find the range node whose [range_min, range_max] contains @reg.
 * Returns NULL when @reg is not inside any registered range.
 */
static struct regmap_range_node *_regmap_range_lookup(struct regmap *map,
						      unsigned int reg)
{
	struct rb_node *node = map->range_tree.rb_node;

	while (node) {
		struct regmap_range_node *this =
			rb_entry(node, struct regmap_range_node, node);

		if (reg < this->range_min)
			node = node->rb_left;
		else if (reg > this->range_max)
			node = node->rb_right;
		else
			return this;
	}

	return NULL;
}

/* Erase and free every node of map->range_tree, then selector_work_buf. */
static void regmap_range_exit(struct regmap *map)
{
	struct rb_node *next;
	struct regmap_range_node *range_node;

	next = rb_first(&map->range_tree);
	while (next) {
		range_node = rb_entry(next, struct regmap_range_node, node);
		/* Fetch the successor before the current node is erased */
		next = rb_next(&range_node->node);
		rb_erase(&range_node->node, &map->range_tree);
		kfree(range_node);
	}

	kfree(map->selector_work_buf);
}

/*
 * Replace map->name with a copy of config->name. Does nothing when
 * config->name is NULL. On allocation failure the old name is kept and
 * -ENOMEM is returned; otherwise returns 0.
 */
static int regmap_set_name(struct regmap *map, const struct regmap_config *config)
{
	if (config->name) {
		const char *name = kstrdup_const(config->name,
						 GFP_KERNEL);
		if (!name)
			return -ENOMEM;

		kfree_const(map->name);
		map->name = name;
	}

	return 0;
}

/*
 * Bind @map to @dev: record the device, refresh the map name from
 * @config, re-create the debugfs entries and add a devres entry that
 * points back at @map.
 *
 * Returns 0 on success or -ENOMEM. If the devres allocation fails,
 * debugfs is torn down again but map->dev and map->name stay updated.
 */
int regmap_attach_dev(struct device *dev, struct regmap *map,
		      const struct regmap_config *config)
{
	struct regmap **m;
	int ret;

	map->dev = dev;

	ret = regmap_set_name(map, config);
	if (ret)
		return ret;

	regmap_debugfs_exit(map);
	regmap_debugfs_init(map);

	/* Add a devres resource for dev_get_regmap() */
	m = devres_alloc(dev_get_regmap_release, sizeof(*m), GFP_KERNEL);
	if (!m) {
		regmap_debugfs_exit(map);
		return -ENOMEM;
	}
	*m = map;
	devres_add(dev, m);

	return 0;
}
EXPORT_SYMBOL_GPL(regmap_attach_dev);

static int dev_get_regmap_match(struct device *dev, void *res, void *data);

/*
 * Release the devres entry of @dev that was allocated with
 * dev_get_regmap_release, using dev_get_regmap_match with map->name as
 * match data. Returns 0 when @dev is NULL, otherwise whatever
 * devres_release() returns.
 */
static int regmap_detach_dev(struct device *dev, struct regmap *map)
{
	if (!dev)
		return 0;

	return devres_release(dev, dev_get_regmap_release,
				dev_get_regmap_match, (void *)map->name);
}

/*
 * Pick the byte order used for register addresses. Precedence: the
 * regmap config, then the bus default, then big endian. @bus may be NULL.
 */
static enum regmap_endian regmap_get_reg_endian(const struct regmap_bus *bus,
					const struct regmap_config *config)
{
	enum regmap_endian endian;

	/* Retrieve the endianness specification from the regmap config */
	endian = config->reg_format_endian;

	/* If the regmap config specified a non-default value, use that */
	if (endian != REGMAP_ENDIAN_DEFAULT)
		return endian;

	/* Retrieve the endianness specification from the bus config */
	if (bus && bus->reg_format_endian_default)
		endian = bus->reg_format_endian_default;

	/* If the bus specified a non-default value, use that */
	if (endian != REGMAP_ENDIAN_DEFAULT)
		return endian;

	/* Use this if no other value was found */
	return REGMAP_ENDIAN_BIG;
}

/*
 * Pick the byte order used for register values. Precedence: the regmap
 * config, then the "big-endian" / "little-endian" / "native-endian"
 * firmware properties of @dev, then the bus default, then big endian.
 * Both @dev and @bus may be NULL.
 */
enum regmap_endian regmap_get_val_endian(struct device *dev,
					 const struct regmap_bus *bus,
					 const struct regmap_config *config)
{
	struct fwnode_handle *fwnode = dev ? dev_fwnode(dev) : NULL;
	enum regmap_endian endian;

	/* Retrieve the endianness specification from the regmap config */
	endian = config->val_format_endian;

	/* If the regmap config specified a non-default value, use that */
	if (endian != REGMAP_ENDIAN_DEFAULT)
		return endian;

	/* If the firmware node exists, try to get the endianness from it */
	if (fwnode_property_read_bool(fwnode, "big-endian"))
		endian = REGMAP_ENDIAN_BIG;
	else if (fwnode_property_read_bool(fwnode, "little-endian"))
		endian = REGMAP_ENDIAN_LITTLE;
	else if (fwnode_property_read_bool(fwnode, "native-endian"))
		endian = REGMAP_ENDIAN_NATIVE;

	/* If the endianness was specified in fwnode, use that */
	if (endian != REGMAP_ENDIAN_DEFAULT)
		return endian;

	/* Retrieve the endianness specification from the bus config */
	if (bus && bus->val_format_endian_default)
		endian = bus->val_format_endian_default;

	/* If the bus specified a non-default value, use that */
	if (endian != REGMAP_ENDIAN_DEFAULT)
		return endian;

	/* Use this if no other value was found */
	return REGMAP_ENDIAN_BIG;
}
EXPORT_SYMBOL_GPL(regmap_get_val_endian);

/*
 * Allocate a struct regmap and set it up from @config.
 *
 * @dev:         device to attach the map to; may be NULL, in which case
 *               only the debugfs entries are initialised at the end
 * @bus:         bus operations; may be NULL
 * @bus_context: stored verbatim in map->bus_context
 * @config:      configuration; NULL yields ERR_PTR(-EINVAL)
 * @lock_key:    lockdep class for the internally created lock
 * @lock_name:   lockdep name for the internally created lock
 *
 * The function proceeds in stages: locking set-up, copying of the plain
 * configuration fields, selection of the I/O operations, selection of
 * the register/value formatters, registration of the indirect ranges,
 * cache initialisation and finally attachment to @dev.
 *
 * Returns the new map, or an ERR_PTR(): -ENOMEM for allocation failures,
 * -ENXIO when the hwspinlock cannot be obtained, -EINVAL for an
 * unsupported format or an invalid range, or the error reported by
 * regmap_set_name(), regcache_init() or regmap_attach_dev(). On every
 * failure path @bus is kfree()d if bus->free_on_exit is set.
 */
struct regmap *__regmap_init(struct device *dev,
			     const struct regmap_bus *bus,
			     void *bus_context,
			     const struct regmap_config *config,
			     struct lock_class_key *lock_key,
			     const char *lock_name)
{
	struct regmap *map;
	int ret = -EINVAL;
	enum regmap_endian reg_endian, val_endian;
	int i, j;

	if (!config)
		goto err;

	map = kzalloc_obj(*map);
	if (map == NULL) {
		ret = -ENOMEM;
		goto err;
	}

	ret = regmap_set_name(map, config);
	if (ret)
		goto err_map;

	ret = -EINVAL; /* Later error paths rely on this */

	/*
	 * Locking, first match wins: no locking at all, caller supplied
	 * callbacks, a hardware spinlock, or an internal lock (spinlock or
	 * raw spinlock for fast_io, mutex otherwise).
	 */
	if (config->disable_locking) {
		map->lock = map->unlock = regmap_lock_unlock_none;
		map->can_sleep = config->can_sleep;
		regmap_debugfs_disable(map);
	} else if (config->lock && config->unlock) {
		map->lock = config->lock;
		map->unlock = config->unlock;
		map->lock_arg = config->lock_arg;
		map->can_sleep = config->can_sleep;
	} else if (config->use_hwlock) {
		map->hwlock = hwspin_lock_request_specific(config->hwlock_id);
		if (!map->hwlock) {
			ret = -ENXIO;
			goto err_name;
		}

		switch (config->hwlock_mode) {
		case HWLOCK_IRQSTATE:
			map->lock = regmap_lock_hwlock_irqsave;
			map->unlock = regmap_unlock_hwlock_irqrestore;
			break;
		case HWLOCK_IRQ:
			map->lock = regmap_lock_hwlock_irq;
			map->unlock = regmap_unlock_hwlock_irq;
			break;
		default:
			map->lock = regmap_lock_hwlock;
			map->unlock = regmap_unlock_hwlock;
			break;
		}

		map->lock_arg = map;
	} else {
		if ((bus && bus->fast_io) ||
		    config->fast_io) {
			if (config->use_raw_spinlock) {
				raw_spin_lock_init(&map->raw_spinlock);
				map->lock = regmap_lock_raw_spinlock;
				map->unlock = regmap_unlock_raw_spinlock;
				lockdep_set_class_and_name(&map->raw_spinlock,
							   lock_key, lock_name);
			} else {
				spin_lock_init(&map->spinlock);
				map->lock = regmap_lock_spinlock;
				map->unlock = regmap_unlock_spinlock;
				lockdep_set_class_and_name(&map->spinlock,
							   lock_key, lock_name);
			}
		} else {
			mutex_init(&map->mutex);
			map->lock = regmap_lock_mutex;
			map->unlock = regmap_unlock_mutex;
			map->can_sleep = true;
			lockdep_set_class_and_name(&map->mutex,
						   lock_key, lock_name);
		}
		map->lock_arg = map;
		map->lock_key = lock_key;
	}

	/*
	 * When we write in fast-paths with regmap_bulk_write() don't allocate
	 * scratch buffers with sleeping allocations.
	 */
	if ((bus && bus->fast_io) || config->fast_io)
		map->alloc_flags = GFP_ATOMIC;
	else
		map->alloc_flags = GFP_KERNEL;

	map->reg_base = config->reg_base;
	/* Sub-byte part of the padding; whole bytes go to format.pad_bytes */
	map->reg_shift = config->pad_bits % 8;

	map->format.pad_bytes = config->pad_bits / 8;
	map->format.reg_shift = config->reg_shift;
	map->format.reg_bytes = BITS_TO_BYTES(config->reg_bits);
	map->format.val_bytes = BITS_TO_BYTES(config->val_bits);
	map->format.buf_size = BITS_TO_BYTES(config->reg_bits + config->val_bits + config->pad_bits);
	/* A stride of 0 in the config means 1 */
	if (config->reg_stride)
		map->reg_stride = config->reg_stride;
	else
		map->reg_stride = 1;
	/* reg_stride_order is -1 for strides that are not a power of two */
	if (is_power_of_2(map->reg_stride))
		map->reg_stride_order = ilog2(map->reg_stride);
	else
		map->reg_stride_order = -1;
	/* Single-register access is forced when no bulk operation exists */
	map->use_single_read = config->use_single_read || !(config->read || (bus && bus->read));
	map->use_single_write = config->use_single_write || !(config->write || (bus && bus->write));
	map->can_multi_write = config->can_multi_write && (config->write || (bus && bus->write));
	/*
	 * With a bus its raw transfer limits are used; without one the
	 * config limits apply, but only when both are non-zero.
	 */
	if (bus) {
		map->max_raw_read = bus->max_raw_read;
		map->max_raw_write = bus->max_raw_write;
	} else if (config->max_raw_read && config->max_raw_write) {
		map->max_raw_read = config->max_raw_read;
		map->max_raw_write = config->max_raw_write;
	}
	map->dev = dev;
	map->bus = bus;
	map->bus_context = bus_context;
	map->max_register = config->max_register;
	/*
	 * A non-zero max_register always counts as set; a zero one only
	 * when the config says so through max_register_is_0.
	 */
	map->max_register_is_set = map->max_register ?: config->max_register_is_0;
	map->wr_table = config->wr_table;
	map->rd_table = config->rd_table;
	map->volatile_table = config->volatile_table;
	map->precious_table = config->precious_table;
	map->wr_noinc_table = config->wr_noinc_table;
	map->rd_noinc_table = config->rd_noinc_table;
	map->writeable_reg = config->writeable_reg;
	map->readable_reg = config->readable_reg;
	map->volatile_reg = config->volatile_reg;
	map->precious_reg = config->precious_reg;
	map->writeable_noinc_reg = config->writeable_noinc_reg;
	map->readable_noinc_reg = config->readable_noinc_reg;
	map->reg_default_cb = config->reg_default_cb;
	map->cache_type = config->cache_type;

	spin_lock_init(&map->async_lock);
	INIT_LIST_HEAD(&map->async_list);
	INIT_LIST_HEAD(&map->async_free);
	init_waitqueue_head(&map->async_waitq);

	/*
	 * Flag masks come from the config if it sets any of them (or sets
	 * zero_flag_mask); otherwise only the bus read mask is inherited.
	 */
	if (config->read_flag_mask ||
	    config->write_flag_mask ||
	    config->zero_flag_mask) {
		map->read_flag_mask = config->read_flag_mask;
		map->write_flag_mask = config->write_flag_mask;
	} else if (bus) {
		map->read_flag_mask = bus->read_flag_mask;
	}

	/*
	 * Choose the I/O operations. The two middle branches (no bus, or a
	 * bus lacking bulk read/write) work on whole registers and skip the
	 * formatter set-up below, leaving reg_endian/val_endian unset.
	 */
	if (config->read && config->write) {
		map->reg_read  = _regmap_bus_read;
		if (config->reg_update_bits)
			map->reg_update_bits = config->reg_update_bits;

		/* Bulk read/write */
		map->read = config->read;
		map->write = config->write;

		reg_endian = REGMAP_ENDIAN_NATIVE;
		val_endian = REGMAP_ENDIAN_NATIVE;
	} else if (!bus) {
		map->reg_read  = config->reg_read;
		map->reg_write = config->reg_write;
		map->reg_update_bits = config->reg_update_bits;

		map->defer_caching = false;
		goto skip_format_initialization;
	} else if (!bus->read || !bus->write) {
		map->reg_read = _regmap_bus_reg_read;
		map->reg_write = _regmap_bus_reg_write;
		map->reg_update_bits = bus->reg_update_bits;

		map->defer_caching = false;
		goto skip_format_initialization;
	} else {
		map->reg_read  = _regmap_bus_read;
		map->reg_update_bits = bus->reg_update_bits;
		/* Bulk read/write */
		map->read = bus->read;
		map->write = bus->write;

		reg_endian = regmap_get_reg_endian(bus, config);
		val_endian = regmap_get_val_endian(dev, bus, config);
	}

	/*
	 * Register formatter. Widths that are not a multiple of 8 only exist
	 * as combined reg+val packers (format_write) and require a matching
	 * val_bits; byte-multiple widths get a stand-alone format_reg.
	 * Anything else fails with the -EINVAL set above.
	 */
	switch (config->reg_bits + map->reg_shift) {
	case 2:
		switch (config->val_bits) {
		case 6:
			map->format.format_write = regmap_format_2_6_write;
			break;
		default:
			goto err_hwlock;
		}
		break;

	case 4:
		switch (config->val_bits) {
		case 12:
			map->format.format_write = regmap_format_4_12_write;
			break;
		default:
			goto err_hwlock;
		}
		break;

	case 7:
		switch (config->val_bits) {
		case 9:
			map->format.format_write = regmap_format_7_9_write;
			break;
		case 17:
			map->format.format_write = regmap_format_7_17_write;
			break;
		default:
			goto err_hwlock;
		}
		break;

	case 10:
		switch (config->val_bits) {
		case 14:
			map->format.format_write = regmap_format_10_14_write;
			break;
		default:
			goto err_hwlock;
		}
		break;

	case 12:
		switch (config->val_bits) {
		case 20:
			map->format.format_write = regmap_format_12_20_write;
			break;
		default:
			goto err_hwlock;
		}
		break;

	case 8:
		map->format.format_reg = regmap_format_8;
		break;

	case 16:
		switch (reg_endian) {
		case REGMAP_ENDIAN_BIG:
			map->format.format_reg = regmap_format_16_be;
			break;
		case REGMAP_ENDIAN_LITTLE:
			map->format.format_reg = regmap_format_16_le;
			break;
		case REGMAP_ENDIAN_NATIVE:
			map->format.format_reg = regmap_format_16_native;
			break;
		default:
			goto err_hwlock;
		}
		break;

	case 24:
		/* 24-bit registers are only supported big endian */
		switch (reg_endian) {
		case REGMAP_ENDIAN_BIG:
			map->format.format_reg = regmap_format_24_be;
			break;
		default:
			goto err_hwlock;
		}
		break;

	case 32:
		switch (reg_endian) {
		case REGMAP_ENDIAN_BIG:
			map->format.format_reg = regmap_format_32_be;
			break;
		case REGMAP_ENDIAN_LITTLE:
			map->format.format_reg = regmap_format_32_le;
			break;
		case REGMAP_ENDIAN_NATIVE:
			map->format.format_reg = regmap_format_32_native;
			break;
		default:
			goto err_hwlock;
		}
		break;

	default:
		goto err_hwlock;
	}

	/* Native-endian values need no conversion when parsed in place */
	if (val_endian == REGMAP_ENDIAN_NATIVE)
		map->format.parse_inplace = regmap_parse_inplace_noop;

	/*
	 * Value formatter/parsers. There is no default case: an unsupported
	 * val_bits leaves format_val NULL, which the check after the next
	 * block rejects unless a combined format_write was selected above.
	 * The 24-bit case sets no parse_inplace.
	 */
	switch (config->val_bits) {
	case 8:
		map->format.format_val = regmap_format_8;
		map->format.parse_val = regmap_parse_8;
		map->format.parse_inplace = regmap_parse_inplace_noop;
		break;
	case 16:
		switch (val_endian) {
		case REGMAP_ENDIAN_BIG:
			map->format.format_val = regmap_format_16_be;
			map->format.parse_val = regmap_parse_16_be;
			map->format.parse_inplace = regmap_parse_16_be_inplace;
			break;
		case REGMAP_ENDIAN_LITTLE:
			map->format.format_val = regmap_format_16_le;
			map->format.parse_val = regmap_parse_16_le;
			map->format.parse_inplace = regmap_parse_16_le_inplace;
			break;
		case REGMAP_ENDIAN_NATIVE:
			map->format.format_val = regmap_format_16_native;
			map->format.parse_val = regmap_parse_16_native;
			break;
		default:
			goto err_hwlock;
		}
		break;
	case 24:
		switch (val_endian) {
		case REGMAP_ENDIAN_BIG:
			map->format.format_val = regmap_format_24_be;
			map->format.parse_val = regmap_parse_24_be;
			break;
		default:
			goto err_hwlock;
		}
		break;
	case 32:
		switch (val_endian) {
		case REGMAP_ENDIAN_BIG:
			map->format.format_val = regmap_format_32_be;
			map->format.parse_val = regmap_parse_32_be;
			map->format.parse_inplace = regmap_parse_32_be_inplace;
			break;
		case REGMAP_ENDIAN_LITTLE:
			map->format.format_val = regmap_format_32_le;
			map->format.parse_val = regmap_parse_32_le;
			map->format.parse_inplace = regmap_parse_32_le_inplace;
			break;
		case REGMAP_ENDIAN_NATIVE:
			map->format.format_val = regmap_format_32_native;
			map->format.parse_val = regmap_parse_32_native;
			break;
		default:
			goto err_hwlock;
		}
		break;
	}

	/* The combined packers are big endian only and write one register */
	if (map->format.format_write) {
		if ((reg_endian != REGMAP_ENDIAN_BIG) ||
		    (val_endian != REGMAP_ENDIAN_BIG))
			goto err_hwlock;
		map->use_single_write = true;
	}

	/* Need either a combined packer or both separate formatters */
	if (!map->format.format_write &&
	    !(map->format.format_reg && map->format.format_val))
		goto err_hwlock;

	map->work_buf = kzalloc(map->format.buf_size, GFP_KERNEL);
	if (map->work_buf == NULL) {
		ret = -ENOMEM;
		goto err_hwlock;
	}

	if (map->format.format_write) {
		map->defer_caching = false;
		map->reg_write = _regmap_bus_formatted_write;
	} else if (map->format.format_val) {
		map->defer_caching = true;
		map->reg_write = _regmap_bus_raw_write;
	}

skip_format_initialization:

	/* Validate and register the indirectly accessed (paged) ranges */
	map->range_tree = RB_ROOT;
	for (i = 0; i < config->num_ranges; i++) {
		const struct regmap_range_cfg *range_cfg = &config->ranges[i];
		struct regmap_range_node *new;

		/* Sanity check */
		if (range_cfg->range_max < range_cfg->range_min) {
			dev_err(map->dev, "Invalid range %d: %u < %u\n", i,
				range_cfg->range_max, range_cfg->range_min);
			goto err_range;
		}

		if (range_cfg->range_max > map->max_register) {
			dev_err(map->dev, "Invalid range %d: %u > %u\n", i,
				range_cfg->range_max, map->max_register);
			goto err_range;
		}

		if (range_cfg->selector_reg > map->max_register) {
			dev_err(map->dev,
				"Invalid range %d: selector out of map\n", i);
			goto err_range;
		}

		if (range_cfg->window_len == 0) {
			dev_err(map->dev, "Invalid range %d: window_len 0\n",
				i);
			goto err_range;
		}

		/*
		 * Make sure that this register range has no selector
		 * or data window within its boundary
		 */
		for (j = 0; j < config->num_ranges; j++) {
			unsigned int sel_reg = config->ranges[j].selector_reg;
			unsigned int win_min = config->ranges[j].window_start;
			unsigned int win_max = win_min +
					       config->ranges[j].window_len - 1;

			/* Allow data window inside its own virtual range */
			if (j == i)
				continue;

			if (range_cfg->range_min <= sel_reg &&
			    sel_reg <= range_cfg->range_max) {
				dev_err(map->dev,
					"Range %d: selector for %d in window\n",
					i, j);
				goto err_range;
			}

			if (!(win_max < range_cfg->range_min ||
			      win_min > range_cfg->range_max)) {
				dev_err(map->dev,
					"Range %d: window for %d in window\n",
					i, j);
				goto err_range;
			}
		}

		new = kzalloc_obj(*new);
		if (new == NULL) {
			ret = -ENOMEM;
			goto err_range;
		}

		new->map = map;
		new->name = range_cfg->name;
		new->range_min = range_cfg->range_min;
		new->range_max = range_cfg->range_max;
		new->selector_reg = range_cfg->selector_reg;
		new->selector_mask = range_cfg->selector_mask;
		new->selector_shift = range_cfg->selector_shift;
		new->window_start = range_cfg->window_start;
		new->window_len = range_cfg->window_len;

		/* Fails when the new range overlaps one already in the tree */
		if (!_regmap_range_add(map, new)) {
			dev_err(map->dev, "Failed to add range %d\n", i);
			kfree(new);
			goto err_range;
		}

		/* One scratch buffer is shared by all ranges, allocated once */
		if (map->selector_work_buf == NULL) {
			map->selector_work_buf =
				kzalloc(map->format.buf_size, GFP_KERNEL);
			if (map->selector_work_buf == NULL) {
				ret = -ENOMEM;
				goto err_range;
			}
		}
	}

	ret = regcache_init(map, config);
	if (ret != 0)
		goto err_range;

	if (dev) {
		ret = regmap_attach_dev(dev, map, config);
		if (ret != 0)
			goto err_regcache;
	} else {
		regmap_debugfs_init(map);
	}

	return map;

	/* Unwind in reverse order of set-up; each label falls through */
err_regcache:
	regcache_exit(map);
err_range:
	regmap_range_exit(map);
	kfree(map->work_buf);
err_hwlock:
	if (map->hwlock)
		hwspin_lock_free(map->hwlock);
err_name:
	kfree_const(map->name);
err_map:
	kfree(map);
err:
	if (bus && bus->free_on_exit)
		kfree(bus);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(__regmap_init);

/* devres release callback: tear down the regmap stored in the resource. */
static void devm_regmap_release(struct device *dev, void *res)
{
	regmap_exit(*(struct regmap **)res);
}

/*
 * Device-managed variant of __regmap_init(): on success the new map is
 * recorded in a devres entry of @dev whose release callback calls
 * regmap_exit() on it. Returns the map or an ERR_PTR(); -ENOMEM if the
 * devres entry itself cannot be allocated, in which case __regmap_init()
 * is not called at all.
 */
struct regmap *__devm_regmap_init(struct device *dev,
				  const struct regmap_bus *bus,
				  void *bus_context,
				  const struct regmap_config *config,
				  struct lock_class_key *lock_key,
				  const char *lock_name)
{
	struct regmap **ptr, *regmap;

	ptr = devres_alloc(devm_regmap_release, sizeof(*ptr), GFP_KERNEL);
	if (!ptr)
		return ERR_PTR(-ENOMEM);

	regmap = __regmap_init(dev, bus, bus_context, config,
			       lock_key, lock_name);
	if (!IS_ERR(regmap)) {
		*ptr = regmap;
		devres_add(dev, ptr);
	} else {
		devres_free(ptr);
	}

	return regmap;
}
EXPORT_SYMBOL_GPL(__devm_regmap_init);

/*
 * Fill @rm_field from the @reg_field description: register, shift (the
 * lsb), mask (GENMASK(msb, lsb)) and the id_size/id_offset pair. Warns
 * once if the resulting mask is empty, but still completes the set-up.
 */
static void regmap_field_init(struct regmap_field *rm_field,
	struct regmap *regmap, struct reg_field reg_field)
{
	rm_field->regmap = regmap;
	rm_field->reg = reg_field.reg;
	rm_field->shift = reg_field.lsb;
	rm_field->mask = GENMASK(reg_field.msb, reg_field.lsb);

	WARN_ONCE(rm_field->mask == 0, "invalid empty mask defined\n");

	rm_field->id_size = reg_field.id_size;
	rm_field->id_offset = reg_field.id_offset;
}

/**
 * devm_regmap_field_alloc() - Allocate and initialise a register field.
 *
 * @dev: Device that will be interacted with
 * @regmap: regmap bank in which this register field is located.
 * @reg_field: Register field within the bank.
 *
 * The return value will be an ERR_PTR() on error or a valid pointer
 * to a struct regmap_field. The regmap_field will be automatically freed
 * by the device management code.
 */
struct regmap_field *devm_regmap_field_alloc(struct device *dev,
		struct regmap *regmap, struct reg_field reg_field)
{
	struct regmap_field *rm_field = devm_kzalloc(dev,
					sizeof(*rm_field), GFP_KERNEL);
	if (!rm_field)
		return ERR_PTR(-ENOMEM);

	regmap_field_init(rm_field, regmap, reg_field);

	return rm_field;

}
EXPORT_SYMBOL_GPL(devm_regmap_field_alloc);


/**
 * regmap_field_bulk_alloc() - Allocate and initialise a bulk register field.
 *
 * @regmap: regmap bank in which this register field is located.
 * @rm_field: regmap register fields within the bank.
* @reg_field: Register fields within the bank. * @num_fields: Number of register fields. * * The return value will be an -ENOMEM on error or zero for success. * Newly allocated regmap_fields should be freed by calling * regmap_field_bulk_free() */ int regmap_field_bulk_alloc(struct regmap *regmap, struct regmap_field **rm_field, const struct reg_field *reg_field, int num_fields) { struct regmap_field *rf; int i; rf = kzalloc_objs(*rf, num_fields); if (!rf) return -ENOMEM; for (i = 0; i < num_fields; i++) { regmap_field_init(&rf[i], regmap, reg_field[i]); rm_field[i] = &rf[i]; } return 0; } EXPORT_SYMBOL_GPL(regmap_field_bulk_alloc); /** * devm_regmap_field_bulk_alloc() - Allocate and initialise a bulk register * fields. * * @dev: Device that will be interacted with * @regmap: regmap bank in which this register field is located. * @rm_field: regmap register fields within the bank. * @reg_field: Register fields within the bank. * @num_fields: Number of register fields. * * The return value will be an -ENOMEM on error or zero for success. * Newly allocated regmap_fields will be automatically freed by the * device management code. */ int devm_regmap_field_bulk_alloc(struct device *dev, struct regmap *regmap, struct regmap_field **rm_field, const struct reg_field *reg_field, int num_fields) { struct regmap_field *rf; int i; rf = devm_kcalloc(dev, num_fields, sizeof(*rf), GFP_KERNEL); if (!rf) return -ENOMEM; for (i = 0; i < num_fields; i++) { regmap_field_init(&rf[i], regmap, reg_field[i]); rm_field[i] = &rf[i]; } return 0; } EXPORT_SYMBOL_GPL(devm_regmap_field_bulk_alloc); /** * regmap_field_bulk_free() - Free register field allocated using * regmap_field_bulk_alloc. * * @field: regmap fields which should be freed. */ void regmap_field_bulk_free(struct regmap_field *field) { kfree(field); } EXPORT_SYMBOL_GPL(regmap_field_bulk_free); /** * devm_regmap_field_bulk_free() - Free a bulk register field allocated using * devm_regmap_field_bulk_alloc. 
* * @dev: Device that will be interacted with * @field: regmap field which should be freed. * * Free register field allocated using devm_regmap_field_bulk_alloc(). Usually * drivers need not call this function, as the memory allocated via devm * will be freed as per device-driver life-cycle. */ void devm_regmap_field_bulk_free(struct device *dev, struct regmap_field *field) { devm_kfree(dev, field); } EXPORT_SYMBOL_GPL(devm_regmap_field_bulk_free); /** * devm_regmap_field_free() - Free a register field allocated using * devm_regmap_field_alloc. * * @dev: Device that will be interacted with * @field: regmap field which should be freed. * * Free register field allocated using devm_regmap_field_alloc(). Usually * drivers need not call this function, as the memory allocated via devm * will be freed as per device-driver life-cyle. */ void devm_regmap_field_free(struct device *dev, struct regmap_field *field) { devm_kfree(dev, field); } EXPORT_SYMBOL_GPL(devm_regmap_field_free); /** * regmap_field_alloc() - Allocate and initialise a register field. * * @regmap: regmap bank in which this register field is located. * @reg_field: Register field with in the bank. * * The return value will be an ERR_PTR() on error or a valid pointer * to a struct regmap_field. The regmap_field should be freed by the * user once its finished working with it using regmap_field_free(). */ struct regmap_field *regmap_field_alloc(struct regmap *regmap, struct reg_field reg_field) { struct regmap_field *rm_field = kzalloc_obj(*rm_field); if (!rm_field) return ERR_PTR(-ENOMEM); regmap_field_init(rm_field, regmap, reg_field); return rm_field; } EXPORT_SYMBOL_GPL(regmap_field_alloc); /** * regmap_field_free() - Free register field allocated using * regmap_field_alloc. * * @field: regmap field which should be freed. 
*/ void regmap_field_free(struct regmap_field *field) { kfree(field); } EXPORT_SYMBOL_GPL(regmap_field_free); /** * regmap_reinit_cache() - Reinitialise the current register cache * * @map: Register map to operate on. * @config: New configuration. Only the cache data will be used. * * Discard any existing register cache for the map and initialize a * new cache. This can be used to restore the cache to defaults or to * update the cache configuration to reflect runtime discovery of the * hardware. * * No explicit locking is done here, the user needs to ensure that * this function will not race with other calls to regmap. */ int regmap_reinit_cache(struct regmap *map, const struct regmap_config *config) { int ret; regcache_exit(map); regmap_debugfs_exit(map); map->max_register = config->max_register; map->max_register_is_set = map->max_register ?: config->max_register_is_0; map->writeable_reg = config->writeable_reg; map->readable_reg = config->readable_reg; map->volatile_reg = config->volatile_reg; map->precious_reg = config->precious_reg; map->writeable_noinc_reg = config->writeable_noinc_reg; map->readable_noinc_reg = config->readable_noinc_reg; map->reg_default_cb = config->reg_default_cb; map->cache_type = config->cache_type; ret = regmap_set_name(map, config); if (ret) return ret; regmap_debugfs_init(map); map->cache_bypass = false; map->cache_only = false; return regcache_init(map, config); } EXPORT_SYMBOL_GPL(regmap_reinit_cache); /** * regmap_exit() - Free a previously allocated register map * * @map: Register map to operate on. 
*/ void regmap_exit(struct regmap *map) { struct regmap_async *async; regmap_detach_dev(map->dev, map); regcache_exit(map); regmap_debugfs_exit(map); regmap_range_exit(map); if (map->bus && map->bus->free_context) map->bus->free_context(map->bus_context); kfree(map->work_buf); while (!list_empty(&map->async_free)) { async = list_first_entry_or_null(&map->async_free, struct regmap_async, list); list_del(&async->list); kfree(async->work_buf); kfree(async); } if (map->hwlock) hwspin_lock_free(map->hwlock); if (map->lock == regmap_lock_mutex) mutex_destroy(&map->mutex); kfree_const(map->name); kfree(map->patch); if (map->bus && map->bus->free_on_exit) kfree(map->bus); kfree(map); } EXPORT_SYMBOL_GPL(regmap_exit); static int dev_get_regmap_match(struct device *dev, void *res, void *data) { struct regmap **r = res; if (!r || !*r) { WARN_ON(!r || !*r); return 0; } /* If the user didn't specify a name match any */ if (data) return (*r)->name && !strcmp((*r)->name, data); else return 1; } /** * dev_get_regmap() - Obtain the regmap (if any) for a device * * @dev: Device to retrieve the map for * @name: Optional name for the register map, usually NULL. * * Returns the regmap for the device if one is present, or NULL. If * name is specified then it must match the name specified when * registering the device, if it is NULL then the first regmap found * will be used. Devices with multiple register maps are very rare, * generic code should normally not need to specify a name. */ struct regmap *dev_get_regmap(struct device *dev, const char *name) { struct regmap **r = devres_find(dev, dev_get_regmap_release, dev_get_regmap_match, (void *)name); if (!r) return NULL; return *r; } EXPORT_SYMBOL_GPL(dev_get_regmap); /** * regmap_get_device() - Obtain the device from a regmap * * @map: Register map to operate on. * * Returns the underlying device that the regmap has been created for. 
*/ struct device *regmap_get_device(struct regmap *map) { return map->dev; } EXPORT_SYMBOL_GPL(regmap_get_device); static int _regmap_select_page(struct regmap *map, unsigned int *reg, struct regmap_range_node *range, unsigned int val_num) { void *orig_work_buf; unsigned int win_offset; unsigned int win_page; bool page_chg; int ret; win_offset = (*reg - range->range_min) % range->window_len; win_page = (*reg - range->range_min) / range->window_len; if (val_num > 1) { /* Bulk write shouldn't cross range boundary */ if (*reg + val_num - 1 > range->range_max) return -EINVAL; /* ... or single page boundary */ if (val_num > range->window_len - win_offset) return -EINVAL; } /* It is possible to have selector register inside data window. In that case, selector register is located on every page and it needs no page switching, when accessed alone. */ if (val_num > 1 || range->window_start + win_offset != range->selector_reg) { /* Use separate work_buf during page switching */ orig_work_buf = map->work_buf; map->work_buf = map->selector_work_buf; ret = _regmap_update_bits(map, range->selector_reg, range->selector_mask, win_page << range->selector_shift, &page_chg, false); map->work_buf = orig_work_buf; if (ret != 0) return ret; } *reg = range->window_start + win_offset; return 0; } static void regmap_set_work_buf_flag_mask(struct regmap *map, int max_bytes, unsigned long mask) { u8 *buf; int i; if (!mask || !map->work_buf) return; buf = map->work_buf; for (i = 0; i < max_bytes; i++) buf[i] |= (mask >> (8 * i)) & 0xff; } static unsigned int regmap_reg_addr(struct regmap *map, unsigned int reg) { reg += map->reg_base; if (map->format.reg_shift > 0) reg >>= map->format.reg_shift; else if (map->format.reg_shift < 0) reg <<= -(map->format.reg_shift); return reg; } static int _regmap_raw_write_impl(struct regmap *map, unsigned int reg, const void *val, size_t val_len, bool noinc) { struct regmap_range_node *range; unsigned long flags; void *work_val = map->work_buf + 
map->format.reg_bytes + map->format.pad_bytes; void *buf; int ret = -ENOTSUPP; size_t len; int i; /* Check for unwritable or noinc registers in range * before we start */ if (!regmap_writeable_noinc(map, reg)) { for (i = 0; i < val_len / map->format.val_bytes; i++) { unsigned int element = reg + regmap_get_offset(map, i); if (!regmap_writeable(map, element) || regmap_writeable_noinc(map, element)) return -EINVAL; } } if (!map->cache_bypass && map->format.parse_val) { unsigned int ival, offset; int val_bytes = map->format.val_bytes; /* Cache the last written value for noinc writes */ i = noinc ? val_len - val_bytes : 0; for (; i < val_len; i += val_bytes) { ival = map->format.parse_val(val + i); offset = noinc ? 0 : regmap_get_offset(map, i / val_bytes); ret = regcache_write(map, reg + offset, ival); if (ret) { dev_err(map->dev, "Error in caching of register: %x ret: %d\n", reg + offset, ret); return ret; } } if (map->cache_only) { map->cache_dirty = true; return 0; } } range = _regmap_range_lookup(map, reg); if (range) { int val_num = val_len / map->format.val_bytes; int win_offset = (reg - range->range_min) % range->window_len; int win_residue = range->window_len - win_offset; /* If the write goes beyond the end of the window split it */ while (val_num > win_residue) { dev_dbg(map->dev, "Writing window %d/%zu\n", win_residue, val_len / map->format.val_bytes); ret = _regmap_raw_write_impl(map, reg, val, win_residue * map->format.val_bytes, noinc); if (ret != 0) return ret; reg += win_residue; val_num -= win_residue; val += win_residue * map->format.val_bytes; val_len -= win_residue * map->format.val_bytes; win_offset = (reg - range->range_min) % range->window_len; win_residue = range->window_len - win_offset; } ret = _regmap_select_page(map, ®, range, noinc ? 
1 : val_num); if (ret != 0) return ret; } reg = regmap_reg_addr(map, reg); map->format.format_reg(map->work_buf, reg, map->reg_shift); regmap_set_work_buf_flag_mask(map, map->format.reg_bytes, map->write_flag_mask); /* * Essentially all I/O mechanisms will be faster with a single * buffer to write. Since register syncs often generate raw * writes of single registers optimise that case. */ if (val != work_val && val_len == map->format.val_bytes) { memcpy(work_val, val, map->format.val_bytes); val = work_val; } if (map->async && map->bus && map->bus->async_write) { struct regmap_async *async; trace_regmap_async_write_start(map, reg, val_len); spin_lock_irqsave(&map->async_lock, flags); async = list_first_entry_or_null(&map->async_free, struct regmap_async, list); if (async) list_del(&async->list); spin_unlock_irqrestore(&map->async_lock, flags); if (!async) { async = map->bus->async_alloc(); if (!async) return -ENOMEM; async->work_buf = kzalloc(map->format.buf_size, GFP_KERNEL | GFP_DMA); if (!async->work_buf) { kfree(async); return -ENOMEM; } } async->map = map; /* If the caller supplied the value we can use it safely. 
*/ memcpy(async->work_buf, map->work_buf, map->format.pad_bytes + map->format.reg_bytes + map->format.val_bytes); spin_lock_irqsave(&map->async_lock, flags); list_add_tail(&async->list, &map->async_list); spin_unlock_irqrestore(&map->async_lock, flags); if (val != work_val) ret = map->bus->async_write(map->bus_context, async->work_buf, map->format.reg_bytes + map->format.pad_bytes, val, val_len, async); else ret = map->bus->async_write(map->bus_context, async->work_buf, map->format.reg_bytes + map->format.pad_bytes + val_len, NULL, 0, async); if (ret != 0) { dev_err(map->dev, "Failed to schedule write: %d\n", ret); spin_lock_irqsave(&map->async_lock, flags); list_move(&async->list, &map->async_free); spin_unlock_irqrestore(&map->async_lock, flags); } return ret; } trace_regmap_hw_write_start(map, reg, val_len / map->format.val_bytes); /* If we're doing a single register write we can probably just * send the work_buf directly, otherwise try to do a gather * write. */ if (val == work_val) ret = map->write(map->bus_context, map->work_buf, map->format.reg_bytes + map->format.pad_bytes + val_len); else if (map->bus && map->bus->gather_write) ret = map->bus->gather_write(map->bus_context, map->work_buf, map->format.reg_bytes + map->format.pad_bytes, val, val_len); else ret = -ENOTSUPP; /* If that didn't work fall back on linearising by hand. 
*/ if (ret == -ENOTSUPP) { len = map->format.reg_bytes + map->format.pad_bytes + val_len; buf = kzalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, map->work_buf, map->format.reg_bytes); memcpy(buf + map->format.reg_bytes + map->format.pad_bytes, val, val_len); ret = map->write(map->bus_context, buf, len); kfree(buf); } else if (ret != 0 && !map->cache_bypass && map->format.parse_val) { /* regcache_drop_region() takes lock that we already have, * thus call map->cache_ops->drop() directly */ if (map->cache_ops && map->cache_ops->drop) map->cache_ops->drop(map, reg, reg + 1); } trace_regmap_hw_write_done(map, reg, val_len / map->format.val_bytes); return ret; } /** * regmap_can_raw_write - Test if regmap_raw_write() is supported * * @map: Map to check. */ bool regmap_can_raw_write(struct regmap *map) { return map->write && map->format.format_val && map->format.format_reg; } EXPORT_SYMBOL_GPL(regmap_can_raw_write); /** * regmap_get_raw_read_max - Get the maximum size we can read * * @map: Map to check. */ size_t regmap_get_raw_read_max(struct regmap *map) { return map->max_raw_read; } EXPORT_SYMBOL_GPL(regmap_get_raw_read_max); /** * regmap_get_raw_write_max - Get the maximum size we can read * * @map: Map to check. 
*/ size_t regmap_get_raw_write_max(struct regmap *map) { return map->max_raw_write; } EXPORT_SYMBOL_GPL(regmap_get_raw_write_max); static int _regmap_bus_formatted_write(void *context, unsigned int reg, unsigned int val) { int ret; struct regmap_range_node *range; struct regmap *map = context; WARN_ON(!map->format.format_write); range = _regmap_range_lookup(map, reg); if (range) { ret = _regmap_select_page(map, ®, range, 1); if (ret != 0) return ret; } reg = regmap_reg_addr(map, reg); map->format.format_write(map, reg, val); trace_regmap_hw_write_start(map, reg, 1); ret = map->write(map->bus_context, map->work_buf, map->format.buf_size); trace_regmap_hw_write_done(map, reg, 1); return ret; } static int _regmap_bus_reg_write(void *context, unsigned int reg, unsigned int val) { struct regmap *map = context; struct regmap_range_node *range; int ret; range = _regmap_range_lookup(map, reg); if (range) { ret = _regmap_select_page(map, ®, range, 1); if (ret != 0) return ret; } reg = regmap_reg_addr(map, reg); return map->bus->reg_write(map->bus_context, reg, val); } static int _regmap_bus_raw_write(void *context, unsigned int reg, unsigned int val) { struct regmap *map = context; WARN_ON(!map->format.format_val); map->format.format_val(map->work_buf + map->format.reg_bytes + map->format.pad_bytes, val, 0); return _regmap_raw_write_impl(map, reg, map->work_buf + map->format.reg_bytes + map->format.pad_bytes, map->format.val_bytes, false); } static inline void *_regmap_map_get_context(struct regmap *map) { return (map->bus || (!map->bus && map->read)) ? 
map : map->bus_context; } int _regmap_write(struct regmap *map, unsigned int reg, unsigned int val) { int ret; void *context = _regmap_map_get_context(map); if (!regmap_writeable(map, reg)) return -EIO; if (!map->cache_bypass && !map->defer_caching) { ret = regcache_write(map, reg, val); if (ret != 0) return ret; if (map->cache_only) { map->cache_dirty = true; return 0; } } ret = map->reg_write(context, reg, val); if (ret == 0) { if (regmap_should_log(map)) dev_info(map->dev, "%x <= %x\n", reg, val); trace_regmap_reg_write(map, reg, val); } return ret; } /** * regmap_write() - Write a value to a single register * * @map: Register map to write to * @reg: Register to write to * @val: Value to be written * * A value of zero will be returned on success, a negative errno will * be returned in error cases. */ int regmap_write(struct regmap *map, unsigned int reg, unsigned int val) { int ret; if (!IS_ALIGNED(reg, map->reg_stride)) return -EINVAL; map->lock(map->lock_arg); ret = _regmap_write(map, reg, val); map->unlock(map->lock_arg); return ret; } EXPORT_SYMBOL_GPL(regmap_write); /** * regmap_write_async() - Write a value to a single register asynchronously * * @map: Register map to write to * @reg: Register to write to * @val: Value to be written * * A value of zero will be returned on success, a negative errno will * be returned in error cases. 
*/ int regmap_write_async(struct regmap *map, unsigned int reg, unsigned int val) { int ret; if (!IS_ALIGNED(reg, map->reg_stride)) return -EINVAL; map->lock(map->lock_arg); map->async = true; ret = _regmap_write(map, reg, val); map->async = false; map->unlock(map->lock_arg); return ret; } EXPORT_SYMBOL_GPL(regmap_write_async); int _regmap_raw_write(struct regmap *map, unsigned int reg, const void *val, size_t val_len, bool noinc) { size_t val_bytes = map->format.val_bytes; size_t val_count = val_len / val_bytes; size_t chunk_count, chunk_bytes; size_t chunk_regs = val_count; int ret, i; if (!val_count) return -EINVAL; if (map->use_single_write) chunk_regs = 1; else if (map->max_raw_write && val_len > map->max_raw_write) chunk_regs = map->max_raw_write / val_bytes; chunk_count = val_count / chunk_regs; chunk_bytes = chunk_regs * val_bytes; /* Write as many bytes as possible with chunk_size */ for (i = 0; i < chunk_count; i++) { ret = _regmap_raw_write_impl(map, reg, val, chunk_bytes, noinc); if (ret) return ret; reg += regmap_get_offset(map, chunk_regs); val += chunk_bytes; val_len -= chunk_bytes; } /* Write remaining bytes */ if (val_len) ret = _regmap_raw_write_impl(map, reg, val, val_len, noinc); return ret; } /** * regmap_raw_write() - Write raw values to one or more registers * * @map: Register map to write to * @reg: Initial register to write to * @val: Block of data to be written, laid out for direct transmission to the * device * @val_len: Length of data pointed to by val. * * This function is intended to be used for things like firmware * download where a large block of data needs to be transferred to the * device. No formatting will be done on the data provided. * * A value of zero will be returned on success, a negative errno will * be returned in error cases. 
*/ int regmap_raw_write(struct regmap *map, unsigned int reg, const void *val, size_t val_len) { int ret; if (!regmap_can_raw_write(map)) return -EINVAL; if (val_len % map->format.val_bytes) return -EINVAL; map->lock(map->lock_arg); ret = _regmap_raw_write(map, reg, val, val_len, false); map->unlock(map->lock_arg); return ret; } EXPORT_SYMBOL_GPL(regmap_raw_write); static int regmap_noinc_readwrite(struct regmap *map, unsigned int reg, void *val, unsigned int val_len, bool write) { size_t val_bytes = map->format.val_bytes; size_t val_count = val_len / val_bytes; unsigned int lastval; u8 *u8p; u16 *u16p; u32 *u32p; int ret; int i; switch (val_bytes) { case 1: u8p = val; if (write) lastval = (unsigned int)u8p[val_count - 1]; break; case 2: u16p = val; if (write) lastval = (unsigned int)u16p[val_count - 1]; break; case 4: u32p = val; if (write) lastval = (unsigned int)u32p[val_count - 1]; break; default: return -EINVAL; } /* * Update the cache with the last value we write, the rest is just * gone down in the hardware FIFO. We can't cache FIFOs. This makes * sure a single read from the cache will work. */ if (write) { if (!map->cache_bypass && !map->defer_caching) { ret = regcache_write(map, reg, lastval); if (ret != 0) return ret; if (map->cache_only) { map->cache_dirty = true; return 0; } } ret = map->bus->reg_noinc_write(map->bus_context, reg, val, val_count); } else { ret = map->bus->reg_noinc_read(map->bus_context, reg, val, val_count); } if (!ret && regmap_should_log(map)) { dev_info(map->dev, "%x %s [", reg, write ? 
"<=" : "=>"); for (i = 0; i < val_count; i++) { switch (val_bytes) { case 1: pr_cont("%x", u8p[i]); break; case 2: pr_cont("%x", u16p[i]); break; case 4: pr_cont("%x", u32p[i]); break; default: break; } if (i == (val_count - 1)) pr_cont("]\n"); else pr_cont(","); } } return 0; } /** * regmap_noinc_write(): Write data to a register without incrementing the * register number * * @map: Register map to write to * @reg: Register to write to * @val: Pointer to data buffer * @val_len: Length of output buffer in bytes. * * The regmap API usually assumes that bulk bus write operations will write a * range of registers. Some devices have certain registers for which a write * operation can write to an internal FIFO. * * The target register must be volatile but registers after it can be * completely unrelated cacheable registers. * * This will attempt multiple writes as required to write val_len bytes. * * A value of zero will be returned on success, a negative errno will be * returned in error cases. */ int regmap_noinc_write(struct regmap *map, unsigned int reg, const void *val, size_t val_len) { size_t write_len; int ret; if (!map->write && !(map->bus && map->bus->reg_noinc_write)) return -EINVAL; if (val_len % map->format.val_bytes) return -EINVAL; if (!IS_ALIGNED(reg, map->reg_stride)) return -EINVAL; if (val_len == 0) return -EINVAL; map->lock(map->lock_arg); if (!regmap_volatile(map, reg) || !regmap_writeable_noinc(map, reg)) { ret = -EINVAL; goto out_unlock; } /* * Use the accelerated operation if we can. The val drops the const * typing in order to facilitate code reuse in regmap_noinc_readwrite(). 
*/ if (map->bus->reg_noinc_write) { ret = regmap_noinc_readwrite(map, reg, (void *)val, val_len, true); goto out_unlock; } while (val_len) { if (map->max_raw_write && map->max_raw_write < val_len) write_len = map->max_raw_write; else write_len = val_len; ret = _regmap_raw_write(map, reg, val, write_len, true); if (ret) goto out_unlock; val = ((u8 *)val) + write_len; val_len -= write_len; } out_unlock: map->unlock(map->lock_arg); return ret; } EXPORT_SYMBOL_GPL(regmap_noinc_write); /** * regmap_field_update_bits_base() - Perform a read/modify/write cycle a * register field. * * @field: Register field to write to * @mask: Bitmask to change * @val: Value to be written * @change: Boolean indicating if a write was done * @async: Boolean indicating asynchronously * @force: Boolean indicating use force update * * Perform a read/modify/write cycle on the register field with change, * async, force option. * * A value of zero will be returned on success, a negative errno will * be returned in error cases. */ int regmap_field_update_bits_base(struct regmap_field *field, unsigned int mask, unsigned int val, bool *change, bool async, bool force) { mask = (mask << field->shift) & field->mask; return regmap_update_bits_base(field->regmap, field->reg, mask, val << field->shift, change, async, force); } EXPORT_SYMBOL_GPL(regmap_field_update_bits_base); /** * regmap_field_test_bits() - Check if all specified bits are set in a * register field. * * @field: Register field to operate on * @bits: Bits to test * * Returns negative errno if the underlying regmap_field_read() fails, * 0 if at least one of the tested bits is not set and 1 if all tested * bits are set. 
*/
int regmap_field_test_bits(struct regmap_field *field, unsigned int bits)
{
	unsigned int val;
	int ret;

	ret = regmap_field_read(field, &val);
	if (ret)
		return ret;

	return (val & bits) == bits;
}
EXPORT_SYMBOL_GPL(regmap_field_test_bits);

/**
 * regmap_fields_update_bits_base() - Perform a read/modify/write cycle a
 *                                    register field with port ID
 *
 * @field: Register field to write to
 * @id: port ID
 * @mask: Bitmask to change
 * @val: Value to be written
 * @change: Boolean indicating if a write was done
 * @async: Boolean indicating asynchronously
 * @force: Boolean indicating use force update
 *
 * A value of zero will be returned on success, a negative errno will
 * be returned in error cases (-EINVAL when @id is not below the field's
 * id_size).
 */
int regmap_fields_update_bits_base(struct regmap_field *field, unsigned int id,
				   unsigned int mask, unsigned int val,
				   bool *change, bool async, bool force)
{
	if (id >= field->id_size)
		return -EINVAL;

	/* Move the field-relative mask into place and clip it to the field. */
	mask = (mask << field->shift) & field->mask;

	/* Each port's copy of the register is id_offset registers apart. */
	return regmap_update_bits_base(field->regmap,
				       field->reg + (field->id_offset * id),
				       mask, val << field->shift,
				       change, async, force);
}
EXPORT_SYMBOL_GPL(regmap_fields_update_bits_base);

/**
 * regmap_bulk_write() - Write multiple registers to the device
 *
 * @map: Register map to write to
 * @reg: First register to be written
 * @val: Block of data to be written, in native register size for device
 * @val_count: Number of registers to write
 *
 * This function is intended to be used for writing a large block of
 * data to the device either in single transfer or multiple transfer.
 *
 * A value of zero will be returned on success, a negative errno will
 * be returned in error cases.
 */
int regmap_bulk_write(struct regmap *map, unsigned int reg, const void *val,
		     size_t val_count)
{
	int ret = 0, i;
	size_t val_bytes = map->format.val_bytes;

	if (!IS_ALIGNED(reg, map->reg_stride))
		return -EINVAL;

	/*
	 * Some devices don't support bulk write, for them we have a series of
	 * single write operations.
	 */
	if (!map->write || !map->format.parse_inplace) {
		map->lock(map->lock_arg);
		for (i = 0; i < val_count; i++) {
			unsigned int ival;

			/* Only 1, 2 and 4 byte values can be unpacked here. */
			switch (val_bytes) {
			case 1:
				ival = *(u8 *)(val + (i * val_bytes));
				break;
			case 2:
				ival = *(u16 *)(val + (i * val_bytes));
				break;
			case 4:
				ival = *(u32 *)(val + (i * val_bytes));
				break;
			default:
				ret = -EINVAL;
				goto out;
			}

			ret = _regmap_write(map,
					    reg + regmap_get_offset(map, i),
					    ival);
			if (ret != 0)
				goto out;
		}
out:
		map->unlock(map->lock_arg);
	} else {
		void *wval;

		/* Work on a copy: parse_inplace() modifies the buffer. */
		wval = kmemdup_array(val, val_count, val_bytes, map->alloc_flags);
		if (!wval)
			return -ENOMEM;

		for (i = 0; i < val_count * val_bytes; i += val_bytes)
			map->format.parse_inplace(wval + i);

		ret = regmap_raw_write(map, reg, wval, val_bytes * val_count);

		kfree(wval);
	}

	if (!ret)
		trace_regmap_bulk_write(map, reg, val, val_bytes * val_count);

	return ret;
}
EXPORT_SYMBOL_GPL(regmap_bulk_write);

/*
 * _regmap_raw_multi_reg_write()
 *
 * the (register,newvalue) pairs in regs have not been formatted, but
 * they are all in the same page and have been changed to being page
 * relative. The page register has been written if that was necessary.
 *
 * Each pair is formatted as register, padding, value into one temporary
 * buffer that is sent with a single map->write() call. Returns -EINVAL
 * when the resulting buffer would be empty, -ENOMEM when it cannot be
 * allocated, otherwise the result of map->write().
 */
static int _regmap_raw_multi_reg_write(struct regmap *map,
				       const struct reg_sequence *regs,
				       size_t num_regs)
{
	int ret;
	void *buf;
	int i;
	u8 *u8;
	size_t val_bytes = map->format.val_bytes;
	size_t reg_bytes = map->format.reg_bytes;
	size_t pad_bytes = map->format.pad_bytes;
	size_t pair_size = reg_bytes + pad_bytes + val_bytes;
	size_t len = pair_size * num_regs;

	if (!len)
		return -EINVAL;

	buf = kzalloc(len, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* We have to linearise by hand. */

	u8 = buf;

	for (i = 0; i < num_regs; i++) {
		unsigned int reg = regs[i].reg;
		unsigned int val = regs[i].def;
		trace_regmap_hw_write_start(map, reg, 1);
		reg = regmap_reg_addr(map, reg);
		map->format.format_reg(u8, reg, map->reg_shift);
		u8 += reg_bytes + pad_bytes;
		map->format.format_val(u8, val, 0);
		u8 += val_bytes;
	}
	/* The write flag mask is ORed into the first byte of the buffer only. */
	u8 = buf;
	*u8 |= map->write_flag_mask;

	ret = map->write(map->bus_context, buf, len);

	kfree(buf);

	for (i = 0; i < num_regs; i++) {
		int reg = regs[i].reg;
		trace_regmap_hw_write_done(map, reg, 1);
	}
	return ret;
}

/* Return the index of the page of @range that contains @reg. */
static unsigned int _regmap_register_page(struct regmap *map,
					  unsigned int reg,
					  struct regmap_range_node *range)
{
	unsigned int win_page = (reg - range->range_min) / range->window_len;

	return win_page;
}

/*
 * Write @regs in order, flushing the pairs collected so far with
 * _regmap_raw_multi_reg_write() whenever the page changes or an entry
 * carries a delay. @regs must be writable: _regmap_select_page() rewrites
 * the reg member of the entry that starts a new page.
 */
static int _regmap_range_multi_paged_reg_write(struct regmap *map,
					       struct reg_sequence *regs,
					       size_t num_regs)
{
	int ret;
	int i, n;
	struct reg_sequence *base;
	unsigned int this_page = 0;
	unsigned int page_change = 0;
	/*
	 * the set of registers are not necessarily in order, but
	 * since the order of write must be preserved this algorithm
	 * chops the set each time the page changes. This also applies
	 * if there is a delay required at any point in the sequence.
	 */
	base = regs;
	for (i = 0, n = 0; i < num_regs; i++, n++) {
		unsigned int reg = regs[i].reg;
		struct regmap_range_node *range;

		range = _regmap_range_lookup(map, reg);
		if (range) {
			unsigned int win_page = _regmap_register_page(map, reg,
								      range);

			if (i == 0)
				this_page = win_page;
			if (win_page != this_page) {
				this_page = win_page;
				page_change = 1;
			}
		}

		/* If we have both a page change and a delay make sure to
		 * write the regs and apply the delay before we change the
		 * page.
		 */

		if (page_change || regs[i].delay_us) {

				/* For situations where the first write requires
				 * a delay we need to make sure we don't call
				 * raw_multi_reg_write with n=0
				 * This can't occur with page breaks as we
				 * never write on the first iteration
				 */
				if (regs[i].delay_us && i == 0)
					n = 1;

				ret = _regmap_raw_multi_reg_write(map, base, n);
				if (ret != 0)
					return ret;

				if (regs[i].delay_us) {
					if (map->can_sleep)
						fsleep(regs[i].delay_us);
					else
						udelay(regs[i].delay_us);
				}

				/* Start a new batch after the flushed entries. */
				base += n;
				n = 0;

				if (page_change) {
					ret = _regmap_select_page(map,
								  &base[n].reg,
								  range, 1);
					if (ret != 0)
						return ret;

					page_change = 0;
				}

		}

	}
	/* Flush whatever is left in the last batch. */
	if (n > 0)
		return _regmap_raw_multi_reg_write(map, base, n);
	return 0;
}

/*
 * Write the (register, value) pairs in @regs; the caller holds the map
 * lock. Without can_multi_write every pair goes through _regmap_write()
 * and its delay is honoured. Otherwise the pairs are validated, written to
 * the cache unless it is bypassed, and sent either as one multi-register
 * transfer or, when any entry lies in a paged range or has a delay, through
 * _regmap_range_multi_paged_reg_write() on a private copy of @regs.
 */
static int _regmap_multi_reg_write(struct regmap *map,
				   const struct reg_sequence *regs,
				   size_t num_regs)
{
	int i;
	int ret;

	if (!map->can_multi_write) {
		for (i = 0; i < num_regs; i++) {
			ret = _regmap_write(map, regs[i].reg, regs[i].def);
			if (ret != 0)
				return ret;

			if (regs[i].delay_us) {
				if (map->can_sleep)
					fsleep(regs[i].delay_us);
				else
					udelay(regs[i].delay_us);
			}
		}
		return 0;
	}

	if (!map->format.parse_inplace)
		return -EINVAL;

	/* NOTE(review): alignment is only checked when writeable_reg is set. */
	if (map->writeable_reg)
		for (i = 0; i < num_regs; i++) {
			int reg = regs[i].reg;
			if (!map->writeable_reg(map->dev, reg))
				return -EINVAL;
			if (!IS_ALIGNED(reg, map->reg_stride))
				return -EINVAL;
		}

	if (!map->cache_bypass) {
		for (i = 0; i < num_regs; i++) {
			unsigned int val = regs[i].def;
			unsigned int reg = regs[i].reg;
			ret = regcache_write(map, reg, val);
			if (ret) {
				dev_err(map->dev,
				"Error in caching of register: %x ret: %d\n",
								reg, ret);
				return ret;
			}
		}
		if (map->cache_only) {
			map->cache_dirty = true;
			return 0;
		}
	}

	WARN_ON(!map->bus);

	for (i = 0; i < num_regs; i++) {
		unsigned int reg = regs[i].reg;
		struct regmap_range_node *range;

		/* Coalesce all the writes between a page break or a delay
		 * in a sequence
		 */
		range = _regmap_range_lookup(map, reg);
		if (range || regs[i].delay_us) {
			/* The paged writer modifies entries, so give it a copy. */
			size_t len = sizeof(struct reg_sequence)*num_regs;
			struct reg_sequence *base = kmemdup(regs, len,
							   GFP_KERNEL);
			if (!base)
				return -ENOMEM;
			ret = _regmap_range_multi_paged_reg_write(map, base,
								  num_regs);
			kfree(base);

			return ret;
		}
	}
	return _regmap_raw_multi_reg_write(map, regs, num_regs);
}

/**
 * regmap_multi_reg_write() - Write multiple registers to the device
 *
 * @map: Register map to write to
 * @regs: Array of structures containing register,value to be written
 * @num_regs: Number of registers to write
 *
 * Write multiple registers to the device where the set of register, value
 * pairs are supplied in any order, possibly not all in a single range.
 *
 * The 'normal' block write mode will ultimately send data on the
 * target bus as R,V1,V2,V3,..,Vn where successively higher registers are
 * addressed. However, this alternative block multi write mode will send
 * the data as R1,V1,R2,V2,..,Rn,Vn on the target bus. The target device
 * must of course support the mode.
 *
 * A value of zero will be returned on success, a negative errno will be
 * returned in error cases.
 */
int regmap_multi_reg_write(struct regmap *map, const struct reg_sequence *regs,
			   int num_regs)
{
	int ret;

	map->lock(map->lock_arg);

	ret = _regmap_multi_reg_write(map, regs, num_regs);

	map->unlock(map->lock_arg);

	return ret;
}
EXPORT_SYMBOL_GPL(regmap_multi_reg_write);

/**
 * regmap_multi_reg_write_bypassed() - Write multiple registers to the
 *                                     device but not the cache
 *
 * @map: Register map to write to
 * @regs: Array of structures containing register,value to be written
 * @num_regs: Number of registers to write
 *
 * Write multiple registers to the device but not the cache where the set
 * of register are supplied in any order.
 *
 * This function is intended to be used for writing a large block of data
 * atomically to the device in single transfer for those I2C client devices
 * that implement this alternative block write mode.
 *
 * A value of zero will be returned on success, a negative errno will
 * be returned in error cases.
*/ int regmap_multi_reg_write_bypassed(struct regmap *map, const struct reg_sequence *regs, int num_regs) { int ret; bool bypass; map->lock(map->lock_arg); bypass = map->cache_bypass; map->cache_bypass = true; ret = _regmap_multi_reg_write(map, regs, num_regs); map->cache_bypass = bypass; map->unlock(map->lock_arg); return ret; } EXPORT_SYMBOL_GPL(regmap_multi_reg_write_bypassed); /** * regmap_raw_write_async() - Write raw values to one or more registers * asynchronously * * @map: Register map to write to * @reg: Initial register to write to * @val: Block of data to be written, laid out for direct transmission to the * device. Must be valid until regmap_async_complete() is called. * @val_len: Length of data pointed to by val. * * This function is intended to be used for things like firmware * download where a large block of data needs to be transferred to the * device. No formatting will be done on the data provided. * * If supported by the underlying bus the write will be scheduled * asynchronously, helping maximise I/O speed on higher speed buses * like SPI. regmap_async_complete() can be called to ensure that all * asynchrnous writes have been completed. * * A value of zero will be returned on success, a negative errno will * be returned in error cases. */ int regmap_raw_write_async(struct regmap *map, unsigned int reg, const void *val, size_t val_len) { int ret; if (val_len % map->format.val_bytes) return -EINVAL; if (!IS_ALIGNED(reg, map->reg_stride)) return -EINVAL; map->lock(map->lock_arg); map->async = true; ret = _regmap_raw_write(map, reg, val, val_len, false); map->async = false; map->unlock(map->lock_arg); return ret; } EXPORT_SYMBOL_GPL(regmap_raw_write_async); static int _regmap_raw_read(struct regmap *map, unsigned int reg, void *val, unsigned int val_len, bool noinc) { struct regmap_range_node *range; int ret; if (!map->read) return -EINVAL; range = _regmap_range_lookup(map, reg); if (range) { ret = _regmap_select_page(map, ®, range, noinc ? 
1 : val_len / map->format.val_bytes); if (ret != 0) return ret; } reg = regmap_reg_addr(map, reg); map->format.format_reg(map->work_buf, reg, map->reg_shift); regmap_set_work_buf_flag_mask(map, map->format.reg_bytes, map->read_flag_mask); trace_regmap_hw_read_start(map, reg, val_len / map->format.val_bytes); ret = map->read(map->bus_context, map->work_buf, map->format.reg_bytes + map->format.pad_bytes, val, val_len); trace_regmap_hw_read_done(map, reg, val_len / map->format.val_bytes); return ret; } static int _regmap_bus_reg_read(void *context, unsigned int reg, unsigned int *val) { struct regmap *map = context; struct regmap_range_node *range; int ret; range = _regmap_range_lookup(map, reg); if (range) { ret = _regmap_select_page(map, ®, range, 1); if (ret != 0) return ret; } reg = regmap_reg_addr(map, reg); return map->bus->reg_read(map->bus_context, reg, val); } static int _regmap_bus_read(void *context, unsigned int reg, unsigned int *val) { int ret; struct regmap *map = context; void *work_val = map->work_buf + map->format.reg_bytes + map->format.pad_bytes; if (!map->format.parse_val) return -EINVAL; ret = _regmap_raw_read(map, reg, work_val, map->format.val_bytes, false); if (ret == 0) *val = map->format.parse_val(work_val); return ret; } static int _regmap_read(struct regmap *map, unsigned int reg, unsigned int *val) { int ret; void *context = _regmap_map_get_context(map); if (!map->cache_bypass) { ret = regcache_read(map, reg, val); if (ret == 0) return 0; } if (map->cache_only) return -EBUSY; if (!regmap_readable(map, reg)) return -EIO; ret = map->reg_read(context, reg, val); if (ret == 0) { if (regmap_should_log(map)) dev_info(map->dev, "%x => %x\n", reg, *val); trace_regmap_reg_read(map, reg, *val); if (!map->cache_bypass) regcache_write(map, reg, *val); } return ret; } /** * regmap_read() - Read a value from a single register * * @map: Register map to read from * @reg: Register to be read from * @val: Pointer to store read value * * A value of zero 
will be returned on success, a negative errno will * be returned in error cases. */ int regmap_read(struct regmap *map, unsigned int reg, unsigned int *val) { int ret; if (!IS_ALIGNED(reg, map->reg_stride)) return -EINVAL; map->lock(map->lock_arg); ret = _regmap_read(map, reg, val); map->unlock(map->lock_arg); return ret; } EXPORT_SYMBOL_GPL(regmap_read); /** * regmap_read_bypassed() - Read a value from a single register direct * from the device, bypassing the cache * * @map: Register map to read from * @reg: Register to be read from * @val: Pointer to store read value * * A value of zero will be returned on success, a negative errno will * be returned in error cases. */ int regmap_read_bypassed(struct regmap *map, unsigned int reg, unsigned int *val) { int ret; bool bypass, cache_only; if (!IS_ALIGNED(reg, map->reg_stride)) return -EINVAL; map->lock(map->lock_arg); bypass = map->cache_bypass; cache_only = map->cache_only; map->cache_bypass = true; map->cache_only = false; ret = _regmap_read(map, reg, val); map->cache_bypass = bypass; map->cache_only = cache_only; map->unlock(map->lock_arg); return ret; } EXPORT_SYMBOL_GPL(regmap_read_bypassed); /** * regmap_raw_read() - Read raw data from the device * * @map: Register map to read from * @reg: First register to be read from * @val: Pointer to store read value * @val_len: Size of data to read * * A value of zero will be returned on success, a negative errno will * be returned in error cases. 
*/
int regmap_raw_read(struct regmap *map, unsigned int reg, void *val,
		    size_t val_len)
{
	size_t val_bytes = map->format.val_bytes;
	size_t val_count = val_len / val_bytes;
	unsigned int v;
	int ret, i;

	/* The length must be a non-zero whole number of register values. */
	if (val_len % map->format.val_bytes)
		return -EINVAL;
	if (!IS_ALIGNED(reg, map->reg_stride))
		return -EINVAL;
	if (val_count == 0)
		return -EINVAL;

	map->lock(map->lock_arg);

	/*
	 * Go to the hardware when the whole range is volatile, the cache is
	 * bypassed, or there is no cache at all.
	 */
	if (regmap_volatile_range(map, reg, val_count) || map->cache_bypass ||
	    map->cache_type == REGCACHE_NONE) {
		size_t chunk_count, chunk_bytes;
		size_t chunk_regs = val_count;

		/* A cache-only map cannot satisfy a hardware read. */
		if (!map->cache_bypass && map->cache_only) {
			ret = -EBUSY;
			goto out;
		}

		if (!map->read) {
			ret = -ENOTSUPP;
			goto out;
		}

		/*
		 * Pick the number of registers per transfer: one at a time
		 * for single-read maps, otherwise as many as fit in
		 * max_raw_read when the request exceeds that limit.
		 */
		if (map->use_single_read)
			chunk_regs = 1;
		else if (map->max_raw_read && val_len > map->max_raw_read)
			chunk_regs = map->max_raw_read / val_bytes;

		chunk_count = val_count / chunk_regs;
		chunk_bytes = chunk_regs * val_bytes;

		/* Read bytes that fit into whole chunks */
		for (i = 0; i < chunk_count; i++) {
			ret = _regmap_raw_read(map, reg, val, chunk_bytes, false);
			if (ret != 0)
				goto out;

			/* Advance the register, the buffer and the remainder. */
			reg += regmap_get_offset(map, chunk_regs);
			val += chunk_bytes;
			val_len -= chunk_bytes;
		}

		/* Read remaining bytes */
		if (val_len) {
			ret = _regmap_raw_read(map, reg, val, val_len, false);
			if (ret != 0)
				goto out;
		}
	} else {
		/* Otherwise go word by word for the cache; should be low
		 * cost as we expect to hit the cache.
		 */
		for (i = 0; i < val_count; i++) {
			ret = _regmap_read(map, reg + regmap_get_offset(map, i),
					   &v);
			if (ret != 0)
				goto out;

			/* Store the value in the device's raw format. */
			map->format.format_val(val + (i * val_bytes), v, 0);
		}
	}

 out:
	map->unlock(map->lock_arg);

	return ret;
}
EXPORT_SYMBOL_GPL(regmap_raw_read);

/**
 * regmap_noinc_read(): Read data from a register without incrementing the
 *			register number
 *
 * @map: Register map to read from
 * @reg: Register to read from
 * @val: Pointer to data buffer
 * @val_len: Length of output buffer in bytes.
 *
 * The regmap API usually assumes that bulk read operations will read a
 * range of registers.
Some devices have certain registers for which a read * operation read will read from an internal FIFO. * * The target register must be volatile but registers after it can be * completely unrelated cacheable registers. * * This will attempt multiple reads as required to read val_len bytes. * * A value of zero will be returned on success, a negative errno will be * returned in error cases. */ int regmap_noinc_read(struct regmap *map, unsigned int reg, void *val, size_t val_len) { size_t read_len; int ret; if (!map->read) return -ENOTSUPP; if (val_len % map->format.val_bytes) return -EINVAL; if (!IS_ALIGNED(reg, map->reg_stride)) return -EINVAL; if (val_len == 0) return -EINVAL; map->lock(map->lock_arg); if (!regmap_volatile(map, reg) || !regmap_readable_noinc(map, reg)) { ret = -EINVAL; goto out_unlock; } /* * We have not defined the FIFO semantics for cache, as the * cache is just one value deep. Should we return the last * written value? Just avoid this by always reading the FIFO * even when using cache. Cache only will not work. */ if (!map->cache_bypass && map->cache_only) { ret = -EBUSY; goto out_unlock; } /* Use the accelerated operation if we can */ if (map->bus->reg_noinc_read) { ret = regmap_noinc_readwrite(map, reg, val, val_len, false); goto out_unlock; } while (val_len) { if (map->max_raw_read && map->max_raw_read < val_len) read_len = map->max_raw_read; else read_len = val_len; ret = _regmap_raw_read(map, reg, val, read_len, true); if (ret) goto out_unlock; val = ((u8 *)val) + read_len; val_len -= read_len; } out_unlock: map->unlock(map->lock_arg); return ret; } EXPORT_SYMBOL_GPL(regmap_noinc_read); /** * regmap_field_read(): Read a value to a single register field * * @field: Register field to read from * @val: Pointer to store read value * * A value of zero will be returned on success, a negative errno will * be returned in error cases. 
*/ int regmap_field_read(struct regmap_field *field, unsigned int *val) { int ret; unsigned int reg_val; ret = regmap_read(field->regmap, field->reg, ®_val); if (ret != 0) return ret; reg_val &= field->mask; reg_val >>= field->shift; *val = reg_val; return ret; } EXPORT_SYMBOL_GPL(regmap_field_read); /** * regmap_fields_read() - Read a value to a single register field with port ID * * @field: Register field to read from * @id: port ID * @val: Pointer to store read value * * A value of zero will be returned on success, a negative errno will * be returned in error cases. */ int regmap_fields_read(struct regmap_field *field, unsigned int id, unsigned int *val) { int ret; unsigned int reg_val; if (id >= field->id_size) return -EINVAL; ret = regmap_read(field->regmap, field->reg + (field->id_offset * id), ®_val); if (ret != 0) return ret; reg_val &= field->mask; reg_val >>= field->shift; *val = reg_val; return ret; } EXPORT_SYMBOL_GPL(regmap_fields_read); static int _regmap_bulk_read(struct regmap *map, unsigned int reg, const unsigned int *regs, void *val, size_t val_count) { u32 *u32 = val; u16 *u16 = val; u8 *u8 = val; int ret, i; map->lock(map->lock_arg); for (i = 0; i < val_count; i++) { unsigned int ival; if (regs) { if (!IS_ALIGNED(regs[i], map->reg_stride)) { ret = -EINVAL; goto out; } ret = _regmap_read(map, regs[i], &ival); } else { ret = _regmap_read(map, reg + regmap_get_offset(map, i), &ival); } if (ret != 0) goto out; switch (map->format.val_bytes) { case 4: u32[i] = ival; break; case 2: u16[i] = ival; break; case 1: u8[i] = ival; break; default: ret = -EINVAL; goto out; } } out: map->unlock(map->lock_arg); return ret; } /** * regmap_bulk_read() - Read multiple sequential registers from the device * * @map: Register map to read from * @reg: First register to be read from * @val: Pointer to store read value, in native register size for device * @val_count: Number of registers to read * * A value of zero will be returned on success, a negative errno will * 
be returned in error cases. */ int regmap_bulk_read(struct regmap *map, unsigned int reg, void *val, size_t val_count) { int ret, i; size_t val_bytes = map->format.val_bytes; bool vol = regmap_volatile_range(map, reg, val_count); if (!IS_ALIGNED(reg, map->reg_stride)) return -EINVAL; if (val_count == 0) return -EINVAL; if (map->read && map->format.parse_inplace && (vol || map->cache_type == REGCACHE_NONE)) { ret = regmap_raw_read(map, reg, val, val_bytes * val_count); if (ret != 0) return ret; for (i = 0; i < val_count * val_bytes; i += val_bytes) map->format.parse_inplace(val + i); } else { ret = _regmap_bulk_read(map, reg, NULL, val, val_count); } if (!ret) trace_regmap_bulk_read(map, reg, val, val_bytes * val_count); return ret; } EXPORT_SYMBOL_GPL(regmap_bulk_read); /** * regmap_multi_reg_read() - Read multiple non-sequential registers from the device * * @map: Register map to read from * @regs: Array of registers to read from * @val: Pointer to store read value, in native register size for device * @val_count: Number of registers to read * * A value of zero will be returned on success, a negative errno will * be returned in error cases. 
*/ int regmap_multi_reg_read(struct regmap *map, const unsigned int *regs, void *val, size_t val_count) { if (val_count == 0) return -EINVAL; return _regmap_bulk_read(map, 0, regs, val, val_count); } EXPORT_SYMBOL_GPL(regmap_multi_reg_read); static int _regmap_update_bits(struct regmap *map, unsigned int reg, unsigned int mask, unsigned int val, bool *change, bool force_write) { int ret; unsigned int tmp, orig; if (change) *change = false; if (regmap_volatile(map, reg) && map->reg_update_bits) { reg = regmap_reg_addr(map, reg); ret = map->reg_update_bits(map->bus_context, reg, mask, val); if (ret == 0 && change) *change = true; } else { ret = _regmap_read(map, reg, &orig); if (ret != 0) return ret; tmp = orig & ~mask; tmp |= val & mask; if (force_write || (tmp != orig) || map->force_write_field) { ret = _regmap_write(map, reg, tmp); if (ret == 0 && change) *change = true; } } return ret; } /** * regmap_update_bits_base() - Perform a read/modify/write cycle on a register * * @map: Register map to update * @reg: Register to update * @mask: Bitmask to change * @val: New value for bitmask * @change: Boolean indicating if a write was done * @async: Boolean indicating asynchronously * @force: Boolean indicating use force update * * Perform a read/modify/write cycle on a register map with change, async, force * options. * * If async is true: * * With most buses the read must be done synchronously so this is most useful * for devices with a cache which do not need to interact with the hardware to * determine the current register value. * * Returns zero for success, a negative number on error. 
*/ int regmap_update_bits_base(struct regmap *map, unsigned int reg, unsigned int mask, unsigned int val, bool *change, bool async, bool force) { int ret; map->lock(map->lock_arg); map->async = async; ret = _regmap_update_bits(map, reg, mask, val, change, force); map->async = false; map->unlock(map->lock_arg); return ret; } EXPORT_SYMBOL_GPL(regmap_update_bits_base); /** * regmap_test_bits() - Check if all specified bits are set in a register. * * @map: Register map to operate on * @reg: Register to read from * @bits: Bits to test * * Returns 0 if at least one of the tested bits is not set, 1 if all tested * bits are set and a negative error number if the underlying regmap_read() * fails. */ int regmap_test_bits(struct regmap *map, unsigned int reg, unsigned int bits) { unsigned int val; int ret; ret = regmap_read(map, reg, &val); if (ret) return ret; return (val & bits) == bits; } EXPORT_SYMBOL_GPL(regmap_test_bits); void regmap_async_complete_cb(struct regmap_async *async, int ret) { struct regmap *map = async->map; bool wake; trace_regmap_async_io_complete(map); spin_lock(&map->async_lock); list_move(&async->list, &map->async_free); wake = list_empty(&map->async_list); if (ret != 0) map->async_ret = ret; spin_unlock(&map->async_lock); if (wake) wake_up(&map->async_waitq); } EXPORT_SYMBOL_GPL(regmap_async_complete_cb); static int regmap_async_is_done(struct regmap *map) { unsigned long flags; int ret; spin_lock_irqsave(&map->async_lock, flags); ret = list_empty(&map->async_list); spin_unlock_irqrestore(&map->async_lock, flags); return ret; } /** * regmap_async_complete - Ensure all asynchronous I/O has completed. * * @map: Map to operate on. * * Blocks until any pending asynchronous I/O has completed. Returns * an error code for any failed I/O operations. 
*/
int regmap_async_complete(struct regmap *map)
{
	unsigned long irqflags;
	int result;

	/* Nothing to do with no async support */
	if (!(map->bus && map->bus->async_write))
		return 0;

	trace_regmap_async_complete_start(map);

	wait_event(map->async_waitq, regmap_async_is_done(map));

	/* Collect the recorded error (if any) and reset it for next time. */
	spin_lock_irqsave(&map->async_lock, irqflags);
	result = map->async_ret;
	map->async_ret = 0;
	spin_unlock_irqrestore(&map->async_lock, irqflags);

	trace_regmap_async_complete_done(map);

	return result;
}
EXPORT_SYMBOL_GPL(regmap_async_complete);

/**
 * regmap_register_patch - Register and apply register updates to be applied
 *                         on device initialisation
 *
 * @map: Register map to apply updates to.
 * @regs: Values to update.
 * @num_regs: Number of entries in regs.
 *
 * Register a set of register updates to be applied to the device
 * whenever the device registers are synchronised with the cache and
 * apply them immediately.  Typically this is used to apply
 * corrections to be applied to the device defaults on startup, such
 * as the updates some vendors provide to undocumented registers.
 *
 * The caller must ensure that this function cannot be called
 * concurrently with either itself or regcache_sync().
*/ int regmap_register_patch(struct regmap *map, const struct reg_sequence *regs, int num_regs) { struct reg_sequence *p; int ret; bool bypass; if (WARN_ONCE(num_regs <= 0, "invalid registers number (%d)\n", num_regs)) return 0; p = krealloc(map->patch, sizeof(struct reg_sequence) * (map->patch_regs + num_regs), GFP_KERNEL); if (p) { memcpy(p + map->patch_regs, regs, num_regs * sizeof(*regs)); map->patch = p; map->patch_regs += num_regs; } else { return -ENOMEM; } map->lock(map->lock_arg); bypass = map->cache_bypass; map->cache_bypass = true; map->async = true; ret = _regmap_multi_reg_write(map, regs, num_regs); map->async = false; map->cache_bypass = bypass; map->unlock(map->lock_arg); regmap_async_complete(map); return ret; } EXPORT_SYMBOL_GPL(regmap_register_patch); /** * regmap_get_val_bytes() - Report the size of a register value * * @map: Register map to operate on. * * Report the size of a register value, mainly intended to for use by * generic infrastructure built on top of regmap. */ int regmap_get_val_bytes(struct regmap *map) { if (map->format.format_write) return -EINVAL; return map->format.val_bytes; } EXPORT_SYMBOL_GPL(regmap_get_val_bytes); /** * regmap_get_max_register() - Report the max register value * * @map: Register map to operate on. * * Report the max register value, mainly intended to for use by * generic infrastructure built on top of regmap. */ int regmap_get_max_register(struct regmap *map) { return map->max_register_is_set ? map->max_register : -EINVAL; } EXPORT_SYMBOL_GPL(regmap_get_max_register); /** * regmap_get_reg_stride() - Report the register address stride * * @map: Register map to operate on. * * Report the register address stride, mainly intended to for use by * generic infrastructure built on top of regmap. */ int regmap_get_reg_stride(struct regmap *map) { return map->reg_stride; } EXPORT_SYMBOL_GPL(regmap_get_reg_stride); /** * regmap_might_sleep() - Returns whether a regmap access might sleep. 
* * @map: Register map to operate on. * * Returns true if an access to the register might sleep, else false. */ bool regmap_might_sleep(struct regmap *map) { return map->can_sleep; } EXPORT_SYMBOL_GPL(regmap_might_sleep); int regmap_parse_val(struct regmap *map, const void *buf, unsigned int *val) { if (!map->format.parse_val) return -EINVAL; *val = map->format.parse_val(buf); return 0; } EXPORT_SYMBOL_GPL(regmap_parse_val); static int __init regmap_initcall(void) { regmap_debugfs_initcall(); return 0; } postcore_initcall(regmap_initcall); |
| 153 2 152 152 63 65 150 150 152 153 129 36 149 4 153 153 153 158 9 3 6 110 21 20 157 10 135 21 22 156 157 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 | // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/skbuff.h> #include <linux/sctp.h> #include <net/gso.h> #include <net/gro.h> /** * skb_eth_gso_segment - segmentation handler for ethernet protocols. * @skb: buffer to segment * @features: features for the output path (see dev->features) * @type: Ethernet Protocol ID */ struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, netdev_features_t features, __be16 type) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; rcu_read_lock(); list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; } } rcu_read_unlock(); return segs; } EXPORT_SYMBOL(skb_eth_gso_segment); /** * skb_mac_gso_segment - mac layer segmentation handler. 
* @skb: buffer to segment
 * @features: features for the output path (see dev->features)
 *
 * Returns whatever the matching offload handler's gso_segment callback
 * returns, ERR_PTR(-EPROTONOSUPPORT) when no registered handler matches
 * the protocol, or ERR_PTR(-EINVAL) when skb_network_protocol() reports
 * no protocol.
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
	int vlan_depth = skb->mac_len;
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	/* Step over vlan_depth bytes before calling the handler. */
	__skb_pull(skb, vlan_depth);

	/* The first handler with a matching type and a gso_segment op wins. */
	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) {
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			segs = ptype->callbacks.gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Move skb->data back to the MAC header. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);

/* openvswitch calls this on rx path, so we need a different check.
 *
 * On the TX path a check is needed unless ip_summed is CHECKSUM_PARTIAL
 * or CHECKSUM_UNNECESSARY; otherwise only when it is CHECKSUM_NONE.
 */
static bool skb_needs_check(const struct sk_buff *skb, bool tx_path)
{
	if (tx_path)
		return skb->ip_summed != CHECKSUM_PARTIAL &&
		       skb->ip_summed != CHECKSUM_UNNECESSARY;

	return skb->ip_summed == CHECKSUM_NONE;
}

/**
 *	__skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *	@tx_path: whether it is called in TX path
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
				  netdev_features_t features, bool tx_path)
{
	struct sk_buff *segs;

	if (unlikely(skb_needs_check(skb, tx_path))) {
		int err;

		/* We're going to init ->check field in TCP or UDP header */
		err = skb_cow_head(skb, 0);
		if (err < 0)
			return ERR_PTR(err);
	}

	/* Only report GSO partial support if it will enable us to
	 * support segmentation on this frame without needing additional
	 * work.
	 */
	if (features & NETIF_F_GSO_PARTIAL) {
		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
		struct net_device *dev = skb->dev;

		partial_features |= dev->features & dev->gso_partial_features;
		if (!skb_gso_ok(skb, features | partial_features))
			features &= ~NETIF_F_GSO_PARTIAL;
	}

	/* Compile-time check that the GSO control block fits in skb->cb. */
	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));

	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
	SKB_GSO_CB(skb)->encap_level = 0;

	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	segs = skb_mac_gso_segment(skb, features);

	/* Warn when segments were produced but the skb still needs a check. */
	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
		skb_warn_bad_offload(skb);

	return segs;
}
EXPORT_SYMBOL(__skb_gso_segment);

/**
 * skb_gso_transport_seglen - Return length of individual segments of a gso packet
 *
 * @skb: GSO skb
 *
 * skb_gso_transport_seglen is used to determine the real size of the
 * individual segments, including Layer4 headers (TCP/UDP).
 *
 * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
 */
static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	unsigned int thlen = 0;

	if (skb->encapsulation) {
		/* Distance from the outer to the inner transport header. */
		thlen = skb_inner_transport_header(skb) -
			skb_transport_header(skb);

		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
			thlen += inner_tcp_hdrlen(skb);
	} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
		thlen = tcp_hdrlen(skb);
	} else if (unlikely(skb_is_gso_sctp(skb))) {
		thlen = sizeof(struct sctphdr);
	} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
		thlen = sizeof(struct udphdr);
	}
	/* UFO sets gso_size to the size of the fragmentation
	 * payload, i.e. the size of the L4 (UDP) header is already
	 * accounted for.
	 */
	return thlen + shinfo->gso_size;
}

/**
 * skb_gso_network_seglen - Return length of individual segments of a gso packet
 *
 * @skb: GSO skb
 *
 * skb_gso_network_seglen is used to determine the real size of the
 * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
 *
 * The MAC/L2 header is not accounted for.
 */
static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
{
	/* Bytes between the network header and the transport header. */
	unsigned int hdr_len = skb_transport_header(skb) -
			       skb_network_header(skb);

	return hdr_len + skb_gso_transport_seglen(skb);
}

/**
 * skb_gso_mac_seglen - Return length of individual segments of a gso packet
 *
 * @skb: GSO skb
 *
 * skb_gso_mac_seglen is used to determine the real size of the
 * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
 * headers (TCP/UDP).
 */
static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
{
	/* Bytes between the MAC header and the transport header. */
	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

	return hdr_len + skb_gso_transport_seglen(skb);
}

/**
 * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
 *
 * There are a couple of instances where we have a GSO skb, and we
 * want to determine what size it would be after it is segmented.
 *
 * We might want to check:
 * -    L3+L4+payload size (e.g. IP forwarding)
 * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
 *
 * This is a helper to do that correctly considering GSO_BY_FRAGS.
 *
 * @skb: GSO skb
 *
 * @seg_len: The segmented length (from skb_gso_*_seglen). In the
 *           GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
 *
 * @max_len: The maximum permissible length.
 *
 * Returns true if the segmented length <= max length.
*/ static inline bool skb_gso_size_check(const struct sk_buff *skb, unsigned int seg_len, unsigned int max_len) { const struct skb_shared_info *shinfo = skb_shinfo(skb); const struct sk_buff *iter; if (shinfo->gso_size != GSO_BY_FRAGS) return seg_len <= max_len; /* Undo this so we can re-use header sizes */ seg_len -= GSO_BY_FRAGS; skb_walk_frags(skb, iter) { if (seg_len + skb_headlen(iter) > max_len) return false; } return true; } /** * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? * * @skb: GSO skb * @mtu: MTU to validate against * * skb_gso_validate_network_len validates if a given skb will fit a * wanted MTU once split. It considers L3 headers, L4 headers, and the * payload. */ bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) { return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); } EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); /** * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? * * @skb: GSO skb * @len: length to validate against * * skb_gso_validate_mac_len validates if a given skb will fit a wanted * length once split, including L2, L3 and L4 headers and the payload. */ bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) { return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); } EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); |
| 35 35 31 31 31 30 30 4 26 17 9 7 7 6 7 7 7 6 5 7 6 8 7 7 1 1 5 1 1 3 30 30 30 30 30 30 30 30 29 7 1 34 1 30 7 30 30 7 7 1 6 6 6 1 5 5 35 35 35 33 2 38 39 35 39 36 36 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 
485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 | // SPDX-License-Identifier: 0BSD /* * .xz Stream decoder * * Author: Lasse Collin <lasse.collin@tukaani.org> */ #include "xz_private.h" #include "xz_stream.h" /* Hash used to validate the Index field */ struct xz_dec_hash { vli_type unpadded; vli_type uncompressed; uint32_t crc32; }; struct xz_dec { /* Position in dec_main() */ enum { SEQ_STREAM_HEADER, SEQ_BLOCK_START, SEQ_BLOCK_HEADER, SEQ_BLOCK_UNCOMPRESS, SEQ_BLOCK_PADDING, SEQ_BLOCK_CHECK, SEQ_INDEX, SEQ_INDEX_PADDING, SEQ_INDEX_CRC32, SEQ_STREAM_FOOTER } sequence; /* Position in variable-length integers and Check fields */ 
uint32_t pos; /* Variable-length integer decoded by dec_vli() */ vli_type vli; /* Saved in_pos and out_pos */ size_t in_start; size_t out_start; /* CRC32 value in Block or Index */ uint32_t crc32; /* Type of the integrity check calculated from uncompressed data */ enum xz_check check_type; /* Operation mode */ enum xz_mode mode; /* * True if the next call to xz_dec_run() is allowed to return * XZ_BUF_ERROR. */ bool allow_buf_error; /* Information stored in Block Header */ struct { /* * Value stored in the Compressed Size field, or * VLI_UNKNOWN if Compressed Size is not present. */ vli_type compressed; /* * Value stored in the Uncompressed Size field, or * VLI_UNKNOWN if Uncompressed Size is not present. */ vli_type uncompressed; /* Size of the Block Header field */ uint32_t size; } block_header; /* Information collected when decoding Blocks */ struct { /* Observed compressed size of the current Block */ vli_type compressed; /* Observed uncompressed size of the current Block */ vli_type uncompressed; /* Number of Blocks decoded so far */ vli_type count; /* * Hash calculated from the Block sizes. This is used to * validate the Index field. */ struct xz_dec_hash hash; } block; /* Variables needed when verifying the Index field */ struct { /* Position in dec_index() */ enum { SEQ_INDEX_COUNT, SEQ_INDEX_UNPADDED, SEQ_INDEX_UNCOMPRESSED } sequence; /* Size of the Index in bytes */ vli_type size; /* Number of Records (matches block.count in valid files) */ vli_type count; /* * Hash calculated from the Records (matches block.hash in * valid files). */ struct xz_dec_hash hash; } index; /* * Temporary buffer needed to hold Stream Header, Block Header, * and Stream Footer. The Block Header is the biggest (1 KiB) * so we reserve space according to that. buf[] has to be aligned * to a multiple of four bytes; the size_t variables before it * should guarantee this. 
*/ struct { size_t pos; size_t size; uint8_t buf[1024]; } temp; struct xz_dec_lzma2 *lzma2; #ifdef XZ_DEC_BCJ struct xz_dec_bcj *bcj; bool bcj_active; #endif }; #ifdef XZ_DEC_ANY_CHECK /* Sizes of the Check field with different Check IDs */ static const uint8_t check_sizes[16] = { 0, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64, 64 }; #endif /* * Fill s->temp by copying data starting from b->in[b->in_pos]. Caller * must have set s->temp.pos to indicate how much data we are supposed * to copy into s->temp.buf. Return true once s->temp.pos has reached * s->temp.size. */ static bool fill_temp(struct xz_dec *s, struct xz_buf *b) { size_t copy_size = min_t(size_t, b->in_size - b->in_pos, s->temp.size - s->temp.pos); memcpy(s->temp.buf + s->temp.pos, b->in + b->in_pos, copy_size); b->in_pos += copy_size; s->temp.pos += copy_size; if (s->temp.pos == s->temp.size) { s->temp.pos = 0; return true; } return false; } /* Decode a variable-length integer (little-endian base-128 encoding) */ static enum xz_ret dec_vli(struct xz_dec *s, const uint8_t *in, size_t *in_pos, size_t in_size) { uint8_t byte; if (s->pos == 0) s->vli = 0; while (*in_pos < in_size) { byte = in[*in_pos]; ++*in_pos; s->vli |= (vli_type)(byte & 0x7F) << s->pos; if ((byte & 0x80) == 0) { /* Don't allow non-minimal encodings. */ if (byte == 0 && s->pos != 0) return XZ_DATA_ERROR; s->pos = 0; return XZ_STREAM_END; } s->pos += 7; if (s->pos == 7 * VLI_BYTES_MAX) return XZ_DATA_ERROR; } return XZ_OK; } /* * Decode the Compressed Data field from a Block. Update and validate * the observed compressed and uncompressed sizes of the Block so that * they don't exceed the values possibly stored in the Block Header * (validation assumes that no integer overflow occurs, since vli_type * is normally uint64_t). Update the CRC32 if presence of the CRC32 * field was indicated in Stream Header. * * Once the decoding is finished, validate that the observed sizes match * the sizes possibly stored in the Block Header. 
Update the hash and
 * Block count, which are later used to validate the Index field.
 */
static enum xz_ret dec_block(struct xz_dec *s, struct xz_buf *b)
{
	enum xz_ret ret;

	/* Remember where this call started so the deltas can be taken below. */
	s->in_start = b->in_pos;
	s->out_start = b->out_pos;

	/*
	 * Run the filter chain: the BCJ decoder wrapped around LZMA2 when
	 * the Block Header enabled it (s->bcj_active), plain LZMA2 otherwise.
	 */
#ifdef XZ_DEC_BCJ
	if (s->bcj_active)
		ret = xz_dec_bcj_run(s->bcj, s->lzma2, b);
	else
#endif
		ret = xz_dec_lzma2_run(s->lzma2, b);

	/* Account for what this call consumed and produced. */
	s->block.compressed += b->in_pos - s->in_start;
	s->block.uncompressed += b->out_pos - s->out_start;

	/*
	 * There is no need to separately check for VLI_UNKNOWN, since
	 * the observed sizes are always smaller than VLI_UNKNOWN.
	 */
	if (s->block.compressed > s->block_header.compressed
			|| s->block.uncompressed
				> s->block_header.uncompressed)
		return XZ_DATA_ERROR;

	/* Only the bytes written by this call are fed into the CRC32. */
	if (s->check_type == XZ_CHECK_CRC32)
		s->crc32 = xz_crc32(b->out + s->out_start,
				b->out_pos - s->out_start, s->crc32);

	if (ret == XZ_STREAM_END) {
		/* Sizes stored in the Block Header must match exactly. */
		if (s->block_header.compressed != VLI_UNKNOWN
				&& s->block_header.compressed
					!= s->block.compressed)
			return XZ_DATA_ERROR;

		if (s->block_header.uncompressed != VLI_UNKNOWN
				&& s->block_header.uncompressed
					!= s->block.uncompressed)
			return XZ_DATA_ERROR;

		/*
		 * Accumulate Block Header size + compressed size + Check
		 * size into the hash; Block Padding is not added here.
		 */
		s->block.hash.unpadded += s->block_header.size
				+ s->block.compressed;

#ifdef XZ_DEC_ANY_CHECK
		s->block.hash.unpadded += check_sizes[s->check_type];
#else
		if (s->check_type == XZ_CHECK_CRC32)
			s->block.hash.unpadded += 4;
#endif

		s->block.hash.uncompressed += s->block.uncompressed;

		/*
		 * The CRC32 is taken over the whole hash structure,
		 * including the previous crc32 member, and chained with
		 * the previous value. dec_index() does the same for
		 * s->index.hash.
		 */
		s->block.hash.crc32 = xz_crc32(
				(const uint8_t *)&s->block.hash,
				sizeof(s->block.hash), s->block.hash.crc32);

		++s->block.count;
	}

	return ret;
}

/*
 * Update the Index size and the CRC32 value.
 *
 * The input consumed between s->in_start and b->in_pos is added to
 * s->index.size and folded into s->crc32. s->in_start is not advanced
 * here.
 */
static void index_update(struct xz_dec *s, const struct xz_buf *b)
{
	size_t in_used = b->in_pos - s->in_start;

	s->index.size += in_used;
	s->crc32 = xz_crc32(b->in + s->in_start, in_used, s->crc32);
}

/*
 * Decode the Number of Records, Unpadded Size, and Uncompressed Size
 * fields from the Index field. That is, Index Padding and CRC32 are not
 * decoded by this function.
 *
 * This can return XZ_OK (more input needed), XZ_STREAM_END (everything
 * successfully decoded), or XZ_DATA_ERROR (input is corrupt).
 *
 * When dec_vli() does not finish an integer, index_update() is called
 * before returning so that the bytes consumed so far are counted.
 */
static enum xz_ret dec_index(struct xz_dec *s, struct xz_buf *b)
{
	enum xz_ret ret;

	do {
		ret = dec_vli(s, b->in, &b->in_pos, b->in_size);
		if (ret != XZ_STREAM_END) {
			index_update(s, b);
			return ret;
		}

		/* A complete integer is in s->vli; its meaning depends on state. */
		switch (s->index.sequence) {
		case SEQ_INDEX_COUNT:
			s->index.count = s->vli;

			/*
			 * Validate that the Number of Records field
			 * indicates the same number of Records as
			 * there were Blocks in the Stream.
			 */
			if (s->index.count != s->block.count)
				return XZ_DATA_ERROR;

			s->index.sequence = SEQ_INDEX_UNPADDED;
			break;

		case SEQ_INDEX_UNPADDED:
			s->index.hash.unpadded += s->vli;
			s->index.sequence = SEQ_INDEX_UNCOMPRESSED;
			break;

		case SEQ_INDEX_UNCOMPRESSED:
			/* Second half of a Record: update the hash like dec_block(). */
			s->index.hash.uncompressed += s->vli;
			s->index.hash.crc32 = xz_crc32(
					(const uint8_t *)&s->index.hash,
					sizeof(s->index.hash),
					s->index.hash.crc32);
			/* index.count doubles as the number of Records left. */
			--s->index.count;
			s->index.sequence = SEQ_INDEX_UNPADDED;
			break;
		}
	} while (s->index.count > 0);

	return XZ_STREAM_END;
}

/*
 * Validate that the next four input bytes match the value of s->crc32.
 * s->pos must be zero when starting to validate the first byte.
 *
 * Bytes are compared starting from the least significant byte of s->crc32;
 * s->pos holds the bit offset of the next byte to compare. Returns XZ_OK
 * if the input runs out first (s->pos is kept for the next call),
 * XZ_DATA_ERROR on the first mismatch, and XZ_STREAM_END after four
 * matching bytes, in which case s->crc32 and s->pos are reset to zero.
 */
static enum xz_ret crc32_validate(struct xz_dec *s, struct xz_buf *b)
{
	do {
		if (b->in_pos == b->in_size)
			return XZ_OK;

		if (((s->crc32 >> s->pos) & 0xFF) != b->in[b->in_pos++])
			return XZ_DATA_ERROR;

		s->pos += 8;

	} while (s->pos < 32);

	s->crc32 = 0;
	s->pos = 0;

	return XZ_STREAM_END;
}

#ifdef XZ_DEC_ANY_CHECK
/*
 * Skip over the Check field when the Check ID is not supported.
 * Returns true once the whole Check field has been skipped over.
 *
 * s->pos counts the bytes skipped so far and is kept across calls; it is
 * reset to zero when the function returns true.
 */
static bool check_skip(struct xz_dec *s, struct xz_buf *b)
{
	while (s->pos < check_sizes[s->check_type]) {
		if (b->in_pos == b->in_size)
			return false;

		++b->in_pos;
		++s->pos;
	}

	s->pos = 0;

	return true;
}
#endif

/* Decode the Stream Header field (the first 12 bytes of the .xz Stream). 
*/
static enum xz_ret dec_stream_header(struct xz_dec *s)
{
	/* The header has already been collected into s->temp.buf. */
	if (!memeq(s->temp.buf, HEADER_MAGIC, HEADER_MAGIC_SIZE))
		return XZ_FORMAT_ERROR;

	/* CRC32 of the two bytes after the magic must match the stored value. */
	if (xz_crc32(s->temp.buf + HEADER_MAGIC_SIZE, 2, 0)
			!= get_le32(s->temp.buf + HEADER_MAGIC_SIZE + 2))
		return XZ_DATA_ERROR;

	/* The first of those two bytes has to be zero. */
	if (s->temp.buf[HEADER_MAGIC_SIZE] != 0)
		return XZ_OPTIONS_ERROR;

	/*
	 * Of integrity checks, we support only none (Check ID = 0) and
	 * CRC32 (Check ID = 1). However, if XZ_DEC_ANY_CHECK is defined,
	 * we will accept other check types too, but then the check won't
	 * be verified and a warning (XZ_UNSUPPORTED_CHECK) will be given.
	 */
	if (s->temp.buf[HEADER_MAGIC_SIZE + 1] > XZ_CHECK_MAX)
		return XZ_OPTIONS_ERROR;

	s->check_type = s->temp.buf[HEADER_MAGIC_SIZE + 1];

#ifdef XZ_DEC_ANY_CHECK
	if (s->check_type > XZ_CHECK_CRC32)
		return XZ_UNSUPPORTED_CHECK;
#else
	if (s->check_type > XZ_CHECK_CRC32)
		return XZ_OPTIONS_ERROR;
#endif

	return XZ_OK;
}

/*
 * Decode the Stream Footer field (the last 12 bytes of the .xz Stream)
 *
 * The footer is read from s->temp.buf. Every mismatch is reported as
 * XZ_DATA_ERROR; success is reported as XZ_STREAM_END.
 */
static enum xz_ret dec_stream_footer(struct xz_dec *s)
{
	/* Footer magic sits in the last bytes of the field. */
	if (!memeq(s->temp.buf + 10, FOOTER_MAGIC, FOOTER_MAGIC_SIZE))
		return XZ_DATA_ERROR;

	/* The stored CRC32 (first four bytes) covers the six bytes after it. */
	if (xz_crc32(s->temp.buf + 4, 6, 0) != get_le32(s->temp.buf))
		return XZ_DATA_ERROR;

	/*
	 * Validate Backward Size. Note that we never added the size of the
	 * Index CRC32 field to s->index.size, thus we use s->index.size / 4
	 * instead of s->index.size / 4 - 1.
	 */
	if ((s->index.size >> 2) != get_le32(s->temp.buf + 4))
		return XZ_DATA_ERROR;

	/* Bytes 8-9 must be zero followed by the Check ID from the header. */
	if (s->temp.buf[8] != 0 || s->temp.buf[9] != s->check_type)
		return XZ_DATA_ERROR;

	/*
	 * Use XZ_STREAM_END instead of XZ_OK to be more convenient
	 * for the caller.
	 */
	return XZ_STREAM_END;
}

/*
 * Decode the Block Header and initialize the filter chain.
 *
 * The complete Block Header is expected in s->temp.buf with s->temp.size
 * set to its size. On success the stored sizes are placed in
 * s->block_header, the filter decoders are reset, the observed Block
 * sizes are zeroed and XZ_OK is returned.
 */
static enum xz_ret dec_block_header(struct xz_dec *s)
{
	enum xz_ret ret;

	/*
	 * Validate the CRC32. We know that the temp buffer is at least
	 * eight bytes so this is safe.
	 */
	s->temp.size -= 4;
	if (xz_crc32(s->temp.buf, s->temp.size, 0)
			!= get_le32(s->temp.buf + s->temp.size))
		return XZ_DATA_ERROR;

	/* Byte 0 is the size byte and byte 1 the Block Flags; parse from 2. */
	s->temp.pos = 2;

	/*
	 * Catch unsupported Block Flags. We support only one or two filters
	 * in the chain, so we catch that with the same test.
	 */
#ifdef XZ_DEC_BCJ
	if (s->temp.buf[1] & 0x3E)
#else
	if (s->temp.buf[1] & 0x3F)
#endif
		return XZ_OPTIONS_ERROR;

	/* Compressed Size */
	if (s->temp.buf[1] & 0x40) {
		/* The integer has to be complete within the header. */
		if (dec_vli(s, s->temp.buf, &s->temp.pos, s->temp.size)
					!= XZ_STREAM_END)
			return XZ_DATA_ERROR;

		s->block_header.compressed = s->vli;
	} else {
		s->block_header.compressed = VLI_UNKNOWN;
	}

	/* Uncompressed Size */
	if (s->temp.buf[1] & 0x80) {
		if (dec_vli(s, s->temp.buf, &s->temp.pos, s->temp.size)
				!= XZ_STREAM_END)
			return XZ_DATA_ERROR;

		s->block_header.uncompressed = s->vli;
	} else {
		s->block_header.uncompressed = VLI_UNKNOWN;
	}

#ifdef XZ_DEC_BCJ
	/* If there are two filters, the first one must be a BCJ filter. */
	s->bcj_active = s->temp.buf[1] & 0x01;
	if (s->bcj_active) {
		if (s->temp.size - s->temp.pos < 2)
			return XZ_OPTIONS_ERROR;

		/* The Filter ID byte is handed to the BCJ decoder as is. */
		ret = xz_dec_bcj_reset(s->bcj, s->temp.buf[s->temp.pos++]);
		if (ret != XZ_OK)
			return ret;

		/*
		 * We don't support custom start offset,
		 * so Size of Properties must be zero.
		 */
		if (s->temp.buf[s->temp.pos++] != 0x00)
			return XZ_OPTIONS_ERROR;
	}
#endif

	/* Valid Filter Flags always take at least two bytes. */
	if (s->temp.size - s->temp.pos < 2)
		return XZ_DATA_ERROR;

	/* Filter ID = LZMA2 */
	if (s->temp.buf[s->temp.pos++] != 0x21)
		return XZ_OPTIONS_ERROR;

	/* Size of Properties = 1-byte Filter Properties */
	if (s->temp.buf[s->temp.pos++] != 0x01)
		return XZ_OPTIONS_ERROR;

	/* Filter Properties contains LZMA2 dictionary size. */
	if (s->temp.size - s->temp.pos < 1)
		return XZ_DATA_ERROR;

	ret = xz_dec_lzma2_reset(s->lzma2, s->temp.buf[s->temp.pos++]);
	if (ret != XZ_OK)
		return ret;

	/* The rest must be Header Padding. */
	while (s->temp.pos < s->temp.size)
		if (s->temp.buf[s->temp.pos++] != 0x00)
			return XZ_OPTIONS_ERROR;

	/* Leave s->temp and the observed sizes clean for the next stage. */
	s->temp.pos = 0;
	s->block.compressed = 0;
	s->block.uncompressed = 0;

	return XZ_OK;
}

/*
 * Drive the decoder state machine selected by s->sequence.
 *
 * Each case either returns (XZ_OK when the input ran out, or whatever
 * status the stage reported) or advances s->sequence and falls through
 * to the next stage. Because s->sequence is updated before moving on,
 * a later call resumes at the stage that needed more input. The loop is
 * left only through a return statement.
 */
static enum xz_ret dec_main(struct xz_dec *s, struct xz_buf *b)
{
	enum xz_ret ret;

	/*
	 * Store the start position for the case when we are in the middle
	 * of the Index field.
	 */
	s->in_start = b->in_pos;

	while (true) {
		switch (s->sequence) {
		case SEQ_STREAM_HEADER:
			/*
			 * Stream Header is copied to s->temp, and then
			 * decoded from there. This way if the caller
			 * gives us only little input at a time, we can
			 * still keep the Stream Header decoding code
			 * simple. Similar approach is used in many places
			 * in this file.
			 */
			if (!fill_temp(s, b))
				return XZ_OK;

			/*
			 * If dec_stream_header() returns
			 * XZ_UNSUPPORTED_CHECK, it is still possible
			 * to continue decoding if working in multi-call
			 * mode. Thus, update s->sequence before calling
			 * dec_stream_header().
			 */
			s->sequence = SEQ_BLOCK_START;

			ret = dec_stream_header(s);
			if (ret != XZ_OK)
				return ret;

			fallthrough;

		case SEQ_BLOCK_START:
			/* We need one byte of input to continue. */
			if (b->in_pos == b->in_size)
				return XZ_OK;

			/* See if this is the beginning of the Index field. */
			if (b->in[b->in_pos] == 0) {
				/* The zero byte itself is part of the Index. */
				s->in_start = b->in_pos++;
				s->sequence = SEQ_INDEX;
				break;
			}

			/*
			 * Calculate the size of the Block Header and
			 * prepare to decode it.
			 */
			s->block_header.size
				= ((uint32_t)b->in[b->in_pos] + 1) * 4;

			s->temp.size = s->block_header.size;
			s->temp.pos = 0;
			s->sequence = SEQ_BLOCK_HEADER;

			fallthrough;

		case SEQ_BLOCK_HEADER:
			/* The size byte was not consumed above; it is copied too. */
			if (!fill_temp(s, b))
				return XZ_OK;

			ret = dec_block_header(s);
			if (ret != XZ_OK)
				return ret;

			s->sequence = SEQ_BLOCK_UNCOMPRESS;

			fallthrough;

		case SEQ_BLOCK_UNCOMPRESS:
			ret = dec_block(s, b);
			if (ret != XZ_STREAM_END)
				return ret;

			s->sequence = SEQ_BLOCK_PADDING;

			fallthrough;

		case SEQ_BLOCK_PADDING:
			/*
			 * Size of Compressed Data + Block Padding
			 * must be a multiple of four. We don't need
			 * s->block.compressed for anything else
			 * anymore, so we use it here to test the size
			 * of the Block Padding field.
			 */
			while (s->block.compressed & 3) {
				if (b->in_pos == b->in_size)
					return XZ_OK;

				if (b->in[b->in_pos++] != 0)
					return XZ_DATA_ERROR;

				++s->block.compressed;
			}

			s->sequence = SEQ_BLOCK_CHECK;

			fallthrough;

		case SEQ_BLOCK_CHECK:
			if (s->check_type == XZ_CHECK_CRC32) {
				ret = crc32_validate(s, b);
				if (ret != XZ_STREAM_END)
					return ret;
			}
			/* Any other Check ID: skip the field without verifying it. */
#ifdef XZ_DEC_ANY_CHECK
			else if (!check_skip(s, b)) {
				return XZ_OK;
			}
#endif

			s->sequence = SEQ_BLOCK_START;
			break;

		case SEQ_INDEX:
			ret = dec_index(s, b);
			if (ret != XZ_STREAM_END)
				return ret;

			s->sequence = SEQ_INDEX_PADDING;

			fallthrough;

		case SEQ_INDEX_PADDING:
			/*
			 * s->index.size only covers what index_update() has
			 * already counted, so the bytes consumed during this
			 * call are added for the alignment test.
			 */
			while ((s->index.size + (b->in_pos - s->in_start))
					& 3) {
				if (b->in_pos == b->in_size) {
					index_update(s, b);
					return XZ_OK;
				}

				if (b->in[b->in_pos++] != 0)
					return XZ_DATA_ERROR;
			}

			/* Finish the CRC32 value and Index size. */
			index_update(s, b);

			/* Compare the hashes to validate the Index field. */
			if (!memeq(&s->block.hash, &s->index.hash,
					sizeof(s->block.hash)))
				return XZ_DATA_ERROR;

			s->sequence = SEQ_INDEX_CRC32;

			fallthrough;

		case SEQ_INDEX_CRC32:
			ret = crc32_validate(s, b);
			if (ret != XZ_STREAM_END)
				return ret;

			s->temp.size = STREAM_HEADER_SIZE;
			s->sequence = SEQ_STREAM_FOOTER;

			fallthrough;

		case SEQ_STREAM_FOOTER:
			if (!fill_temp(s, b))
				return XZ_OK;

			return dec_stream_footer(s);
		}
	}

	/* Never reached */
}

/*
 * xz_dec_run() is a wrapper for dec_main() to handle some special cases in
 * multi-call and single-call decoding.
 *
 * In multi-call mode, we must return XZ_BUF_ERROR when it seems clear that we
 * are not going to make any progress anymore. This is to prevent the caller
 * from calling us infinitely when the input file is truncated or otherwise
 * corrupt. 
Since zlib-style API allows that the caller fills the input buffer * only when the decoder doesn't produce any new output, we have to be careful * to avoid returning XZ_BUF_ERROR too easily: XZ_BUF_ERROR is returned only * after the second consecutive call to xz_dec_run() that makes no progress. * * In single-call mode, if we couldn't decode everything and no error * occurred, either the input is truncated or the output buffer is too small. * Since we know that the last input byte never produces any output, we know * that if all the input was consumed and decoding wasn't finished, the file * must be corrupt. Otherwise the output buffer has to be too small or the * file is corrupt in a way that decoding it produces too big output. * * If single-call decoding fails, we reset b->in_pos and b->out_pos back to * their original values. This is because with some filter chains there won't * be any valid uncompressed data in the output buffer unless the decoding * actually succeeds (that's the price to pay of using the output buffer as * the workspace). */ enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b) { size_t in_start; size_t out_start; enum xz_ret ret; if (DEC_IS_SINGLE(s->mode)) xz_dec_reset(s); in_start = b->in_pos; out_start = b->out_pos; ret = dec_main(s, b); if (DEC_IS_SINGLE(s->mode)) { if (ret == XZ_OK) ret = b->in_pos == b->in_size ? 
XZ_DATA_ERROR : XZ_BUF_ERROR; if (ret != XZ_STREAM_END) { b->in_pos = in_start; b->out_pos = out_start; } } else if (ret == XZ_OK && in_start == b->in_pos && out_start == b->out_pos) { if (s->allow_buf_error) ret = XZ_BUF_ERROR; s->allow_buf_error = true; } else { s->allow_buf_error = false; } return ret; } struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max) { struct xz_dec *s = kmalloc_obj(*s); if (s == NULL) return NULL; s->mode = mode; #ifdef XZ_DEC_BCJ s->bcj = xz_dec_bcj_create(DEC_IS_SINGLE(mode)); if (s->bcj == NULL) goto error_bcj; #endif s->lzma2 = xz_dec_lzma2_create(mode, dict_max); if (s->lzma2 == NULL) goto error_lzma2; xz_dec_reset(s); return s; error_lzma2: #ifdef XZ_DEC_BCJ xz_dec_bcj_end(s->bcj); error_bcj: #endif kfree(s); return NULL; } void xz_dec_reset(struct xz_dec *s) { s->sequence = SEQ_STREAM_HEADER; s->allow_buf_error = false; s->pos = 0; s->crc32 = 0; memzero(&s->block, sizeof(s->block)); memzero(&s->index, sizeof(s->index)); s->temp.pos = 0; s->temp.size = STREAM_HEADER_SIZE; } void xz_dec_end(struct xz_dec *s) { if (s != NULL) { xz_dec_lzma2_end(s->lzma2); #ifdef XZ_DEC_BCJ xz_dec_bcj_end(s->bcj); #endif kfree(s); } } |
| 10 7 16 10 14 3 16 8 9 7 7 5 6 1 4 16 7 7 3 55 1 19 1 34 5 6 42 42 8 34 4 2 10 29 9 5 24 1 3 2 2 4 8 8 8 35 41 34 9 9 7 25 1 3 2 15 9 8 4 4 12 4 14 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 
489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 | // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/plist.h> #include <linux/sched/signal.h> #include "futex.h" #include "../locking/rtmutex_common.h" /* * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an * underlying rtmutex. 
The task which is about to be requeued could have * just woken up (timeout, signal). After the wakeup the task has to * acquire the hash bucket lock, which is held by the requeue code. As a task * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking * and the hash bucket lock blocking would collide and corrupt state. * * On !PREEMPT_RT this is not a problem and everything could be serialized * on the hash bucket lock, but aside from having the benefit of common code, * this allows avoiding the requeue when the task is already on the * way out, and avoiding taking the hash bucket lock of the original uaddr1 * when the requeue has been completed. * * The following state transitions are valid: * * On the waiter side: * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT * * On the requeue side: * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IN_PROGRESS * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) * * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this * signals that the waiter is already on the way out. It also means that * the waiter is still on the 'wait' futex, i.e. uaddr1. * * The waiter side signals early wakeup to the requeue side either through * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately * proceed to take the hash bucket lock of uaddr1. If it set the state to WAIT, * which means the wakeup is interleaving with a requeue in progress, it has * to wait for the requeue side to change the state, either to DONE/LOCKED * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex * and either blocked (DONE) or has acquired it (LOCKED). 
IGNORE is set by * the requeue side when the requeue attempt failed via deadlock detection * and therefore the waiter q is still on the uaddr1 futex. */ enum { Q_REQUEUE_PI_NONE = 0, Q_REQUEUE_PI_IGNORE, Q_REQUEUE_PI_IN_PROGRESS, Q_REQUEUE_PI_WAIT, Q_REQUEUE_PI_DONE, Q_REQUEUE_PI_LOCKED, }; const struct futex_q futex_q_init = { /* list gets initialized in futex_queue()*/ .wake = futex_wake_mark, .key = FUTEX_KEY_INIT, .bitset = FUTEX_BITSET_MATCH_ANY, .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), }; /** * requeue_futex() - Requeue a futex_q from one hb to another * @q: the futex_q to requeue * @hb1: the source hash_bucket * @hb2: the target hash_bucket * @key2: the new key for the requeued futex_q */ static inline void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2, union futex_key *key2) { /* * If key1 and key2 hash to the same bucket, no need to * requeue. */ if (likely(&hb1->chain != &hb2->chain)) { plist_del(&q->list, &hb1->chain); futex_hb_waiters_dec(hb1); futex_hb_waiters_inc(hb2); plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; /* * hb1 and hb2 belong to the same futex_hash_bucket_private * because if we managed get a reference on hb1 then it can't be * replaced. Therefore we avoid put(hb1)+get(hb2) here. */ } q->key = *key2; } static inline bool futex_requeue_pi_prepare(struct futex_q *q, struct futex_pi_state *pi_state) { int old, new; /* * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has * already set Q_REQUEUE_PI_IGNORE to signal that requeue should * ignore the waiter. */ old = atomic_read_acquire(&q->requeue_state); do { if (old == Q_REQUEUE_PI_IGNORE) return false; /* * futex_proxy_trylock_atomic() might have set it to * IN_PROGRESS and a interleaved early wake to WAIT. * * It was considered to have an extra state for that * trylock, but that would just add more conditionals * all over the place for a dubious value. 
*/ if (old != Q_REQUEUE_PI_NONE) break; new = Q_REQUEUE_PI_IN_PROGRESS; } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); q->pi_state = pi_state; return true; } static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) { int old, new; old = atomic_read_acquire(&q->requeue_state); do { if (old == Q_REQUEUE_PI_IGNORE) return; if (locked >= 0) { /* Requeue succeeded. Set DONE or LOCKED */ WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && old != Q_REQUEUE_PI_WAIT); new = Q_REQUEUE_PI_DONE + locked; } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { /* Deadlock, no early wakeup interleave */ new = Q_REQUEUE_PI_NONE; } else { /* Deadlock, early wakeup interleave. */ WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); new = Q_REQUEUE_PI_IGNORE; } } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); #ifdef CONFIG_PREEMPT_RT /* If the waiter interleaved with the requeue let it know */ if (unlikely(old == Q_REQUEUE_PI_WAIT)) rcuwait_wake_up(&q->requeue_wait); #endif } static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) { int old, new; old = atomic_read_acquire(&q->requeue_state); do { /* Is requeue done already? */ if (old >= Q_REQUEUE_PI_DONE) return old; /* * If not done, then tell the requeue code to either ignore * the waiter or to wake it up once the requeue is done. */ new = Q_REQUEUE_PI_WAIT; if (old == Q_REQUEUE_PI_NONE) new = Q_REQUEUE_PI_IGNORE; } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); /* If the requeue was in progress, wait for it to complete */ if (old == Q_REQUEUE_PI_IN_PROGRESS) { #ifdef CONFIG_PREEMPT_RT rcuwait_wait_event(&q->requeue_wait, atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, TASK_UNINTERRUPTIBLE); #else (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); #endif } /* * Requeue is now either prohibited or complete. Reread state * because during the wait above it might have changed. Nothing * will modify q->requeue_state after this point. 
*/ return atomic_read(&q->requeue_state); } /** * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * @q: the futex_q * @key: the key of the requeue target futex * @hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. * * 1) Set @q::key to the requeue target futex key so the waiter can detect * the wakeup on the right futex. * * 2) Dequeue @q from the hash bucket. * * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock * acquisition. * * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that * the waiter has to fixup the pi state. * * 5) Complete the requeue state so the waiter can make progress. After * this point the waiter task can return from the syscall immediately in * case that the pi state does not have to be fixed up. * * 6) Wake the waiter task. * * Must be called with both q->lock_ptr and hb->lock held. */ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { struct task_struct *task; q->key = *key; __futex_unqueue(q); WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; /* * Acquire a reference for the waiter to ensure valid * futex_q::lock_ptr. 
*/
	futex_hash_get(hb);
	/* Tells the waiter to drop that reference again before it returns */
	q->drop_hb_ref = true;
	q->lock_ptr = &hb->lock;

	/*
	 * Sample the task pointer before the requeue state is completed.
	 * NOTE(review): presumably because @q must not be touched anymore
	 * once the waiter is allowed to make progress - confirm.
	 */
	task = READ_ONCE(q->task);

	/* Signal locked state to the waiter */
	futex_requeue_pi_complete(q, 1);
	wake_up_state(task, TASK_NORMAL);
}

/**
 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
 * @pifutex:		the user address of the to futex
 * @hb1:		the from futex hash bucket, must be locked by the caller
 * @hb2:		the to futex hash bucket, must be locked by the caller
 * @key1:		the from futex key
 * @key2:		the to futex key
 * @ps:			address to store the pi_state pointer
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Try and get the lock on behalf of the top waiter if we can do it atomically.
 * Wake the top waiter if we succeed.  If the caller specified set_waiters,
 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
 * hb1 and hb2 must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 *
 * Return:
 *  -  0 - failed to acquire the lock atomically, or there is no waiter;
 *  -  1 - acquired the lock, the top waiter has been dequeued and woken;
 *  - <0 - error: -EFAULT, -EINVAL, -EAGAIN or an error returned by
 *	   futex_lock_pi_atomic()
 *
 * NOTE(review): this comment used to describe the success value as ">0 ...
 * vpid of the top_waiter", but this function and futex_requeue() only treat
 * a return value of exactly 1 as "lock acquired" - confirm against
 * futex_lock_pi_atomic().
 */
static int
futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
			   struct futex_hash_bucket *hb2, union futex_key *key1,
			   union futex_key *key2, struct futex_pi_state **ps,
			   struct task_struct **exiting, int set_waiters)
{
	struct futex_q *top_waiter;
	u32 curval;
	int ret;

	/* Only checks that @pifutex is readable; curval is not used below */
	if (futex_get_value_locked(&curval, pifutex))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Find the top_waiter and determine if there are additional waiters.
	 * If the caller intends to requeue more than 1 waiter to pifutex,
	 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
	 * as we have means to handle the possible fault.  If not, don't set
	 * the bit unnecessarily as it will force the subsequent unlock to enter
	 * the kernel.
	 */
	top_waiter = futex_top_waiter(hb1, key1);

	/* There are no waiters, nothing for us to do. */
	if (!top_waiter)
		return 0;

	/*
	 * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
	 * and waiting on the 'waitqueue' futex which is always !PI.
	 */
	if (!top_waiter->rt_waiter || top_waiter->pi_state)
		return -EINVAL;

	/* Ensure we requeue to the expected futex. */
	if (!futex_match(top_waiter->requeue_pi_key, key2))
		return -EINVAL;

	/* Ensure that this does not race against an early wakeup */
	if (!futex_requeue_pi_prepare(top_waiter, NULL))
		return -EAGAIN;

	/*
	 * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
	 * in the contended case or if @set_waiters is true.
	 *
	 * In the contended case PI state is attached to the lock owner. If
	 * the user space lock can be acquired then PI state is attached to
	 * the new owner (@top_waiter->task) when @set_waiters is true.
	 */
	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
				   exiting, set_waiters);
	if (ret == 1) {
		/*
		 * Lock was acquired in user space and PI state was
		 * attached to @top_waiter->task. That means state is fully
		 * consistent and the waiter can return to user space
		 * immediately after the wakeup.
		 */
		requeue_pi_wake_futex(top_waiter, key2, hb2);
	} else if (ret < 0) {
		/* Rewind top_waiter::requeue_state */
		futex_requeue_pi_complete(top_waiter, ret);
	} else {
		/*
		 * futex_lock_pi_atomic() did not acquire the user space
		 * futex, but managed to establish the proxy lock and pi
		 * state. top_waiter::requeue_state cannot be fixed up here
		 * because the waiter is not enqueued on the rtmutex
		 * yet. This is handled at the callsite depending on the
		 * result of rt_mutex_start_proxy_lock() which is
		 * guaranteed to be reached with this function returning 0.
*/
	}
	return ret;
}

/**
 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
 * @uaddr1:	source futex user address
 * @flags1:	futex flags (FLAGS_SHARED, etc.)
 * @uaddr2:	target futex user address
 * @flags2:	futex flags (FLAGS_SHARED, etc.)
 * @nr_wake:	number of waiters to wake (must be 1 for requeue_pi)
 * @nr_requeue:	number of waiters to requeue (0-INT_MAX)
 * @cmpval:	@uaddr1 expected value (or %NULL)
 * @requeue_pi:	if we are attempting to requeue from a non-pi futex to a
 *		pi futex (pi to pi requeue is not supported)
 *
 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
 * uaddr2 atomically on behalf of the top waiter.
 *
 * Return:
 *  - >=0 - on success, the number of tasks requeued or woken;
 *  -  <0 - on error: -EINVAL for invalid arguments or mismatched waiters,
 *	    -ENOSYS for requeue_pi without CONFIG_FUTEX_PI, -ENOMEM when the
 *	    pi_state cache cannot be refilled, -EAGAIN when the value at
 *	    @uaddr1 differs from *@cmpval, or an error propagated from one
 *	    of the helpers called below
 */
int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
		  u32 __user *uaddr2, unsigned int flags2,
		  int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
{
	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
	int task_count = 0, ret;
	struct futex_pi_state *pi_state = NULL;
	struct futex_q *this, *next;
	DEFINE_WAKE_Q(wake_q);

	if (nr_wake < 0 || nr_requeue < 0)
		return -EINVAL;

	/*
	 * When PI not supported: return -ENOSYS if requeue_pi is true,
	 * consequently the compiler knows requeue_pi is always false past
	 * this point which will optimize away all the conditional code
	 * further down.
	 */
	if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
		return -ENOSYS;

	if (requeue_pi) {
		/*
		 * Requeue PI only works on two distinct uaddrs. This
		 * check is only valid for private futexes. See below.
		 */
		if (uaddr1 == uaddr2)
			return -EINVAL;

		/*
		 * futex_requeue() allows the caller to define the number
		 * of waiters to wake up via the @nr_wake argument. With
		 * REQUEUE_PI, waking up more than one waiter is creating
		 * more problems than it solves. Waking up a waiter makes
		 * only sense if the PI futex @uaddr2 is uncontended as
		 * this allows the requeue code to acquire the futex
		 * @uaddr2 before waking the waiter. The waiter can then
		 * return to user space without further action. A secondary
		 * wakeup would just make the futex_wait_requeue_pi()
		 * handling more complex, because that code would have to
		 * look up pi_state and do more or less all the handling
		 * which the requeue code has to do for the to be requeued
		 * waiters. So restrict the number of waiters to wake to
		 * one, and only wake it up when the PI futex is
		 * uncontended. Otherwise requeue it and let the unlock of
		 * the PI futex handle the wakeup.
		 *
		 * All REQUEUE_PI users, e.g. pthread_cond_signal() and
		 * pthread_cond_broadcast() must use nr_wake=1.
		 */
		if (nr_wake != 1)
			return -EINVAL;

		/*
		 * requeue_pi requires a pi_state, try to allocate it now
		 * without any locks in case it fails.
		 */
		if (refill_pi_state_cache())
			return -ENOMEM;
	}

retry:
	ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ);
	if (unlikely(ret != 0))
		return ret;
	ret = get_futex_key(uaddr2, flags2, &key2,
			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
	if (unlikely(ret != 0))
		return ret;

	/*
	 * The check above which compares uaddrs is not sufficient for
	 * shared futexes. We need to compare the keys:
	 */
	if (requeue_pi && futex_match(&key1, &key2))
		return -EINVAL;

retry_private:
	/*
	 * NOTE(review): the `if (1)` block presumably only bounds the scope
	 * of the CLASS(hb, ...) variables - confirm against CLASS().
	 */
	if (1) {
		CLASS(hb, hb1)(&key1);
		CLASS(hb, hb2)(&key2);

		futex_hb_waiters_inc(hb2);
		double_lock_hb(hb1, hb2);

		if (likely(cmpval != NULL)) {
			u32 curval;

			ret = futex_get_value_locked(&curval, uaddr1);

			if (unlikely(ret)) {
				/*
				 * The locked read failed. Drop the locks,
				 * retry the read with get_user() and start
				 * over: private futexes keep their keys,
				 * shared ones look them up again.
				 */
				futex_hb_waiters_dec(hb2);
				double_unlock_hb(hb1, hb2);

				ret = get_user(curval, uaddr1);
				if (ret)
					return ret;

				if (!(flags1 & FLAGS_SHARED))
					goto retry_private;

				goto retry;
			}
			if (curval != *cmpval) {
				ret = -EAGAIN;
				goto out_unlock;
			}
		}

		if (requeue_pi) {
			struct task_struct *exiting = NULL;

			/*
			 * Attempt to acquire uaddr2 and wake the top waiter. If we
			 * intend to requeue waiters, force setting the FUTEX_WAITERS
			 * bit.  We force this here where we are able to easily handle
			 * faults rather in the requeue loop below.
			 *
			 * Updates topwaiter::requeue_state if a top waiter exists.
			 */
			ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
							 &key2, &pi_state,
							 &exiting, nr_requeue);

			/*
			 * At this point the top_waiter has either taken uaddr2 or
			 * is waiting on it. In both cases pi_state has been
			 * established and an initial refcount on it. In case of an
			 * error there's nothing.
			 *
			 * The top waiter's requeue_state is up to date:
			 *
			 *  - If the lock was acquired atomically (ret == 1), then
			 *    the state is Q_REQUEUE_PI_LOCKED.
			 *
			 *    The top waiter has been dequeued and woken up and can
			 *    return to user space immediately. The kernel/user
			 *    space state is consistent. In case that there must be
			 *    more waiters requeued the WAITERS bit in the user
			 *    space futex is set so the top waiter task has to go
			 *    into the syscall slowpath to unlock the futex. This
			 *    will block until this requeue operation has been
			 *    completed and the hash bucket locks have been
			 *    dropped.
			 *
			 *  - If the trylock failed with an error (ret < 0) then
			 *    the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
			 *    happened", or Q_REQUEUE_PI_IGNORE when there was an
			 *    interleaved early wakeup.
			 *
			 *  - If the trylock did not succeed (ret == 0) then the
			 *    state is either Q_REQUEUE_PI_IN_PROGRESS or
			 *    Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
			 *    This will be cleaned up in the loop below, which
			 *    cannot fail because futex_proxy_trylock_atomic() did
			 *    the same sanity checks for requeue_pi as the loop
			 *    below does.
			 */
			switch (ret) {
			case 0:
				/* We hold a reference on the pi state. */
				break;

			case 1:
				/*
				 * futex_proxy_trylock_atomic() acquired the user space
				 * futex. Adjust task_count.
				 */
				task_count++;
				ret = 0;
				break;

			/*
			 * If the above failed, then pi_state is NULL and
			 * waiter::requeue_state is correct.
			 */
			case -EFAULT:
				futex_hb_waiters_dec(hb2);
				double_unlock_hb(hb1, hb2);
				ret = fault_in_user_writeable(uaddr2);
				if (!ret)
					goto retry;
				return ret;
			case -EBUSY:
			case -EAGAIN:
				/*
				 * Two reasons for this:
				 * - EBUSY: Owner is exiting and we just wait for the
				 *   exit to complete.
				 * - EAGAIN: The user space value changed.
				 */
				futex_hb_waiters_dec(hb2);
				double_unlock_hb(hb1, hb2);
				/*
				 * Handle the case where the owner is in the middle of
				 * exiting. Wait for the exit to complete otherwise
				 * this task might loop forever, aka. live lock.
				 */
				wait_for_owner_exiting(ret, exiting);
				cond_resched();
				goto retry;
			default:
				goto out_unlock;
			}
		}

		plist_for_each_entry_safe(this, next, &hb1->chain, list) {
			/* Stop once nr_wake plus nr_requeue waiters were handled */
			if (task_count - nr_wake >= nr_requeue)
				break;

			if (!futex_match(&this->key, &key1))
				continue;

			/*
			 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
			 * be paired with each other and no other futex ops.
			 *
			 * We should never be requeueing a futex_q with a pi_state,
			 * which is awaiting a futex_unlock_pi().
			 */
			if ((requeue_pi && !this->rt_waiter) ||
			    (!requeue_pi && this->rt_waiter) ||
			    this->pi_state) {
				ret = -EINVAL;
				break;
			}

			/* Plain futexes just wake or requeue and are done */
			if (!requeue_pi) {
				if (++task_count <= nr_wake)
					this->wake(&wake_q, this);
				else
					requeue_futex(this, hb1, hb2, &key2);
				continue;
			}

			/* Ensure we requeue to the expected futex for requeue_pi. */
			if (!futex_match(this->requeue_pi_key, &key2)) {
				ret = -EINVAL;
				break;
			}

			/*
			 * Requeue nr_requeue waiters and possibly one more in the case
			 * of requeue_pi if we couldn't acquire the lock atomically.
			 *
			 * Prepare the waiter to take the rt_mutex. Take a refcount
			 * on the pi_state and store the pointer in the futex_q
			 * object of the waiter.
			 */
			get_pi_state(pi_state);

			/* Don't requeue when the waiter is already on the way out. */
			if (!futex_requeue_pi_prepare(this, pi_state)) {
				/*
				 * Early woken waiter signaled that it is on the
				 * way out. Drop the pi_state reference and try the
				 * next waiter. @this->pi_state is still NULL.
				 */
				put_pi_state(pi_state);
				continue;
			}

			ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
							this->rt_waiter,
							this->task);

			if (ret == 1) {
				/*
				 * We got the lock. We do neither drop the refcount
				 * on pi_state nor clear this->pi_state because the
				 * waiter needs the pi_state for cleaning up the
				 * user space value. It will drop the refcount
				 * after doing so. this::requeue_state is updated
				 * in the wakeup as well.
				 */
				requeue_pi_wake_futex(this, &key2, hb2);
				task_count++;
			} else if (!ret) {
				/* Waiter is queued, move it to hb2 */
				requeue_futex(this, hb1, hb2, &key2);
				futex_requeue_pi_complete(this, 0);
				task_count++;
			} else {
				/*
				 * rt_mutex_start_proxy_lock() detected a potential
				 * deadlock when we tried to queue that waiter.
				 * Drop the pi_state reference which we took above
				 * and remove the pointer to the state from the
				 * waiters futex_q object.
				 */
				this->pi_state = NULL;
				put_pi_state(pi_state);
				futex_requeue_pi_complete(this, ret);
				/*
				 * We stop queueing more waiters and let user space
				 * deal with the mess.
				 */
				break;
			}
		}

		/*
		 * We took an extra initial reference to the pi_state in
		 * futex_proxy_trylock_atomic(). We need to drop it here again.
		 */
		put_pi_state(pi_state);

out_unlock:
		futex_hb_waiters_dec(hb2);
		double_unlock_hb(hb1, hb2);
	}
	/* Wake the tasks collected on wake_q after the locks were dropped */
	wake_up_q(&wake_q);
	return ret ? ret : task_count;
}

/**
 * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex
 * @hb:		the hash_bucket futex_q was original enqueued on
 * @q:		the futex_q woken while waiting to be requeued
 * @timeout:	the timeout associated with the wait (NULL if none)
 *
 * Determine the cause for the early wakeup.
 *
 * Return:
 *  -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR
 */
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
				   struct futex_q *q,
				   struct hrtimer_sleeper *timeout)
{
	int ret;

	/*
	 * With the hb lock held, we avoid races while we process the wakeup.
	 * We only need to hold hb (and not hb2) to ensure atomicity as the
	 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
	 * It can't be requeued from uaddr2 to something else since we don't
	 * support a PI aware source futex for requeue.
*/
	WARN_ON_ONCE(&hb->lock != q->lock_ptr);

	/*
	 * We were woken prior to requeue by a timeout or a signal.
	 * Unqueue the futex_q and determine which it was.
	 */
	plist_del(&q->list, &hb->chain);
	futex_hb_waiters_dec(hb);

	/* Handle spurious wakeups gracefully */
	ret = -EWOULDBLOCK;
	if (timeout && !timeout->task)
		ret = -ETIMEDOUT;
	else if (signal_pending(current))
		ret = -ERESTARTNOINTR;
	return ret;
}

/**
 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
 * @uaddr:	the futex we initially wait on (non-pi)
 * @flags:	futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
 *		the same type, no requeueing from private to shared, etc.
 * @val:	the expected value of uaddr
 * @abs_time:	absolute timeout
 * @bitset:	32 bit wakeup bitset set by userspace, defaults to all
 * @uaddr2:	the pi futex we will take prior to returning to user-space
 *
 * The caller will wait on uaddr and will be requeued by futex_requeue() to
 * uaddr2 which must be PI aware and unique from uaddr.  Normal wakeup will wake
 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
 * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
 * without one, the pi logic would not know which task to boost/deboost, if
 * there was a need to.
 *
 * We call schedule in futex_wait_queue() when we enqueue and return there
 * via the following--
 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
 * 2) wakeup on uaddr2 after a requeue
 * 3) signal
 * 4) timeout
 *
 * If 3, cleanup and return -ERESTARTNOINTR.
 *
 * If 2, we may then block on trying to take the rt_mutex and return via:
 * 5) successful lock
 * 6) signal
 * 7) timeout
 * 8) other lock acquisition failure
 *
 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
 *
 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
 *
 * Return:
 *  -  0 - On success;
 *  - <0 - On error, e.g. -ENOSYS without CONFIG_FUTEX_PI and -EINVAL when
 *	   @uaddr equals @uaddr2 or @bitset is zero
 */
int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
			  u32 val, ktime_t *abs_time, u32 bitset,
			  u32 __user *uaddr2)
{
	struct hrtimer_sleeper timeout, *to;
	struct rt_mutex_waiter rt_waiter;
	union futex_key key2 = FUTEX_KEY_INIT;
	struct futex_q q = futex_q_init;
	struct rt_mutex_base *pi_mutex;
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	if (uaddr == uaddr2)
		return -EINVAL;

	if (!bitset)
		return -EINVAL;

	to = futex_setup_timer(abs_time, &timeout, flags,
			       current->timer_slack_ns);

	/*
	 * The waiter is allocated on our stack, manipulated by the requeue
	 * code while we sleep on uaddr.
	 */
	rt_mutex_init_waiter(&rt_waiter);

	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

	q.bitset = bitset;
	q.rt_waiter = &rt_waiter;
	q.requeue_pi_key = &key2;

	/*
	 * Prepare to wait on uaddr. On success, it holds hb->lock and q
	 * is initialized.
	 */
	ret = futex_wait_setup(uaddr, val, flags, &q, &key2, current);
	if (ret)
		goto out;

	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
	futex_do_wait(&q, to);

	/* ret is 0 here; each case below decides the final return value */
	switch (futex_requeue_pi_wakeup_sync(&q)) {
	case Q_REQUEUE_PI_IGNORE:
		{
			CLASS(hb, hb)(&q.key);
			/* The waiter is still on uaddr1 */
			spin_lock(&hb->lock);
			ret = handle_early_requeue_pi_wakeup(hb, &q, to);
			spin_unlock(&hb->lock);
		}
		break;

	case Q_REQUEUE_PI_LOCKED:
		/* The requeue acquired the lock */
		if (q.pi_state && (q.pi_state->owner != current)) {
			futex_q_lockptr_lock(&q);
			ret = fixup_pi_owner(uaddr2, &q, true);
			/*
			 * Drop the reference to the pi state which the
			 * requeue_pi() code acquired for us.
			 */
			put_pi_state(q.pi_state);
			spin_unlock(q.lock_ptr);
			/*
			 * Adjust the return value. It's either -EFAULT or
			 * success (1) but the caller expects 0 for success.
			 */
			ret = ret < 0 ? ret : 0;
		}
		break;

	case Q_REQUEUE_PI_DONE:
		/* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
		pi_mutex = &q.pi_state->pi_mutex;
		ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);

		/*
		 * See futex_unlock_pi()'s cleanup: comment.
		 */
		if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
			ret = 0;

		futex_q_lockptr_lock(&q);
		debug_rt_mutex_free_waiter(&rt_waiter);
		/*
		 * Fixup the pi_state owner and possibly acquire the lock if we
		 * haven't already.
		 */
		res = fixup_pi_owner(uaddr2, &q, !ret);
		/*
		 * If fixup_pi_owner() returned an error, propagate that.  If it
		 * acquired the lock, clear -ETIMEDOUT or -EINTR.
		 */
		if (res)
			ret = (res < 0) ? res : 0;

		futex_unqueue_pi(&q);
		spin_unlock(q.lock_ptr);

		if (ret == -EINTR) {
			/*
			 * We've already been requeued, but cannot restart
			 * by calling futex_lock_pi() directly. We could
			 * restart this syscall, but it would detect that
			 * the user space "val" changed and return
			 * -EWOULDBLOCK.  Save the overhead of the restart
			 * and return -EWOULDBLOCK directly.
			 */
			ret = -EWOULDBLOCK;
		}
		break;
	default:
		/* futex_requeue_pi_wakeup_sync() returned an unexpected state */
		BUG();
	}
	if (q.drop_hb_ref) {
		CLASS(hb, hb)(&q.key);
		/* Additional reference from requeue_pi_wake_futex() */
		futex_hash_put(hb);
	}

out:
	/* Tear down the timer, if one was set up */
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret;
} |
| 7 7 1 3 4 4 4 4 4 4 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 | // SPDX-License-Identifier: GPL-2.0-or-later
/*
 * mmap.c
 *
 * Code to deal with the mess that is clustered mmap.
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/signal.h>
#include <linux/rbtree.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "aops.h"
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
#include "mmap.h"
#include "super.h"
#include "ocfs2_trace.h"

/*
 * ->fault handler: run the generic filemap_fault() between
 * ocfs2_block_signals() and ocfs2_unblock_signals(), emit the ocfs2_fault
 * trace event and return the result of filemap_fault() unchanged.
 */
static vm_fault_t ocfs2_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	sigset_t oldset;
	vm_fault_t ret;

	ocfs2_block_signals(&oldset);
	ret = filemap_fault(vmf);
	ocfs2_unblock_signals(&oldset);

	trace_ocfs2_fault(OCFS2_I(vma->vm_file->f_mapping->host)->ip_blkno,
			  vma, vmf->page, vmf->pgoff);
	return ret;
}

/*
 * Make @folio of @file writable by running it through the ocfs2 write
 * begin/end code so that its blocks get allocated. @di_bh is passed on to
 * ocfs2_write_begin_nolock().
 *
 * Returns VM_FAULT_LOCKED on success, VM_FAULT_NOPAGE when the VM should
 * retry the fault and vmf_error() of the failure otherwise.
 */
static vm_fault_t __ocfs2_page_mkwrite(struct file *file,
			struct buffer_head *di_bh, struct folio *folio)
{
	int err;
	vm_fault_t ret = VM_FAULT_NOPAGE;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	loff_t pos = folio_pos(folio);
	unsigned int len = PAGE_SIZE;
	pgoff_t last_index;
	struct folio *locked_folio = NULL;
	void *fsdata;
	loff_t size = i_size_read(inode);

	/* Index of the page which contains the last byte of the file */
	last_index = (size - 1) >> PAGE_SHIFT;

	/*
	 * There are cases that lead to the page no longer
belonging to the
	 * mapping.
	 * 1) pagecache truncates locally due to memory pressure.
	 * 2) pagecache truncates when another is taking EX lock against
	 * inode lock. see ocfs2_data_convert_worker.
	 *
	 * The i_size check doesn't catch the case where nodes truncated and
	 * then re-extended the file. We'll re-check the page mapping after
	 * taking the page lock inside of ocfs2_write_begin_nolock().
	 *
	 * Let VM retry with these cases.
	 */
	if ((folio->mapping != inode->i_mapping) ||
	    !folio_test_uptodate(folio) ||
	    (pos >= size))
		goto out;

	/*
	 * Call ocfs2_write_begin() and ocfs2_write_end() to take
	 * advantage of the allocation code there. We pass a write
	 * length of the whole page (chopped to i_size) to make sure
	 * the whole thing is allocated.
	 *
	 * Since we know the page is up to date, we don't have to
	 * worry about ocfs2_write_begin() skipping some buffer reads
	 * because the "write" would invalidate their data.
	 */
	if (folio->index == last_index)
		len = ((size - 1) & ~PAGE_MASK) + 1;

	err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
				       &locked_folio, &fsdata, di_bh, folio);
	if (err) {
		/* -ENOSPC is not logged, every other error is */
		if (err != -ENOSPC)
			mlog_errno(err);
		ret = vmf_error(err);
		goto out;
	}

	/* No folio was handed back locked: let the VM retry the fault */
	if (!locked_folio) {
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	err = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata);
	/* Anything but the full length being written is treated as a bug */
	BUG_ON(err != len);
	ret = VM_FAULT_LOCKED;
out:
	return ret;
}

/*
 * ->page_mkwrite handler: take the inode cluster lock and ip_alloc_sem
 * around __ocfs2_page_mkwrite(), with signals blocked and inside an
 * sb_start_pagefault()/sb_end_pagefault() section.
 *
 * Returns the result of __ocfs2_page_mkwrite(), or vmf_error() of the
 * failure when the inode lock cannot be taken.
 */
static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct buffer_head *di_bh = NULL;
	sigset_t oldset;
	int err;
	vm_fault_t ret;

	sb_start_pagefault(inode->i_sb);
	ocfs2_block_signals(&oldset);

	/*
	 * The cluster locks taken will block a truncate from another
	 * node. Taking the data lock will also ensure that we don't
	 * attempt page truncation as part of a downconvert.
*/ err = ocfs2_inode_lock(inode, &di_bh, 1); if (err < 0) { mlog_errno(err); ret = vmf_error(err); goto out; } /* * The alloc sem should be enough to serialize with * ocfs2_truncate_file() changing i_size as well as any thread * modifying the inode btree. */ down_write(&OCFS2_I(inode)->ip_alloc_sem); ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, folio); up_write(&OCFS2_I(inode)->ip_alloc_sem); brelse(di_bh); ocfs2_inode_unlock(inode, 1); out: ocfs2_unblock_signals(&oldset); sb_end_pagefault(inode->i_sb); return ret; } static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, }; int ocfs2_mmap_prepare(struct vm_area_desc *desc) { struct file *file = desc->file; int ret = 0, lock_level = 0; ret = ocfs2_inode_lock_atime(file_inode(file), file->f_path.mnt, &lock_level, 1); if (ret < 0) { mlog_errno(ret); goto out; } ocfs2_inode_unlock(file_inode(file), lock_level); out: desc->vm_ops = &ocfs2_file_vm_ops; return 0; } |
| 46 8 69 4395 1968 9063 9199 9204 8 189 15057 15048 15051 14903 14910 14909 279 4617 69 69 10009 4430 5759 8051 2078 8325 1820 9090 1487 6201 6202 6201 4374 4373 4374 107 107 5870 1 5883 5870 5772 100 58 2643 4515 5315 810 5864 5873 5858 14 5875 4051 7815 4396 10 4091 10 10 63802 64270 3220 10 10 10 3222 3222 5411 5418 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 
447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/file_table.c * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> #include <linux/rcupdate.h> #include <linux/mount.h> #include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> #include <linux/sysctl.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/task_work.h> #include <linux/swap.h> #include <linux/kmemleak.h> #include <linux/atomic.h> #include "internal.h" /* sysctl tunables... 
*/
static struct files_stat_struct files_stat = {
	.max_files = NR_FILE
};

/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __ro_after_init;
/* SLAB cache for struct backing_file containers */
static struct kmem_cache *bfilp_cachep __ro_after_init;

/* Number of accounted files, i.e. those without FMODE_NOACCOUNT */
static struct percpu_counter nr_files __cacheline_aligned_in_smp;

/* Container for backing file with optional user path */
struct backing_file {
	struct file file;
	union {
		struct path user_path;
		freeptr_t bf_freeptr;
	};
};

/* Map a struct file embedded in a struct backing_file to its container */
#define backing_file(f) container_of(f, struct backing_file, file)

/* Return the user path stored in the backing_file container of @f */
const struct path *backing_file_user_path(const struct file *f)
{
	return &backing_file(f)->user_path;
}
EXPORT_SYMBOL_GPL(backing_file_user_path);

/* Copy @path into the backing_file container of @f */
void backing_file_set_user_path(struct file *f, const struct path *path)
{
	backing_file(f)->user_path = *path;
}
EXPORT_SYMBOL_GPL(backing_file_set_user_path);

/*
 * Release a struct file: free its security state, undo the nr_files
 * accounting unless the file is FMODE_NOACCOUNT, drop the cred reference
 * and return the memory to the cache it came from. For FMODE_BACKING files
 * the user path is put as well.
 */
static inline void file_free(struct file *f)
{
	security_file_free(f);
	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
		percpu_counter_dec(&nr_files);
	put_cred(f->f_cred);
	if (unlikely(f->f_mode & FMODE_BACKING)) {
		path_put(backing_file_user_path(f));
		kmem_cache_free(bfilp_cachep, backing_file(f));
	} else {
		kmem_cache_free(filp_cachep, f);
	}
}

/*
 * Return the total number of open files in the system
 */
static long get_nr_files(void)
{
	return percpu_counter_read_positive(&nr_files);
}

/*
 * Return the maximum number of open files in the system
 */
unsigned long get_max_files(void)
{
	return files_stat.max_files;
}
EXPORT_SYMBOL_GPL(get_max_files);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * Handle nr_files sysctl
 *
 * files_stat.nr_files is refreshed from the summed per-cpu counter before
 * the request is passed on to proc_doulongvec_minmax().
 */
static int proc_nr_files(const struct ctl_table *table, int write, void *buffer,
			 size_t *lenp, loff_t *ppos)
{
	files_stat.nr_files = percpu_counter_sum_positive(&nr_files);
	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/* Sysctl entries registered under "fs" by init_fs_stat_sysctls() */
static const struct ctl_table fs_stat_sysctls[] = {
	{
		.procname	= "file-nr",
		.data		= &files_stat,
		.maxlen		= sizeof(files_stat),
		.mode		= 0444,
		.proc_handler	= proc_nr_files,
	},
	{
		.procname	= "file-max",
		.data		=
&files_stat.max_files,
		.maxlen		= sizeof(files_stat.max_files),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= SYSCTL_LONG_ZERO,
		.extra2		= SYSCTL_LONG_MAX,
	},
	{
		.procname	= "nr_open",
		.data		= &sysctl_nr_open,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
		.extra1		= &sysctl_nr_open_min,
		.extra2		= &sysctl_nr_open_max,
	},
};

/*
 * Register fs_stat_sysctls under "fs" and, when CONFIG_BINFMT_MISC is
 * enabled, the "fs/binfmt_misc" sysctl mount point. Always returns 0.
 */
static int __init init_fs_stat_sysctls(void)
{
	register_sysctl_init("fs", fs_stat_sysctls);
	if (IS_ENABLED(CONFIG_BINFMT_MISC)) {
		struct ctl_table_header *hdr;

		hdr = register_sysctl_mount_point("fs/binfmt_misc");
		/* hdr is not kept anywhere; tell kmemleak that this is fine */
		kmemleak_not_leak(hdr);
	}
	return 0;
}
fs_initcall(init_fs_stat_sysctls);
#endif

/*
 * init_file() - bring a freshly allocated struct file into its initial state
 * @f:		the file to initialize
 * @flags:	open flags, stored in f_flags and used to derive f_mode
 * @cred:	credentials to take a reference on for f_cred
 *
 * Returns 0 on success. If security_file_alloc() fails, the cred reference
 * is dropped again and its error is returned.
 */
static int init_file(struct file *f, int flags, const struct cred *cred)
{
	int error;

	f->f_cred = get_cred(cred);
	error = security_file_alloc(f);
	if (unlikely(error)) {
		put_cred(f->f_cred);
		return error;
	}

	spin_lock_init(&f->f_lock);
	/*
	 * Note that f_pos_lock is only used for files raising
	 * FMODE_ATOMIC_POS and directories. Other files such as pipes
	 * don't need it and since f_pos_lock is in a union may reuse
	 * the space for other purposes. They are expected to initialize
	 * the respective member when opening the file.
	 */
	mutex_init(&f->f_pos_lock);
	memset(&f->__f_path, 0, sizeof(f->f_path));
	memset(&f->f_ra, 0, sizeof(f->f_ra));

	f->f_flags	= flags;
	f->f_mode	= OPEN_FMODE(flags);
	/*
	 * Disable permission and pre-content events for all files by default.
	 * They may be enabled later by fsnotify_open_perm_and_set_mode().
	 */
	file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM);

	f->f_op		= NULL;
	f->f_mapping	= NULL;
	f->private_data = NULL;
	f->f_inode	= NULL;
	f->f_owner	= NULL;
#ifdef CONFIG_EPOLL
	f->f_ep		= NULL;
#endif

	f->f_iocb_flags = 0;
	f->f_pos	= 0;
	f->f_wb_err	= 0;
	f->f_sb_err	= 0;

	/*
	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_ref last. While
	 * fget-rcu pattern users need to be able to handle spurious
	 * refcount bumps we should reinitialize the reused file first.
*/ file_ref_init(&f->f_ref, 1); return 0; } /* Find an unused file structure and return a pointer to it. * Returns an error pointer if some error happend e.g. we over file * structures limit, run out of memory or operation is not permitted. * * Be very careful using this. You are responsible for * getting write access to any mount that you might assign * to this filp, if it is opened for write. If this is not * done, you will imbalance int the mount's writer count * and a warning at __fput() time. */ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; int error; /* * Privileged users can go above max_files */ if (unlikely(get_nr_files() >= files_stat.max_files) && !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. */ if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) goto over; } f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); error = init_file(f, flags, cred); if (unlikely(error)) { kmem_cache_free(filp_cachep, f); return ERR_PTR(error); } percpu_counter_inc(&nr_files); return f; over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } return ERR_PTR(-ENFILE); } /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * * This is only for kernel internal use, and the allocate file must not be * installed into file tables or such. 
*/
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
	struct file *f;
	int error;

	f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
	if (unlikely(!f))
		return ERR_PTR(-ENOMEM);

	error = init_file(f, flags, cred);
	if (unlikely(error)) {
		kmem_cache_free(filp_cachep, f);
		return ERR_PTR(error);
	}

	/* tag the file as not counted in nr_files */
	f->f_mode |= FMODE_NOACCOUNT;

	return f;
}

/*
 * Variant of alloc_empty_file() that allocates a backing_file container
 * and doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocated file must not be
 * installed into file tables or such.
 *
 * The returned pointer is the struct file embedded in the backing_file,
 * which comes from bfilp_cachep rather than filp_cachep.
 */
struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
{
	struct backing_file *ff;
	int error;

	ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
	if (unlikely(!ff))
		return ERR_PTR(-ENOMEM);

	error = init_file(&ff->file, flags, cred);
	if (unlikely(error)) {
		kmem_cache_free(bfilp_cachep, ff);
		return ERR_PTR(error);
	}

	ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
	return &ff->file;
}
EXPORT_SYMBOL_GPL(alloc_empty_backing_file);

/**
 * file_init_path - initialize a 'struct file' based on path
 *
 * @file: the file to set up
 * @path: the (dentry, vfsmount) pair for the new file
 * @fop: the 'struct file_operations' for the new file
 *
 * @path is copied by value; no additional reference is taken here.
 */
static void file_init_path(struct file *file, const struct path *path,
			   const struct file_operations *fop)
{
	file->__f_path = *path;
	file->f_inode = path->dentry->d_inode;
	file->f_mapping = path->dentry->d_inode->i_mapping;
	file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
	file->f_sb_err = file_sample_sb_err(file);
	/* derive capability bits from the callbacks @fop provides */
	if (fop->llseek)
		file->f_mode |= FMODE_LSEEK;
	if ((file->f_mode & FMODE_READ) &&
	     likely(fop->read || fop->read_iter))
		file->f_mode |= FMODE_CAN_READ;
	if ((file->f_mode & FMODE_WRITE) &&
	     likely(fop->write || fop->write_iter))
		file->f_mode |= FMODE_CAN_WRITE;
	file->f_iocb_flags = iocb_flags(file);
	file->f_mode |= FMODE_OPENED;
	file->f_op = fop;
	/* read-only open (FMODE_READ set, FMODE_WRITE clear) */
	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(path->dentry->d_inode);
}

/**
 * alloc_file - allocate and initialize a 'struct file'
 *
 * @path: the (dentry, vfsmount) pair for the new file
 * @flags: O_... flags with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 *
 * Returns the new file, or the error pointer from alloc_empty_file(), in
 * which case @path is left untouched.
 */
static struct file *alloc_file(const struct path *path, int flags,
		const struct file_operations *fop)
{
	struct file *file;

	file = alloc_empty_file(flags, current_cred());
	if (!IS_ERR(file))
		file_init_path(file, path, fop);
	return file;
}

/*
 * Fill @path with a new pseudo dentry named @name on @mnt's superblock,
 * instantiated with @inode, plus a reference on @mnt.
 *
 * Returns 0 on success or -ENOMEM if the dentry cannot be allocated; in the
 * failure case no mount reference has been taken.
 */
static inline int alloc_path_pseudo(const char *name, struct inode *inode,
				    struct vfsmount *mnt, struct path *path)
{
	path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name));
	if (!path->dentry)
		return -ENOMEM;
	path->mnt = mntget(mnt);
	d_instantiate(path->dentry, inode);
	return 0;
}

/*
 * Allocate an accounted file for @inode behind a pseudo dentry on @mnt.
 *
 * Returns the new file with all fsnotify events disabled, or an error
 * pointer.
 */
struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
			       const char *name, int flags,
			       const struct file_operations *fops)
{
	int ret;
	struct path path;
	struct file *file;

	ret = alloc_path_pseudo(name, inode, mnt, &path);
	if (ret)
		return ERR_PTR(ret);

	file = alloc_file(&path, flags, fops);
	if (IS_ERR(file)) {
		/*
		 * NOTE(review): presumably balances the inode reference that
		 * dropping the instantiated dentry in path_put() releases, so
		 * the caller keeps its own reference on failure - confirm
		 * against d_instantiate()/dput().
		 */
		ihold(inode);
		path_put(&path);
		return file;
	}
	/*
	 * Disable all fsnotify events for pseudo files by default.
	 * They may be enabled by caller with file_set_fsnotify_mode().
	 */
	file_set_fsnotify_mode(file, FMODE_NONOTIFY);
	return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);

/*
 * Same as alloc_file_pseudo(), but the file comes from
 * alloc_empty_file_noaccount() and is therefore not counted in nr_files.
 */
struct file *alloc_file_pseudo_noaccount(struct inode *inode,
					 struct vfsmount *mnt, const char *name,
					 int flags,
					 const struct file_operations *fops)
{
	int ret;
	struct path path;
	struct file *file;

	ret = alloc_path_pseudo(name, inode, mnt, &path);
	if (ret)
		return ERR_PTR(ret);

	file = alloc_empty_file_noaccount(flags, current_cred());
	if (IS_ERR(file)) {
		/* same error unwinding as in alloc_file_pseudo() */
		ihold(inode);
		path_put(&path);
		return file;
	}
	file_init_path(file, &path, fops);
	/*
	 * Disable all fsnotify events for pseudo files by default.
	 * They may be enabled by caller with file_set_fsnotify_mode().
*/
	file_set_fsnotify_mode(file, FMODE_NONOTIFY);
	return file;
}
EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);

/*
 * Allocate a new file that refers to the same path as @base, opened with
 * @flags and using @fops.
 *
 * Returns the new file or the error pointer from alloc_file().
 */
struct file *alloc_file_clone(struct file *base, int flags,
				const struct file_operations *fops)
{
	struct file *f;

	f = alloc_file(&base->f_path, flags, fops);
	if (!IS_ERR(f)) {
		/* the path was copied by value, so take a reference for @f */
		path_get(&f->f_path);
		f->f_mapping = base->f_mapping;
	}
	return f;
}

/* the real guts of fput() - releasing the last reference to file */
static void __fput(struct file *file)
{
	/* snapshot these up front; they are used after the release hooks ran */
	struct dentry *dentry = file->f_path.dentry;
	struct vfsmount *mnt = file->f_path.mnt;
	struct inode *inode = file->f_inode;
	fmode_t mode = file->f_mode;

	/* never opened: nothing to tear down except the file itself */
	if (unlikely(!(file->f_mode & FMODE_OPENED)))
		goto out;

	might_sleep();

	fsnotify_close(file);
	/*
	 * The function eventpoll_release() should be the first called
	 * in the file cleanup chain.
	 */
	eventpoll_release(file);
	locks_remove_file(file);

	security_file_release(file);
	if (unlikely(file->f_flags & FASYNC)) {
		if (file->f_op->fasync)
			file->f_op->fasync(-1, file, 0);
	}
	if (file->f_op->release)
		file->f_op->release(inode, file);
	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
		     !(mode & FMODE_PATH))) {
		cdev_put(inode->i_cdev);
	}
	fops_put(file->f_op);
	file_f_owner_release(file);
	put_file_access(file);
	dput(dentry);
	if (unlikely(mode & FMODE_NEED_UNMOUNT))
		dissolve_on_fput(mnt);
	mntput(mnt);
out:
	file_free(file);
}

static LLIST_HEAD(delayed_fput_list);

/*
 * Work handler: detach everything currently on delayed_fput_list and run
 * __fput() on each file. The _safe iterator is needed because __fput() ends
 * in file_free() on the entry being visited.
 */
static void delayed_fput(struct work_struct *unused)
{
	struct llist_node *node = llist_del_all(&delayed_fput_list);
	struct file *f, *t;

	llist_for_each_entry_safe(f, t, node, f_llist)
		__fput(f);
}

/* task_work callback: map the embedded f_task_work back to its file */
static void ____fput(struct callback_head *work)
{
	__fput(container_of(work, struct file, f_task_work));
}

static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);

/*
 * If kernel thread really needs to have the final fput() it has done
 * to complete, call this.
The only user right now is the boot - we * *do* need to make sure our writes to binaries on initramfs has * not left us with opened struct file waiting for __fput() - execve() * won't work without that. Please, don't add more callers without * very good reasons; in particular, never call that with locks * held and never call that from a thread that might need to do * some work on any kind of umount. */ void flush_delayed_fput(void) { delayed_fput(NULL); flush_delayed_work(&delayed_fput_work); } EXPORT_SYMBOL_GPL(flush_delayed_fput); static void __fput_deferred(struct file *file) { struct task_struct *task = current; if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { file_free(file); return; } if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_task_work, ____fput); if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; /* * After this task has run exit_task_work(), * task_work_add() will fail. Fall through to delayed * fput to avoid leaking *file. */ } if (llist_add(&file->f_llist, &delayed_fput_list)) schedule_delayed_work(&delayed_fput_work, 1); } void fput(struct file *file) { if (unlikely(file_ref_put(&file->f_ref))) __fput_deferred(file); } EXPORT_SYMBOL(fput); /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without * risking deadlocks), need to wait for completion of __fput() and know * for this specific struct file it won't involve anything that would * need them. Use only if you really need it - at the very least, * don't blindly convert fput() by kernel thread to that. */ void __fput_sync(struct file *file) { if (file_ref_put(&file->f_ref)) __fput(file); } EXPORT_SYMBOL(__fput_sync); /* * Equivalent to __fput_sync(), but optimized for being called with the last * reference. * * See file_ref_put_close() for details. 
*/ void fput_close_sync(struct file *file) { if (likely(file_ref_put_close(&file->f_ref))) __fput(file); } /* * Equivalent to fput(), but optimized for being called with the last * reference. * * See file_ref_put_close() for details. */ void fput_close(struct file *file) { if (file_ref_put_close(&file->f_ref)) __fput_deferred(file); } void __init files_init(void) { struct kmem_cache_args args = { .use_freeptr_offset = true, .freeptr_offset = offsetof(struct file, f_freeptr), }; filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); args.freeptr_offset = offsetof(struct backing_file, bf_freeptr); bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file), &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } /* * One file with associated inode and dcache is very roughly 1K. Per default * do not use more than 10% of our memory for files. */ void __init files_maxfiles_init(void) { unsigned long n; unsigned long nr_pages = totalram_pages(); unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; memreserve = min(memreserve, nr_pages - 1); n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); } |
| 15 15 15 15 15 15 11 12 15 11 4 1 1 1 1 1 4 11 8 1 1 1 11 15 4 11 15 15 4 15 1 4 4 4 11 6 1 1 1 1 1 8 1 2 11 15 4 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 
497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 
997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 
1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 
1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 |
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Realtek RTL28xxU DVB USB driver
 *
 * Copyright (C) 2009 Antti Palosaari <crope@iki.fi>
 * Copyright (C) 2011 Antti Palosaari <crope@iki.fi>
 * Copyright (C) 2012 Thomas Mair <thomas.mair86@googlemail.com>
 */

#include "rtl28xxu.h"

static int rtl28xxu_disable_rc;
module_param_named(disable_rc, rtl28xxu_disable_rc, int, 0644);
MODULE_PARM_DESC(disable_rc, "disable RTL2832U remote controller");

DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr);

/*
 * Issue one vendor control transfer described by @req, serialised by
 * d->usb_mutex. Payload is staged through dev->buf, so requests larger than
 * that buffer are rejected with -EINVAL. CMD_WR_FLAG in req->index selects a
 * write (data copied from req->data) as opposed to a read.
 *
 * Returns 0 on success or a negative error code.
 */
static int rtl28xxu_ctrl_msg(struct dvb_usb_device *d, struct rtl28xxu_req *req)
{
	struct rtl28xxu_dev *dev = d->priv;
	int ret;
	unsigned int pipe;
	u8 requesttype;

	mutex_lock(&d->usb_mutex);

	if (req->size > sizeof(dev->buf)) {
		dev_err(&d->intf->dev, "too large message %u\n", req->size);
		ret = -EINVAL;
		goto err_mutex_unlock;
	}

	if (req->index & CMD_WR_FLAG) {
		/* write */
		memcpy(dev->buf, req->data, req->size);
		requesttype =
(USB_TYPE_VENDOR | USB_DIR_OUT);
		pipe = usb_sndctrlpipe(d->udev, 0);
	} else {
		/* read */
		requesttype = (USB_TYPE_VENDOR | USB_DIR_IN);

		/*
		 * Zero-length transfers must use usb_sndctrlpipe() and
		 * rtl28xxu_identify_state() uses a zero-length i2c read
		 * command to determine the chip type.
		 */
		if (req->size)
			pipe = usb_rcvctrlpipe(d->udev, 0);
		else
			pipe = usb_sndctrlpipe(d->udev, 0);
	}

	ret = usb_control_msg(d->udev, pipe, 0, requesttype, req->value,
			req->index, dev->buf, req->size, 1000);
	dvb_usb_dbg_usb_control_msg(d->udev, 0, requesttype, req->value,
			req->index, dev->buf, req->size);
	if (ret < 0)
		goto err_mutex_unlock;

	/*
	 * read request, copy returned data to return buf
	 *
	 * NOTE(review): always copies req->size bytes; ret is only checked
	 * for being negative - confirm a short read cannot happen here.
	 */
	if (requesttype == (USB_TYPE_VENDOR | USB_DIR_IN))
		memcpy(req->data, dev->buf, req->size);

	mutex_unlock(&d->usb_mutex);

	return 0;
err_mutex_unlock:
	mutex_unlock(&d->usb_mutex);
	dev_dbg(&d->intf->dev, "failed=%d\n", ret);
	return ret;
}

/*
 * Write @len bytes from @val starting at register @reg. The command is
 * picked from the register range: below 0x3000 CMD_USB_WR, below 0x4000
 * CMD_SYS_WR, otherwise CMD_IR_WR. Returns rtl28xxu_ctrl_msg()'s result.
 */
static int rtl28xxu_wr_regs(struct dvb_usb_device *d, u16 reg, u8 *val, int len)
{
	struct rtl28xxu_req req;

	if (reg < 0x3000)
		req.index = CMD_USB_WR;
	else if (reg < 0x4000)
		req.index = CMD_SYS_WR;
	else
		req.index = CMD_IR_WR;

	req.value = reg;
	req.size = len;
	req.data = val;

	return rtl28xxu_ctrl_msg(d, &req);
}

/*
 * Read @len bytes into @val starting at register @reg, using the same
 * register-range split as rtl28xxu_wr_regs() with the *_RD commands.
 */
static int rtl28xxu_rd_regs(struct dvb_usb_device *d, u16 reg, u8 *val, int len)
{
	struct rtl28xxu_req req;

	if (reg < 0x3000)
		req.index = CMD_USB_RD;
	else if (reg < 0x4000)
		req.index = CMD_SYS_RD;
	else
		req.index = CMD_IR_RD;

	req.value = reg;
	req.size = len;
	req.data = val;

	return rtl28xxu_ctrl_msg(d, &req);
}

/* Write the single byte @val to register @reg. */
static int rtl28xxu_wr_reg(struct dvb_usb_device *d, u16 reg, u8 val)
{
	return rtl28xxu_wr_regs(d, reg, &val, 1);
}

/* Read a single byte from register @reg into *@val. */
static int rtl28xxu_rd_reg(struct dvb_usb_device *d, u16 reg, u8 *val)
{
	return rtl28xxu_rd_regs(d, reg, val, 1);
}

/*
 * Read-modify-write of register @reg: only the bits set in @mask are
 * replaced with the corresponding bits of @val. Returns 0 or the error from
 * the underlying read/write.
 */
static int rtl28xxu_wr_reg_mask(struct dvb_usb_device *d, u16 reg, u8 val,
		u8 mask)
{
	int ret;
	u8 tmp;

	/* no need for read if whole reg is written */
	if (mask != 0xff) {
		ret = rtl28xxu_rd_reg(d, reg, &tmp);
		if (ret)
			return ret;

		val
&= mask;
		tmp &= ~mask;
		val |= tmp;
	}

	return rtl28xxu_wr_reg(d, reg, val);
}

/* I2C */
/*
 * I2C transfer handler installed as .master_xfer of rtl28xxu_i2c_algo.
 *
 * Supported message shapes are: write followed by read (num == 2), a single
 * write, and a single read. Anything else yields -EOPNOTSUPP. Runs under
 * d->i2c_mutex; returns -EAGAIN if taking the mutex is interrupted.
 *
 * Returns @num on success, otherwise a negative error code (-EPIPE from the
 * control transfer is reported as -EAGAIN).
 */
static int rtl28xxu_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msg[],
	int num)
{
	int ret;
	struct dvb_usb_device *d = i2c_get_adapdata(adap);
	struct rtl28xxu_dev *dev = d->priv;
	struct rtl28xxu_req req;

	/*
	 * It is not known which are real I2C bus xfer limits, but testing
	 * with RTL2831U + MT2060 gives max RD 24 and max WR 22 bytes.
	 * TODO: find out RTL2832U lens
	 */

	/*
	 * I2C adapter logic looks rather complicated due to fact it handles
	 * three different access methods. Those methods are;
	 * 1) integrated demod access
	 * 2) old I2C access
	 * 3) new I2C access
	 *
	 * Used method is selected in order 1, 2, 3. Method 3 can handle all
	 * requests but there are two reasons why not to use it always;
	 * 1) It is most expensive, usually two USB messages are needed
	 * 2) At least RTL2831U does not support it
	 *
	 * Method 3 is needed in case of I2C write+read (typical register read)
	 * where write is more than one byte.
	 */

	if (mutex_lock_interruptible(&d->i2c_mutex) < 0)
		return -EAGAIN;

	if (num == 2 && !(msg[0].flags & I2C_M_RD) &&
		(msg[1].flags & I2C_M_RD)) {
		/* write + read */
		if (msg[0].len > 24 || msg[1].len > 24) {
			/* TODO: check msg[0].len max */
			ret = -EOPNOTSUPP;
			goto err_mutex_unlock;
		} else if (msg[0].addr == 0x10) {
			/* buf[0] of both messages is accessed below */
			if (msg[0].len < 1 || msg[1].len < 1) {
				ret = -EOPNOTSUPP;
				goto err_mutex_unlock;
			}
			/* method 1 - integrated demod */
			if (msg[0].buf[0] == 0x00) {
				/* return demod page from driver cache */
				msg[1].buf[0] = dev->page;
				ret = 0;
			} else {
				req.value = (msg[0].buf[0] << 8) | (msg[0].addr << 1);
				req.index = CMD_DEMOD_RD | dev->page;
				req.size = msg[1].len;
				req.data = &msg[1].buf[0];
				ret = rtl28xxu_ctrl_msg(d, &req);
			}
		} else if (msg[0].len < 2) {
			if (msg[0].len < 1) {
				ret = -EOPNOTSUPP;
				goto err_mutex_unlock;
			}
			/* method 2 - old I2C */
			req.value = (msg[0].buf[0] << 8) | (msg[0].addr << 1);
			req.index = CMD_I2C_RD;
			req.size = msg[1].len;
			req.data = &msg[1].buf[0];
			ret = rtl28xxu_ctrl_msg(d, &req);
		} else {
			/* method 3 - new I2C */
			req.value = (msg[0].addr << 1);
			req.index = CMD_I2C_DA_WR;
			req.size = msg[0].len;
			req.data = msg[0].buf;
			ret = rtl28xxu_ctrl_msg(d, &req);
			if (ret)
				goto err_mutex_unlock;

			req.value = (msg[0].addr << 1);
			req.index = CMD_I2C_DA_RD;
			req.size = msg[1].len;
			req.data = msg[1].buf;
			ret = rtl28xxu_ctrl_msg(d, &req);
		}
	} else if (num == 1 && !(msg[0].flags & I2C_M_RD)) {
		/* single write */
		if (msg[0].len > 22) {
			/* TODO: check msg[0].len max */
			ret = -EOPNOTSUPP;
			goto err_mutex_unlock;
		} else if (msg[0].addr == 0x10) {
			if (msg[0].len < 1) {
				ret = -EOPNOTSUPP;
				goto err_mutex_unlock;
			}
			/* method 1 - integrated demod */
			if (msg[0].buf[0] == 0x00) {
				/* buf[1] carries the page value */
				if (msg[0].len < 2) {
					ret = -EOPNOTSUPP;
					goto err_mutex_unlock;
				}
				/* save demod page for later demod access */
				dev->page = msg[0].buf[1];
				ret = 0;
			} else {
				/* buf[0] is the register, the rest is payload */
				req.value = (msg[0].buf[0] << 8) | (msg[0].addr << 1);
				req.index = CMD_DEMOD_WR | dev->page;
				req.size = msg[0].len-1;
				req.data = &msg[0].buf[1];
				ret = rtl28xxu_ctrl_msg(d, &req);
			}
		} else if ((msg[0].len < 23) && (!dev->new_i2c_write)) {
			/*
			 * len > 22 was already rejected above, so this branch
			 * is effectively selected by !dev->new_i2c_write.
			 */
			if (msg[0].len < 1) {
				ret = -EOPNOTSUPP;
				goto err_mutex_unlock;
			}
			/* method 2 - old I2C */
			req.value = (msg[0].buf[0] << 8) | (msg[0].addr << 1);
			req.index = CMD_I2C_WR;
			req.size = msg[0].len-1;
			req.data = &msg[0].buf[1];
			ret = rtl28xxu_ctrl_msg(d, &req);
		} else {
			/* method 3 - new I2C */
			req.value = (msg[0].addr << 1);
			req.index = CMD_I2C_DA_WR;
			req.size = msg[0].len;
			req.data = msg[0].buf;
			ret = rtl28xxu_ctrl_msg(d, &req);
		}
	} else if (num == 1 && (msg[0].flags & I2C_M_RD)) {
		/* single read */
		req.value = (msg[0].addr << 1);
		req.index = CMD_I2C_DA_RD;
		req.size = msg[0].len;
		req.data = msg[0].buf;
		ret = rtl28xxu_ctrl_msg(d, &req);
	} else {
		ret = -EOPNOTSUPP;
	}

	/* Retry failed I2C messages */
	if (ret == -EPIPE)
		ret = -EAGAIN;

err_mutex_unlock:
	mutex_unlock(&d->i2c_mutex);

	return ret ?
ret : num;
}

/* Report the adapter's capabilities: I2C_FUNC_I2C only. */
static u32 rtl28xxu_i2c_func(struct i2c_adapter *adapter)
{
	return I2C_FUNC_I2C;
}

static const struct i2c_algorithm rtl28xxu_i2c_algo = {
	.master_xfer   = rtl28xxu_i2c_xfer,
	.functionality = rtl28xxu_i2c_func,
};

/*
 * Set up the RTL2831U GPIOs and probe which tuner is fitted, recording the
 * result in dev->tuner / dev->tuner_name. If neither the QT1010 nor the
 * MT2060 ID matches, MXL5005S is assumed.
 *
 * Returns 0 on success or a negative error code from a register write or
 * from opening the I2C gate.
 */
static int rtl2831u_read_config(struct dvb_usb_device *d)
{
	struct rtl28xxu_dev *dev = d_to_priv(d);
	int ret;
	u8 buf[1];
	/* open RTL2831U/RTL2830 I2C gate */
	struct rtl28xxu_req req_gate_open = {0x0120, 0x0011, 0x0001, "\x08"};
	/* tuner probes */
	struct rtl28xxu_req req_mt2060 = {0x00c0, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_qt1010 = {0x0fc4, CMD_I2C_RD, 1, buf};

	dev_dbg(&d->intf->dev, "\n");

	/*
	 * RTL2831U GPIOs
	 * =========================================================
	 * GPIO0 | tuner#0 | 0 off | 1 on  | MXL5005S (?)
	 * GPIO2 | LED     | 0 off | 1 on  |
	 * GPIO4 | tuner#1 | 0 on  | 1 off | MT2060
	 */

	/* GPIO direction */
	ret = rtl28xxu_wr_reg(d, SYS_GPIO_DIR, 0x0a);
	if (ret)
		goto err;

	/* enable as output GPIO0, GPIO2, GPIO4 */
	ret = rtl28xxu_wr_reg(d, SYS_GPIO_OUT_EN, 0x15);
	if (ret)
		goto err;

	/*
	 * Probe used tuner. We need to know used tuner before demod attach
	 * since there is some demod params needed to set according to tuner.
	 */

	/* demod needs some time to wake up */
	msleep(20);

	dev->tuner_name = "NONE";

	/* open demod I2C gate */
	ret = rtl28xxu_ctrl_msg(d, &req_gate_open);
	if (ret)
		goto err;

	/* check QT1010 ID(?)
register; reg=0f val=2c */
	ret = rtl28xxu_ctrl_msg(d, &req_qt1010);
	if (ret == 0 && buf[0] == 0x2c) {
		dev->tuner = TUNER_RTL2830_QT1010;
		dev->tuner_name = "QT1010";
		goto found;
	}

	/* open demod I2C gate (opened again before the second probe) */
	ret = rtl28xxu_ctrl_msg(d, &req_gate_open);
	if (ret)
		goto err;

	/* check MT2060 ID register; reg=00 val=63 */
	ret = rtl28xxu_ctrl_msg(d, &req_mt2060);
	if (ret == 0 && buf[0] == 0x63) {
		dev->tuner = TUNER_RTL2830_MT2060;
		dev->tuner_name = "MT2060";
		goto found;
	}

	/* assume MXL5005S */
	dev->tuner = TUNER_RTL2830_MXL5005S;
	dev->tuner_name = "MXL5005S";
	goto found;

found:
	dev_dbg(&d->intf->dev, "tuner=%s\n", dev->tuner_name);

	return 0;
err:
	dev_dbg(&d->intf->dev, "failed=%d\n", ret);
	return ret;
}

/*
 * Set up the RTL2832U GPIOs, probe the tuner by reading ID registers through
 * the demod I2C gate, and then probe for a slave demodulator for the tuners
 * that may have one. Results go to dev->tuner, dev->tuner_name and
 * dev->slave_demod. A failed or non-matching probe is not an error; the next
 * candidate is simply tried.
 *
 * Returns 0 on success or a negative error code from a GPIO register access
 * or from opening/closing the I2C gate.
 */
static int rtl2832u_read_config(struct dvb_usb_device *d)
{
	struct rtl28xxu_dev *dev = d_to_priv(d);
	int ret;
	u8 buf[2];
	/* open RTL2832U/RTL2832 I2C gate */
	struct rtl28xxu_req req_gate_open = {0x0120, 0x0011, 0x0001, "\x18"};
	/* close RTL2832U/RTL2832 I2C gate */
	struct rtl28xxu_req req_gate_close = {0x0120, 0x0011, 0x0001, "\x10"};
	/* tuner probes */
	struct rtl28xxu_req req_fc0012 = {0x00c6, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_fc0013 = {0x00c6, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_mt2266 = {0x00c0, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_fc2580 = {0x01ac, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_mt2063 = {0x00c0, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_max3543 = {0x00c0, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_tua9001 = {0x7ec0, CMD_I2C_RD, 2, buf};
	struct rtl28xxu_req req_mxl5007t = {0xd9c0, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_e4000 = {0x02c8, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_tda18272 = {0x00c0, CMD_I2C_RD, 2, buf};
	struct rtl28xxu_req req_r820t = {0x0034, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_r828d = {0x0074, CMD_I2C_RD, 1, buf};
	/* slave demod probes */
	struct rtl28xxu_req req_mn88472 = {0xff38, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_mn88473 = {0xff38, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_cxd2837er = {0xfdd8, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_si2157 = {0x00c0, CMD_I2C_RD, 1, buf};
	struct rtl28xxu_req req_si2168 = {0x00c8, CMD_I2C_RD, 1, buf};

	dev_dbg(&d->intf->dev, "\n");

	/* enable GPIO3 and GPIO6 as output */
	ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_DIR, 0x00, 0x40);
	if (ret)
		goto err;

	ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_EN, 0x48, 0x48);
	if (ret)
		goto err;

	/*
	 * Probe used tuner. We need to know used tuner before demod attach
	 * since there is some demod params needed to set according to tuner.
	 */

	/* open demod I2C gate */
	ret = rtl28xxu_ctrl_msg(d, &req_gate_open);
	if (ret)
		goto err;

	dev->tuner_name = "NONE";

	/* check FC0012 ID register; reg=00 val=a1 */
	ret = rtl28xxu_ctrl_msg(d, &req_fc0012);
	if (ret == 0 && buf[0] == 0xa1) {
		dev->tuner = TUNER_RTL2832_FC0012;
		dev->tuner_name = "FC0012";
		goto tuner_found;
	}

	/* check FC0013 ID register; reg=00 val=a3 */
	ret = rtl28xxu_ctrl_msg(d, &req_fc0013);
	if (ret == 0 && buf[0] == 0xa3) {
		dev->tuner = TUNER_RTL2832_FC0013;
		dev->tuner_name = "FC0013";
		goto tuner_found;
	}

	/* check MT2266 ID register; reg=00 val=85 */
	ret = rtl28xxu_ctrl_msg(d, &req_mt2266);
	if (ret == 0 && buf[0] == 0x85) {
		dev->tuner = TUNER_RTL2832_MT2266;
		dev->tuner_name = "MT2266";
		goto tuner_found;
	}

	/* check FC2580 ID register; reg=01 val=56 */
	ret = rtl28xxu_ctrl_msg(d, &req_fc2580);
	if (ret == 0 && buf[0] == 0x56) {
		dev->tuner = TUNER_RTL2832_FC2580;
		dev->tuner_name = "FC2580";
		goto tuner_found;
	}

	/* check MT2063 ID register; reg=00 val=9e || 9c */
	ret = rtl28xxu_ctrl_msg(d, &req_mt2063);
	if (ret == 0 && (buf[0] == 0x9e || buf[0] == 0x9c)) {
		dev->tuner = TUNER_RTL2832_MT2063;
		dev->tuner_name = "MT2063";
		goto tuner_found;
	}

	/* check MAX3543 ID register; reg=00 val=38 */
	ret = rtl28xxu_ctrl_msg(d, &req_max3543);
	if (ret == 0 && buf[0] == 0x38) {
		dev->tuner = TUNER_RTL2832_MAX3543;
		dev->tuner_name = "MAX3543";
		goto tuner_found;
	}

	/* check TUA9001 ID register; reg=7e val=2328 */
	ret = rtl28xxu_ctrl_msg(d, &req_tua9001);
	if (ret == 0 && buf[0] == 0x23 && buf[1] == 0x28) {
		dev->tuner = TUNER_RTL2832_TUA9001;
		dev->tuner_name = "TUA9001";
		goto tuner_found;
	}

	/* check MXL5007T ID register; reg=d9 val=14 */
	ret = rtl28xxu_ctrl_msg(d, &req_mxl5007t);
	if (ret == 0 && buf[0] == 0x14) {
		dev->tuner = TUNER_RTL2832_MXL5007T;
		dev->tuner_name = "MXL5007T";
		goto tuner_found;
	}

	/* check E4000 ID register; reg=02 val=40 */
	ret = rtl28xxu_ctrl_msg(d, &req_e4000);
	if (ret == 0 && buf[0] == 0x40) {
		dev->tuner = TUNER_RTL2832_E4000;
		dev->tuner_name = "E4000";
		goto tuner_found;
	}

	/*
	 * check TDA18272 ID register; reg=00 val=c760
	 *
	 * NOTE(review): the expected value names both bytes but the test
	 * accepts a match on either one (||), unlike the TUA9001 probe
	 * above which requires both - confirm this is intentional.
	 */
	ret = rtl28xxu_ctrl_msg(d, &req_tda18272);
	if (ret == 0 && (buf[0] == 0xc7 || buf[1] == 0x60)) {
		dev->tuner = TUNER_RTL2832_TDA18272;
		dev->tuner_name = "TDA18272";
		goto tuner_found;
	}

	/* check R820T ID register; reg=00 val=69 */
	ret = rtl28xxu_ctrl_msg(d, &req_r820t);
	if (ret == 0 && buf[0] == 0x69) {
		dev->tuner = TUNER_RTL2832_R820T;
		dev->tuner_name = "R820T";
		goto tuner_found;
	}

	/* check R828D ID register; reg=00 val=69 */
	ret = rtl28xxu_ctrl_msg(d, &req_r828d);
	if (ret == 0 && buf[0] == 0x69) {
		dev->tuner = TUNER_RTL2832_R828D;
		dev->tuner_name = "R828D";
		goto tuner_found;
	}

	/* GPIO0 and GPIO5 to reset Si2157/Si2168 tuner and demod */
	ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_VAL, 0x00, 0x21);
	if (ret)
		goto err;

	ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_EN, 0x00, 0x21);
	if (ret)
		goto err;

	msleep(50);

	ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_VAL, 0x21, 0x21);
	if (ret)
		goto err;

	ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_EN, 0x21, 0x21);
	if (ret)
		goto err;

	msleep(50);

	/* check Si2157 ID register; reg=c0 val=80 */
	ret = rtl28xxu_ctrl_msg(d, &req_si2157);
	if (ret == 0 && ((buf[0] & 0x80) == 0x80)) {
		dev->tuner = TUNER_RTL2832_SI2157;
		dev->tuner_name = "SI2157";
		goto tuner_found;
	}

	/* no probe matched: dev->tuner is left as it was, name stays "NONE" */
tuner_found:
	dev_dbg(&d->intf->dev, "tuner=%s\n", dev->tuner_name);

	/* probe slave demod */
	if (dev->tuner == TUNER_RTL2832_R828D) {
		/* power off slave demod on GPIO0 to reset CXD2837ER */
		ret = rtl28xxu_wr_reg_mask(d,
SYS_GPIO_OUT_VAL, 0x00, 0x01);
		if (ret)
			goto err;

		ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_EN, 0x00, 0x01);
		if (ret)
			goto err;

		msleep(50);

		/* power on slave demod on GPIO0 */
		ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_VAL, 0x01, 0x01);
		if (ret)
			goto err;

		ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_DIR, 0x00, 0x01);
		if (ret)
			goto err;

		ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_EN, 0x01, 0x01);
		if (ret)
			goto err;

		/* slave demod needs some time to wake up */
		msleep(20);

		/* check slave answers */
		ret = rtl28xxu_ctrl_msg(d, &req_mn88472);
		if (ret == 0 && buf[0] == 0x02) {
			dev_dbg(&d->intf->dev, "MN88472 found\n");
			dev->slave_demod = SLAVE_DEMOD_MN88472;
			goto demod_found;
		}

		ret = rtl28xxu_ctrl_msg(d, &req_mn88473);
		if (ret == 0 && buf[0] == 0x03) {
			dev_dbg(&d->intf->dev, "MN88473 found\n");
			dev->slave_demod = SLAVE_DEMOD_MN88473;
			goto demod_found;
		}

		ret = rtl28xxu_ctrl_msg(d, &req_cxd2837er);
		if (ret == 0 && buf[0] == 0xb1) {
			dev_dbg(&d->intf->dev, "CXD2837ER found\n");
			dev->slave_demod = SLAVE_DEMOD_CXD2837ER;
			goto demod_found;
		}
	}
	if (dev->tuner == TUNER_RTL2832_SI2157) {
		/* check Si2168 ID register; reg=c8 val=80 */
		ret = rtl28xxu_ctrl_msg(d, &req_si2168);
		if (ret == 0 && ((buf[0] & 0x80) == 0x80)) {
			dev_dbg(&d->intf->dev, "Si2168 found\n");
			dev->slave_demod = SLAVE_DEMOD_SI2168;
			goto demod_found;
		}
	}

demod_found:
	/* close demod I2C gate */
	ret = rtl28xxu_ctrl_msg(d, &req_gate_close);
	if (ret < 0)
		goto err;

	return 0;
err:
	dev_dbg(&d->intf->dev, "failed=%d\n", ret);
	return ret;
}

/*
 * Read the board configuration by dispatching to the chip specific
 * implementation selected by dev->chip_id.
 */
static int rtl28xxu_read_config(struct dvb_usb_device *d)
{
	struct rtl28xxu_dev *dev = d_to_priv(d);

	if (dev->chip_id == CHIP_ID_RTL2831U)
		return rtl2831u_read_config(d);
	else
		return rtl2832u_read_config(d);
}

/*
 * Detect the chip variant and store it in dev->chip_id.
 *
 * A CMD_I2C_DA_RD request is sent: -EPIPE selects CHIP_ID_RTL2831U, a zero
 * return selects CHIP_ID_RTL2832U and any other result is logged and
 * returned to the caller. On success the I2C adapter retry count and
 * timeout are set and WARM is returned. The @name argument is not used.
 */
static int rtl28xxu_identify_state(struct dvb_usb_device *d, const char **name)
{
	struct rtl28xxu_dev *dev = d_to_priv(d);
	int ret;
	struct rtl28xxu_req req_demod_i2c = {0x0020, CMD_I2C_DA_RD, 0, NULL};

	dev_dbg(&d->intf->dev, "\n");

	/*
	 * Detect chip type using I2C command that is not supported
	 * by old RTL2831U.
	 */
	ret = rtl28xxu_ctrl_msg(d, &req_demod_i2c);
	if (ret == -EPIPE) {
		dev->chip_id = CHIP_ID_RTL2831U;
	} else if (ret == 0) {
		dev->chip_id = CHIP_ID_RTL2832U;
	} else {
		dev_err(&d->intf->dev, "chip type detection failed %d\n", ret);
		goto err;
	}
	dev_dbg(&d->intf->dev, "chip_id=%u\n", dev->chip_id);

	/* Retry failed I2C messages */
	d->i2c_adap.retries = 3;
	d->i2c_adap.timeout = msecs_to_jiffies(10);

	return WARM;
err:
	dev_dbg(&d->intf->dev, "failed=%d\n", ret);
	return ret;
}

/* RTL2830 demodulator settings used with the MT2060 tuner */
static const struct rtl2830_platform_data rtl2830_mt2060_platform_data = {
	.clk = 28800000,
	.spec_inv = 1,
	.vtop = 0x20,
	.krf = 0x04,
	.agc_targ_val = 0x2d,
};

/* RTL2830 demodulator settings used with the QT1010 tuner */
static const struct rtl2830_platform_data rtl2830_qt1010_platform_data = {
	.clk = 28800000,
	.spec_inv = 1,
	.vtop = 0x20,
	.krf = 0x04,
	.agc_targ_val = 0x2d,
};

/* RTL2830 demodulator settings used with the MXL5005S tuner */
static const struct rtl2830_platform_data rtl2830_mxl5005s_platform_data = {
	.clk = 28800000,
	.spec_inv = 0,
	.vtop = 0x3f,
	.krf = 0x04,
	.agc_targ_val = 0x3e,
};

/*
 * Attach the RTL2830 demodulator for an RTL2831U device.
 *
 * The platform data matching dev->tuner is copied into the device private
 * data, an "rtl2830" I2C client is created at address 0x10 and a reference
 * on its driver module is taken. On success the frontend, the demod I2C
 * adapter and the client are stored and 0 is returned; -ENODEV is returned
 * for an unknown tuner or when the client has no usable driver.
 */
static int rtl2831u_frontend_attach(struct dvb_usb_adapter *adap)
{
	struct dvb_usb_device *d = adap_to_d(adap);
	struct rtl28xxu_dev *dev = d_to_priv(d);
	struct rtl2830_platform_data *pdata = &dev->rtl2830_platform_data;
	struct i2c_board_info board_info;
	struct i2c_client *client;
	int ret;

	dev_dbg(&d->intf->dev, "\n");

	switch (dev->tuner) {
	case TUNER_RTL2830_QT1010:
		*pdata = rtl2830_qt1010_platform_data;
		break;
	case TUNER_RTL2830_MT2060:
		*pdata = rtl2830_mt2060_platform_data;
		break;
	case TUNER_RTL2830_MXL5005S:
		*pdata = rtl2830_mxl5005s_platform_data;
		break;
	default:
		dev_err(&d->intf->dev, "unknown tuner %s\n", dev->tuner_name);
		ret = -ENODEV;
		goto err;
	}

	/* attach demodulator */
	memset(&board_info, 0, sizeof(board_info));
	strscpy(board_info.type, "rtl2830", I2C_NAME_SIZE);
	board_info.addr = 0x10;
	board_info.platform_data = pdata;
	request_module("%s", board_info.type);
	client = i2c_new_client_device(&d->i2c_adap, &board_info);
	if (!i2c_client_has_driver(client)) {
		ret = -ENODEV;
		goto err;
	}

	/* pin the demod driver module for as long as the client is in use */
	if (!try_module_get(client->dev.driver->owner)) {
		i2c_unregister_device(client);
		ret = -ENODEV;
		goto err;
	}

	adap->fe[0] = pdata->get_dvb_frontend(client);
	dev->demod_i2c_adapter = pdata->get_i2c_adapter(client);

	dev->i2c_client_demod = client;

	return 0;
err:
	dev_dbg(&d->intf->dev, "failed=%d\n", ret);
	return ret;
}

/*
 * RTL2832 demodulator platform data, one instance per supported tuner.
 * All of them use the same clock value and differ only in the tuner id.
 */
static const struct rtl2832_platform_data rtl2832_fc2580_platform_data = {
	.clk = 28800000,
	.tuner = TUNER_RTL2832_FC2580,
};

static const struct rtl2832_platform_data rtl2832_fc0012_platform_data = {
	.clk = 28800000,
	.tuner = TUNER_RTL2832_FC0012
};

static const struct rtl2832_platform_data rtl2832_fc0013_platform_data = {
	.clk = 28800000,
	.tuner = TUNER_RTL2832_FC0013
};

static const struct rtl2832_platform_data rtl2832_tua9001_platform_data = {
	.clk = 28800000,
	.tuner = TUNER_RTL2832_TUA9001,
};

static const struct rtl2832_platform_data rtl2832_e4000_platform_data = {
	.clk = 28800000,
	.tuner = TUNER_RTL2832_E4000,
};

static const struct rtl2832_platform_data rtl2832_r820t_platform_data = {
	.clk = 28800000,
	.tuner = TUNER_RTL2832_R820T,
};

static const struct rtl2832_platform_data rtl2832_si2157_platform_data = {
	.clk = 28800000,
	.tuner = TUNER_RTL2832_SI2157,
};

/*
 * Tuner callback for the FC0012.
 *
 * FC_FE_CALLBACK_VHF_ENABLE rewrites SYS_GPIO_OUT_VAL so that GPIO6 is low
 * when @arg is non-zero and high otherwise. Any other @cmd yields -EINVAL.
 * Returns 0 on success or the error of the failing register access.
 */
static int rtl2832u_fc0012_tuner_callback(struct dvb_usb_device *d,
		int cmd, int arg)
{
	int ret;
	u8 val;

	dev_dbg(&d->intf->dev, "cmd=%d arg=%d\n", cmd, arg);

	switch (cmd) {
	case FC_FE_CALLBACK_VHF_ENABLE:
		/* set output values */
		ret = rtl28xxu_rd_reg(d, SYS_GPIO_OUT_VAL, &val);
		if (ret)
			goto err;

		if (arg)
			val &= 0xbf; /* set GPIO6 low */
		else
			val |= 0x40; /* set GPIO6 high */

		ret = rtl28xxu_wr_reg(d, SYS_GPIO_OUT_VAL, val);
		if (ret)
			goto err;
		break;
	default:
		ret = -EINVAL;
		goto err;
	}
	return 0;
err:
	dev_dbg(&d->intf->dev, "failed=%d\n", ret);
	return ret;
}

/* Tuner callback for the TUA9001; see the GPIO wiring note below. */
static int rtl2832u_tua9001_tuner_callback(struct dvb_usb_device *d,
		int cmd, int arg)
{
	int ret;
	u8 val;

	dev_dbg(&d->intf->dev, "cmd=%d arg=%d\n", cmd, arg);

	/*
	 * CEN     always enabled by hardware wiring
	 * RESETN  GPIO4
	 * RXEN    GPIO1
	 */

	switch
(cmd) { case TUA9001_CMD_RESETN: if (arg) val = (1 << 4); else val = (0 << 4); ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_VAL, val, 0x10); if (ret) goto err; break; case TUA9001_CMD_RXEN: if (arg) val = (1 << 1); else val = (0 << 1); ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_VAL, val, 0x02); if (ret) goto err; break; } return 0; err: dev_dbg(&d->intf->dev, "failed=%d\n", ret); return ret; } static int rtl2832u_frontend_callback(void *adapter_priv, int component, int cmd, int arg) { struct i2c_adapter *adapter = adapter_priv; struct device *parent = adapter->dev.parent; struct i2c_adapter *parent_adapter; struct dvb_usb_device *d; struct rtl28xxu_dev *dev; /* * All tuners are connected to demod muxed I2C adapter. We have to * resolve its parent adapter in order to get handle for this driver * private data. That is a bit hackish solution, GPIO or direct driver * callback would be better... */ if (parent != NULL && parent->type == &i2c_adapter_type) parent_adapter = to_i2c_adapter(parent); else return -EINVAL; d = i2c_get_adapdata(parent_adapter); dev = d->priv; dev_dbg(&d->intf->dev, "component=%d cmd=%d arg=%d\n", component, cmd, arg); switch (component) { case DVB_FRONTEND_COMPONENT_TUNER: switch (dev->tuner) { case TUNER_RTL2832_FC0012: return rtl2832u_fc0012_tuner_callback(d, cmd, arg); case TUNER_RTL2832_TUA9001: return rtl2832u_tua9001_tuner_callback(d, cmd, arg); } } return 0; } static int rtl2832u_frontend_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap_to_d(adap); struct rtl28xxu_dev *dev = d_to_priv(d); struct rtl2832_platform_data *pdata = &dev->rtl2832_platform_data; struct i2c_board_info board_info; struct i2c_client *client; int ret; dev_dbg(&d->intf->dev, "\n"); switch (dev->tuner) { case TUNER_RTL2832_FC0012: *pdata = rtl2832_fc0012_platform_data; break; case TUNER_RTL2832_FC0013: *pdata = rtl2832_fc0013_platform_data; break; case TUNER_RTL2832_FC2580: *pdata = rtl2832_fc2580_platform_data; break; case TUNER_RTL2832_TUA9001: 
*pdata = rtl2832_tua9001_platform_data; break; case TUNER_RTL2832_E4000: *pdata = rtl2832_e4000_platform_data; break; case TUNER_RTL2832_R820T: case TUNER_RTL2832_R828D: *pdata = rtl2832_r820t_platform_data; break; case TUNER_RTL2832_SI2157: *pdata = rtl2832_si2157_platform_data; break; default: dev_err(&d->intf->dev, "unknown tuner %s\n", dev->tuner_name); ret = -ENODEV; goto err; } /* attach demodulator */ memset(&board_info, 0, sizeof(board_info)); strscpy(board_info.type, "rtl2832", I2C_NAME_SIZE); board_info.addr = 0x10; board_info.platform_data = pdata; request_module("%s", board_info.type); client = i2c_new_client_device(&d->i2c_adap, &board_info); if (!i2c_client_has_driver(client)) { ret = -ENODEV; goto err; } if (!try_module_get(client->dev.driver->owner)) { i2c_unregister_device(client); ret = -ENODEV; goto err; } adap->fe[0] = pdata->get_dvb_frontend(client); dev->demod_i2c_adapter = pdata->get_i2c_adapter(client); dev->i2c_client_demod = client; /* set fe callback */ adap->fe[0]->callback = rtl2832u_frontend_callback; if (dev->slave_demod) { struct i2c_board_info info = {}; /* attach slave demodulator */ if (dev->slave_demod == SLAVE_DEMOD_MN88472) { struct mn88472_config mn88472_config = {}; mn88472_config.fe = &adap->fe[1]; mn88472_config.i2c_wr_max = 22; strscpy(info.type, "mn88472", I2C_NAME_SIZE); mn88472_config.xtal = 20500000; mn88472_config.ts_mode = SERIAL_TS_MODE; mn88472_config.ts_clock = VARIABLE_TS_CLOCK; info.addr = 0x18; info.platform_data = &mn88472_config; request_module(info.type); client = i2c_new_client_device(&d->i2c_adap, &info); if (!i2c_client_has_driver(client)) goto err_slave_demod_failed; if (!try_module_get(client->dev.driver->owner)) { i2c_unregister_device(client); goto err_slave_demod_failed; } dev->i2c_client_slave_demod = client; } else if (dev->slave_demod == SLAVE_DEMOD_MN88473) { struct mn88473_config mn88473_config = {}; mn88473_config.fe = &adap->fe[1]; mn88473_config.i2c_wr_max = 22; strscpy(info.type, "mn88473", 
I2C_NAME_SIZE); info.addr = 0x18; info.platform_data = &mn88473_config; request_module(info.type); client = i2c_new_client_device(&d->i2c_adap, &info); if (!i2c_client_has_driver(client)) goto err_slave_demod_failed; if (!try_module_get(client->dev.driver->owner)) { i2c_unregister_device(client); goto err_slave_demod_failed; } dev->i2c_client_slave_demod = client; } else if (dev->slave_demod == SLAVE_DEMOD_CXD2837ER) { struct cxd2841er_config cxd2837er_config = {}; cxd2837er_config.i2c_addr = 0xd8; cxd2837er_config.xtal = SONY_XTAL_20500; cxd2837er_config.flags = (CXD2841ER_AUTO_IFHZ | CXD2841ER_NO_AGCNEG | CXD2841ER_TSBITS | CXD2841ER_EARLY_TUNE | CXD2841ER_TS_SERIAL); adap->fe[1] = dvb_attach(cxd2841er_attach_t_c, &cxd2837er_config, &d->i2c_adap); if (!adap->fe[1]) goto err_slave_demod_failed; adap->fe[1]->id = 1; dev->i2c_client_slave_demod = NULL; } else { struct si2168_config si2168_config = {}; struct i2c_adapter *adapter; si2168_config.i2c_adapter = &adapter; si2168_config.fe = &adap->fe[1]; si2168_config.ts_mode = SI2168_TS_SERIAL; si2168_config.ts_clock_inv = false; si2168_config.ts_clock_gapped = true; strscpy(info.type, "si2168", I2C_NAME_SIZE); info.addr = 0x64; info.platform_data = &si2168_config; request_module(info.type); client = i2c_new_client_device(&d->i2c_adap, &info); if (!i2c_client_has_driver(client)) goto err_slave_demod_failed; if (!try_module_get(client->dev.driver->owner)) { i2c_unregister_device(client); goto err_slave_demod_failed; } dev->i2c_client_slave_demod = client; /* for Si2168 devices use only new I2C write method */ dev->new_i2c_write = true; } } return 0; err: dev_dbg(&d->intf->dev, "failed=%d\n", ret); return ret; err_slave_demod_failed: /* * We continue on reduced mode, without DVB-T2/C, using master * demod, when slave demod fails. 
*/ dev->slave_demod = SLAVE_DEMOD_NONE; return 0; } static int rtl28xxu_frontend_attach(struct dvb_usb_adapter *adap) { struct rtl28xxu_dev *dev = adap_to_priv(adap); if (dev->chip_id == CHIP_ID_RTL2831U) return rtl2831u_frontend_attach(adap); else return rtl2832u_frontend_attach(adap); } static int rtl28xxu_frontend_detach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap_to_d(adap); struct rtl28xxu_dev *dev = d_to_priv(d); struct i2c_client *client; dev_dbg(&d->intf->dev, "\n"); /* remove I2C slave demod */ client = dev->i2c_client_slave_demod; if (client) { module_put(client->dev.driver->owner); i2c_unregister_device(client); } /* remove I2C demod */ client = dev->i2c_client_demod; if (client) { module_put(client->dev.driver->owner); i2c_unregister_device(client); } return 0; } static struct qt1010_config rtl28xxu_qt1010_config = { .i2c_address = 0x62, /* 0xc4 */ }; static struct mt2060_config rtl28xxu_mt2060_config = { .i2c_address = 0x60, /* 0xc0 */ .clock_out = 0, }; static struct mxl5005s_config rtl28xxu_mxl5005s_config = { .i2c_address = 0x63, /* 0xc6 */ .if_freq = IF_FREQ_4570000HZ, .xtal_freq = CRYSTAL_FREQ_16000000HZ, .agc_mode = MXL_SINGLE_AGC, .tracking_filter = MXL_TF_C_H, .rssi_enable = MXL_RSSI_ENABLE, .cap_select = MXL_CAP_SEL_ENABLE, .div_out = MXL_DIV_OUT_4, .clock_out = MXL_CLOCK_OUT_DISABLE, .output_load = MXL5005S_IF_OUTPUT_LOAD_200_OHM, .top = MXL5005S_TOP_25P2, .mod_mode = MXL_DIGITAL_MODE, .if_mode = MXL_ZERO_IF, .AgcMasterByte = 0x00, }; static int rtl2831u_tuner_attach(struct dvb_usb_adapter *adap) { int ret; struct dvb_usb_device *d = adap_to_d(adap); struct rtl28xxu_dev *dev = d_to_priv(d); struct dvb_frontend *fe; dev_dbg(&d->intf->dev, "\n"); switch (dev->tuner) { case TUNER_RTL2830_QT1010: fe = dvb_attach(qt1010_attach, adap->fe[0], dev->demod_i2c_adapter, &rtl28xxu_qt1010_config); break; case TUNER_RTL2830_MT2060: fe = dvb_attach(mt2060_attach, adap->fe[0], dev->demod_i2c_adapter, &rtl28xxu_mt2060_config, 1220); break; 
case TUNER_RTL2830_MXL5005S: fe = dvb_attach(mxl5005s_attach, adap->fe[0], dev->demod_i2c_adapter, &rtl28xxu_mxl5005s_config); break; default: fe = NULL; dev_err(&d->intf->dev, "unknown tuner %d\n", dev->tuner); } if (fe == NULL) { ret = -ENODEV; goto err; } return 0; err: dev_dbg(&d->intf->dev, "failed=%d\n", ret); return ret; } static const struct fc0012_config rtl2832u_fc0012_config = { .i2c_address = 0x63, /* 0xc6 >> 1 */ .xtal_freq = FC_XTAL_28_8_MHZ, }; static const struct r820t_config rtl2832u_r820t_config = { .i2c_addr = 0x1a, .xtal = 28800000, .max_i2c_msg_len = 2, .rafael_chip = CHIP_R820T, }; static const struct r820t_config rtl2832u_r828d_config = { .i2c_addr = 0x3a, .xtal = 16000000, .max_i2c_msg_len = 2, .rafael_chip = CHIP_R828D, }; static int rtl2832u_tuner_attach(struct dvb_usb_adapter *adap) { int ret; struct dvb_usb_device *d = adap_to_d(adap); struct rtl28xxu_dev *dev = d_to_priv(d); struct dvb_frontend *fe = NULL; struct i2c_board_info info; struct i2c_client *client; struct v4l2_subdev *subdev = NULL; struct platform_device *pdev; struct rtl2832_sdr_platform_data pdata; dev_dbg(&d->intf->dev, "\n"); memset(&info, 0, sizeof(struct i2c_board_info)); memset(&pdata, 0, sizeof(pdata)); switch (dev->tuner) { case TUNER_RTL2832_FC0012: fe = dvb_attach(fc0012_attach, adap->fe[0], dev->demod_i2c_adapter, &rtl2832u_fc0012_config); /* since fc0012 includs reading the signal strength delegate * that to the tuner driver */ adap->fe[0]->ops.read_signal_strength = adap->fe[0]->ops.tuner_ops.get_rf_strength; break; case TUNER_RTL2832_FC0013: fe = dvb_attach(fc0013_attach, adap->fe[0], dev->demod_i2c_adapter, 0xc6>>1, 0, FC_XTAL_28_8_MHZ); /* fc0013 also supports signal strength reading */ adap->fe[0]->ops.read_signal_strength = adap->fe[0]->ops.tuner_ops.get_rf_strength; break; case TUNER_RTL2832_E4000: { struct e4000_config e4000_config = { .fe = adap->fe[0], .clock = 28800000, }; strscpy(info.type, "e4000", I2C_NAME_SIZE); info.addr = 0x64; 
info.platform_data = &e4000_config; request_module(info.type); client = i2c_new_client_device(dev->demod_i2c_adapter, &info); if (!i2c_client_has_driver(client)) break; if (!try_module_get(client->dev.driver->owner)) { i2c_unregister_device(client); break; } dev->i2c_client_tuner = client; subdev = i2c_get_clientdata(client); } break; case TUNER_RTL2832_FC2580: { struct fc2580_platform_data fc2580_pdata = { .dvb_frontend = adap->fe[0], }; struct i2c_board_info board_info = {}; strscpy(board_info.type, "fc2580", I2C_NAME_SIZE); board_info.addr = 0x56; board_info.platform_data = &fc2580_pdata; request_module("fc2580"); client = i2c_new_client_device(dev->demod_i2c_adapter, &board_info); if (!i2c_client_has_driver(client)) break; if (!try_module_get(client->dev.driver->owner)) { i2c_unregister_device(client); break; } dev->i2c_client_tuner = client; subdev = fc2580_pdata.get_v4l2_subdev(client); } break; case TUNER_RTL2832_TUA9001: { struct tua9001_platform_data tua9001_pdata = { .dvb_frontend = adap->fe[0], }; struct i2c_board_info board_info = {}; /* enable GPIO1 and GPIO4 as output */ ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_DIR, 0x00, 0x12); if (ret) goto err; ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_EN, 0x12, 0x12); if (ret) goto err; strscpy(board_info.type, "tua9001", I2C_NAME_SIZE); board_info.addr = 0x60; board_info.platform_data = &tua9001_pdata; request_module("tua9001"); client = i2c_new_client_device(dev->demod_i2c_adapter, &board_info); if (!i2c_client_has_driver(client)) break; if (!try_module_get(client->dev.driver->owner)) { i2c_unregister_device(client); break; } dev->i2c_client_tuner = client; break; } case TUNER_RTL2832_R820T: fe = dvb_attach(r820t_attach, adap->fe[0], dev->demod_i2c_adapter, &rtl2832u_r820t_config); /* Use tuner to get the signal strength */ adap->fe[0]->ops.read_signal_strength = adap->fe[0]->ops.tuner_ops.get_rf_strength; break; case TUNER_RTL2832_R828D: fe = dvb_attach(r820t_attach, adap->fe[0], dev->demod_i2c_adapter, 
&rtl2832u_r828d_config); adap->fe[0]->ops.read_signal_strength = adap->fe[0]->ops.tuner_ops.get_rf_strength; if (adap->fe[1]) { fe = dvb_attach(r820t_attach, adap->fe[1], dev->demod_i2c_adapter, &rtl2832u_r828d_config); adap->fe[1]->ops.read_signal_strength = adap->fe[1]->ops.tuner_ops.get_rf_strength; } break; case TUNER_RTL2832_SI2157: { struct si2157_config si2157_config = { .fe = adap->fe[0], .if_port = 0, .inversion = false, }; strscpy(info.type, "si2157", I2C_NAME_SIZE); info.addr = 0x60; info.platform_data = &si2157_config; request_module(info.type); client = i2c_new_client_device(&d->i2c_adap, &info); if (!i2c_client_has_driver(client)) break; if (!try_module_get(client->dev.driver->owner)) { i2c_unregister_device(client); break; } dev->i2c_client_tuner = client; subdev = i2c_get_clientdata(client); /* copy tuner ops for 2nd FE as tuner is shared */ if (adap->fe[1]) { adap->fe[1]->tuner_priv = adap->fe[0]->tuner_priv; memcpy(&adap->fe[1]->ops.tuner_ops, &adap->fe[0]->ops.tuner_ops, sizeof(struct dvb_tuner_ops)); } } break; default: dev_err(&d->intf->dev, "unknown tuner %d\n", dev->tuner); } if (fe == NULL && dev->i2c_client_tuner == NULL) { ret = -ENODEV; goto err; } /* register SDR */ switch (dev->tuner) { case TUNER_RTL2832_FC2580: case TUNER_RTL2832_FC0012: case TUNER_RTL2832_FC0013: case TUNER_RTL2832_E4000: case TUNER_RTL2832_R820T: case TUNER_RTL2832_R828D: pdata.clk = dev->rtl2832_platform_data.clk; pdata.tuner = dev->tuner; pdata.regmap = dev->rtl2832_platform_data.regmap; pdata.dvb_frontend = adap->fe[0]; pdata.dvb_usb_device = d; pdata.v4l2_subdev = subdev; request_module("%s", "rtl2832_sdr"); pdev = platform_device_register_data(&d->intf->dev, "rtl2832_sdr", PLATFORM_DEVID_AUTO, &pdata, sizeof(pdata)); if (IS_ERR(pdev) || pdev->dev.driver == NULL) break; dev->platform_device_sdr = pdev; break; default: dev_dbg(&d->intf->dev, "no SDR for tuner=%d\n", dev->tuner); } return 0; err: dev_dbg(&d->intf->dev, "failed=%d\n", ret); return ret; } static int 
rtl28xxu_tuner_attach(struct dvb_usb_adapter *adap)
{
	struct rtl28xxu_dev *dev = adap_to_priv(adap);

	if (dev->chip_id == CHIP_ID_RTL2831U)
		return rtl2831u_tuner_attach(adap);
	else
		return rtl2832u_tuner_attach(adap);
}

/*
 * Undo the tuner attach: unregister the SDR platform device if one was
 * stored, then drop the module reference of the tuner I2C client and
 * unregister it. Always returns 0.
 */
static int rtl28xxu_tuner_detach(struct dvb_usb_adapter *adap)
{
	struct dvb_usb_device *d = adap_to_d(adap);
	struct rtl28xxu_dev *dev = d_to_priv(d);
	struct i2c_client *client;
	struct platform_device *pdev;

	dev_dbg(&d->intf->dev, "\n");

	/* remove platform SDR */
	pdev = dev->platform_device_sdr;
	if (pdev)
		platform_device_unregister(pdev);

	/* remove I2C tuner */
	client = dev->i2c_client_tuner;
	if (client) {
		module_put(client->dev.driver->owner);
		i2c_unregister_device(client);
	}

	return 0;
}

/*
 * Initialize the USB streaming endpoint registers. Returns 0 on success
 * or the error of the first failing register access.
 */
static int rtl28xxu_init(struct dvb_usb_device *d)
{
	int ret;
	u8 val;

	dev_dbg(&d->intf->dev, "\n");

	/* init USB endpoints */
	ret = rtl28xxu_rd_reg(d, USB_SYSCTL_0, &val);
	if (ret)
		goto err;

	/* enable DMA and Full Packet Mode */
	val |= 0x09;
	ret = rtl28xxu_wr_reg(d, USB_SYSCTL_0, val);
	if (ret)
		goto err;

	/* set EPA maximum packet size to 0x0200 */
	ret = rtl28xxu_wr_regs(d, USB_EPA_MAXPKT, "\x00\x02\x00\x00", 4);
	if (ret)
		goto err;

	/* change EPA FIFO length */
	ret = rtl28xxu_wr_regs(d, USB_EPA_FIFO_CFG, "\x14\x00\x00\x00", 4);
	if (ret)
		goto err;

	return ret;
err:
	dev_dbg(&d->intf->dev, "failed=%d\n", ret);
	return ret;
}

/*
 * Power the RTL2831U on or off.
 *
 * SYS_SYS0 and SYS_GPIO_OUT_VAL are read, modified according to @onoff and
 * written back, then the streaming endpoint stall/reset bits are written
 * to USB_EPA_CTL. When powering on, the halt on bulk endpoint 0x81 is
 * cleared as well; the result of that call is not checked.
 * Returns 0 on success or the error of the first failing register access.
 */
static int rtl2831u_power_ctrl(struct dvb_usb_device *d, int onoff)
{
	int ret;
	u8 gpio, sys0, epa_ctl[2];

	dev_dbg(&d->intf->dev, "onoff=%d\n", onoff);

	/* demod adc */
	ret = rtl28xxu_rd_reg(d, SYS_SYS0, &sys0);
	if (ret)
		goto err;

	/* tuner power, read GPIOs */
	ret = rtl28xxu_rd_reg(d, SYS_GPIO_OUT_VAL, &gpio);
	if (ret)
		goto err;

	dev_dbg(&d->intf->dev, "RD SYS0=%02x GPIO_OUT_VAL=%02x\n", sys0, gpio);

	if (onoff) {
		gpio |= 0x01; /* GPIO0 = 1 */
		gpio &= (~0x10); /* GPIO4 = 0 */
		gpio |= 0x04; /* GPIO2 = 1, LED on */
		sys0 = sys0 & 0x0f;
		sys0 |= 0xe0;
		epa_ctl[0] = 0x00; /* clear stall */
		epa_ctl[1] = 0x00; /* clear reset */
	} else {
		gpio &= (~0x01); /* GPIO0 = 0 */
		gpio |= 0x10; /* GPIO4 = 1 */
		gpio &= (~0x04); /* GPIO2 = 0, LED off */
		sys0 = sys0 & (~0xc0);
		epa_ctl[0] = 0x10; /* set stall */
		epa_ctl[1] = 0x02; /* set reset */
	}

	dev_dbg(&d->intf->dev, "WR SYS0=%02x GPIO_OUT_VAL=%02x\n", sys0, gpio);

	/* demod adc */
	ret = rtl28xxu_wr_reg(d, SYS_SYS0, sys0);
	if (ret)
		goto err;

	/* tuner power, write GPIOs */
	ret = rtl28xxu_wr_reg(d, SYS_GPIO_OUT_VAL, gpio);
	if (ret)
		goto err;

	/* streaming EP: stall & reset */
	ret = rtl28xxu_wr_regs(d, USB_EPA_CTL, epa_ctl, 2);
	if (ret)
		goto err;

	if (onoff)
		usb_clear_halt(d->udev, usb_rcvbulkpipe(d->udev, 0x81));

	return ret;
err:
	dev_dbg(&d->intf->dev, "failed=%d\n", ret);
	return ret;
}

/*
 * Power the RTL2832U on or off.
 *
 * Power on sets the GPIOs, writes the SYS_DEMOD_CTL1 and SYS_DEMOD_CTL
 * bits annotated below, clears the streaming endpoint stall/reset bits
 * and clears the halt on bulk endpoint 0x81. Power off sets GPIO4,
 * disables the PLL and sets the endpoint stall/reset bits.
 * Returns 0 on success or the error of the first failing step.
 */
static int rtl2832u_power_ctrl(struct dvb_usb_device *d, int onoff)
{
	int ret;

	dev_dbg(&d->intf->dev, "onoff=%d\n", onoff);

	if (onoff) {
		/* GPIO3=1, GPIO4=0 */
		ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_VAL, 0x08, 0x18);
		if (ret)
			goto err;

		/* suspend? */
		ret = rtl28xxu_wr_reg_mask(d, SYS_DEMOD_CTL1, 0x00, 0x10);
		if (ret)
			goto err;

		/* enable PLL */
		ret = rtl28xxu_wr_reg_mask(d, SYS_DEMOD_CTL, 0x80, 0x80);
		if (ret)
			goto err;

		/* disable reset */
		ret = rtl28xxu_wr_reg_mask(d, SYS_DEMOD_CTL, 0x20, 0x20);
		if (ret)
			goto err;

		/* streaming EP: clear stall & reset */
		ret = rtl28xxu_wr_regs(d, USB_EPA_CTL, "\x00\x00", 2);
		if (ret)
			goto err;

		ret = usb_clear_halt(d->udev, usb_rcvbulkpipe(d->udev, 0x81));
		if (ret)
			goto err;
	} else {
		/* GPIO4=1 */
		ret = rtl28xxu_wr_reg_mask(d, SYS_GPIO_OUT_VAL, 0x10, 0x10);
		if (ret)
			goto err;

		/* disable PLL */
		ret = rtl28xxu_wr_reg_mask(d, SYS_DEMOD_CTL, 0x00, 0x80);
		if (ret)
			goto err;

		/* streaming EP: set stall & reset */
		ret = rtl28xxu_wr_regs(d, USB_EPA_CTL, "\x10\x02", 2);
		if (ret)
			goto err;
	}

	return ret;
err:
	dev_dbg(&d->intf->dev, "failed=%d\n", ret);
	return ret;
}

/* Switch device power using the handler matching dev->chip_id. */
static int rtl28xxu_power_ctrl(struct dvb_usb_device *d, int onoff)
{
	struct rtl28xxu_dev *dev = d_to_priv(d);

	if (dev->chip_id == CHIP_ID_RTL2831U)
		return rtl2831u_power_ctrl(d, onoff);
	else
		return rtl2832u_power_ctrl(d,
onoff); } static int rtl28xxu_frontend_ctrl(struct dvb_frontend *fe, int onoff) { struct dvb_usb_device *d = fe_to_d(fe); struct rtl28xxu_dev *dev = fe_to_priv(fe); struct rtl2832_platform_data *pdata = &dev->rtl2832_platform_data; int ret; u8 val; dev_dbg(&d->intf->dev, "fe=%d onoff=%d\n", fe->id, onoff); if (dev->chip_id == CHIP_ID_RTL2831U) return 0; if (fe->id == 0) { /* control internal demod ADC */ if (onoff) val = 0x48; /* enable ADC */ else val = 0x00; /* disable ADC */ ret = rtl28xxu_wr_reg_mask(d, SYS_DEMOD_CTL, val, 0x48); if (ret) goto err; } else if (fe->id == 1) { /* bypass slave demod TS through master demod */ ret = pdata->slave_ts_ctrl(dev->i2c_client_demod, onoff); if (ret) goto err; } return 0; err: dev_dbg(&d->intf->dev, "failed=%d\n", ret); return ret; } #if IS_ENABLED(CONFIG_RC_CORE) static int rtl2831u_rc_query(struct dvb_usb_device *d) { int ret, i; struct rtl28xxu_dev *dev = d->priv; u8 buf[5]; u32 rc_code; static const struct rtl28xxu_reg_val rc_nec_tab[] = { { 0x3033, 0x80 }, { 0x3020, 0x43 }, { 0x3021, 0x16 }, { 0x3022, 0x16 }, { 0x3023, 0x5a }, { 0x3024, 0x2d }, { 0x3025, 0x16 }, { 0x3026, 0x01 }, { 0x3028, 0xb0 }, { 0x3029, 0x04 }, { 0x302c, 0x88 }, { 0x302e, 0x13 }, { 0x3030, 0xdf }, { 0x3031, 0x05 }, }; /* init remote controller */ if (!dev->rc_active) { for (i = 0; i < ARRAY_SIZE(rc_nec_tab); i++) { ret = rtl28xxu_wr_reg(d, rc_nec_tab[i].reg, rc_nec_tab[i].val); if (ret) goto err; } dev->rc_active = true; } ret = rtl28xxu_rd_regs(d, SYS_IRRC_RP, buf, 5); if (ret) goto err; if (buf[4] & 0x01) { enum rc_proto proto; if (buf[2] == (u8) ~buf[3]) { if (buf[0] == (u8) ~buf[1]) { /* NEC standard (16 bit) */ rc_code = RC_SCANCODE_NEC(buf[0], buf[2]); proto = RC_PROTO_NEC; } else { /* NEC extended (24 bit) */ rc_code = RC_SCANCODE_NECX(buf[0] << 8 | buf[1], buf[2]); proto = RC_PROTO_NECX; } } else { /* NEC full (32 bit) */ rc_code = RC_SCANCODE_NEC32(buf[0] << 24 | buf[1] << 16 | buf[2] << 8 | buf[3]); proto = RC_PROTO_NEC32; } 
rc_keydown(d->rc_dev, proto, rc_code, 0); ret = rtl28xxu_wr_reg(d, SYS_IRRC_SR, 1); if (ret) goto err; /* repeated intentionally to avoid extra keypress */ ret = rtl28xxu_wr_reg(d, SYS_IRRC_SR, 1); if (ret) goto err; } return ret; err: dev_dbg(&d->intf->dev, "failed=%d\n", ret); return ret; } static int rtl2831u_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) { rc->map_name = RC_MAP_EMPTY; rc->allowed_protos = RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32; rc->query = rtl2831u_rc_query; rc->interval = 400; return 0; } static int rtl2832u_rc_query(struct dvb_usb_device *d) { int ret, i, len; struct rtl28xxu_dev *dev = d->priv; struct ir_raw_event ev = {}; u8 buf[128]; static const struct rtl28xxu_reg_val_mask refresh_tab[] = { {IR_RX_IF, 0x03, 0xff}, {IR_RX_BUF_CTRL, 0x80, 0xff}, {IR_RX_CTRL, 0x80, 0xff}, }; /* init remote controller */ if (!dev->rc_active) { static const struct rtl28xxu_reg_val_mask init_tab[] = { {SYS_DEMOD_CTL1, 0x00, 0x04}, {SYS_DEMOD_CTL1, 0x00, 0x08}, {USB_CTRL, 0x20, 0x20}, {SYS_GPIO_DIR, 0x00, 0x08}, {SYS_GPIO_OUT_EN, 0x08, 0x08}, {SYS_GPIO_OUT_VAL, 0x08, 0x08}, {IR_MAX_DURATION0, 0xd0, 0xff}, {IR_MAX_DURATION1, 0x07, 0xff}, {IR_IDLE_LEN0, 0xc0, 0xff}, {IR_IDLE_LEN1, 0x00, 0xff}, {IR_GLITCH_LEN, 0x03, 0xff}, {IR_RX_CLK, 0x09, 0xff}, {IR_RX_CFG, 0x1c, 0xff}, {IR_MAX_H_TOL_LEN, 0x1e, 0xff}, {IR_MAX_L_TOL_LEN, 0x1e, 0xff}, {IR_RX_CTRL, 0x80, 0xff}, }; for (i = 0; i < ARRAY_SIZE(init_tab); i++) { ret = rtl28xxu_wr_reg_mask(d, init_tab[i].reg, init_tab[i].val, init_tab[i].mask); if (ret) goto err; } dev->rc_active = true; } ret = rtl28xxu_rd_reg(d, IR_RX_IF, &buf[0]); if (ret) goto err; if (buf[0] != 0x83) goto exit; ret = rtl28xxu_rd_reg(d, IR_RX_BC, &buf[0]); if (ret || buf[0] > sizeof(buf)) goto err; len = buf[0]; /* read raw code from hw */ ret = rtl28xxu_rd_regs(d, IR_RX_BUF, buf, len); if (ret) goto err; /* let hw receive new code */ for (i = 0; i < ARRAY_SIZE(refresh_tab); i++) { ret = rtl28xxu_wr_reg_mask(d, 
refresh_tab[i].reg, refresh_tab[i].val, refresh_tab[i].mask); if (ret) goto err; } /* pass data to Kernel IR decoder */ for (i = 0; i < len; i++) { ev.pulse = buf[i] >> 7; ev.duration = 51 * (buf[i] & 0x7f); ir_raw_event_store_with_filter(d->rc_dev, &ev); } /* 'flush' ir_raw_event_store_with_filter() */ ir_raw_event_handle(d->rc_dev); exit: return ret; err: dev_dbg(&d->intf->dev, "failed=%d\n", ret); return ret; } static int rtl2832u_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) { /* disable IR interrupts in order to avoid SDR sample loss */ if (rtl28xxu_disable_rc) return rtl28xxu_wr_reg(d, IR_RX_IE, 0x00); /* load empty to enable rc */ if (!rc->map_name) rc->map_name = RC_MAP_EMPTY; rc->allowed_protos = RC_PROTO_BIT_ALL_IR_DECODER; rc->driver_type = RC_DRIVER_IR_RAW; rc->query = rtl2832u_rc_query; rc->interval = 200; /* we program idle len to 0xc0, set timeout to one less */ rc->timeout = 0xbf * 51; return 0; } static int rtl28xxu_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) { struct rtl28xxu_dev *dev = d_to_priv(d); if (dev->chip_id == CHIP_ID_RTL2831U) return rtl2831u_get_rc_config(d, rc); else return rtl2832u_get_rc_config(d, rc); } #else #define rtl28xxu_get_rc_config NULL #endif static int rtl28xxu_pid_filter_ctrl(struct dvb_usb_adapter *adap, int onoff) { struct rtl28xxu_dev *dev = adap_to_priv(adap); if (dev->chip_id == CHIP_ID_RTL2831U) { struct rtl2830_platform_data *pdata = &dev->rtl2830_platform_data; return pdata->pid_filter_ctrl(adap->fe[0], onoff); } else { struct rtl2832_platform_data *pdata = &dev->rtl2832_platform_data; return pdata->pid_filter_ctrl(adap->fe[0], onoff); } } static int rtl28xxu_pid_filter(struct dvb_usb_adapter *adap, int index, u16 pid, int onoff) { struct rtl28xxu_dev *dev = adap_to_priv(adap); if (dev->chip_id == CHIP_ID_RTL2831U) { struct rtl2830_platform_data *pdata = &dev->rtl2830_platform_data; return pdata->pid_filter(adap->fe[0], index, pid, onoff); } else { struct rtl2832_platform_data 
*pdata = &dev->rtl2832_platform_data;

		return pdata->pid_filter(adap->fe[0], index, pid, onoff);
	}
}

/*
 * Device properties shared by every entry of the USB ID table below. All
 * callbacks are the rtl28xxu_* functions, which select the RTL2831U or
 * RTL2832U implementation from dev->chip_id. One adapter is declared,
 * with a 32 entry PID filter and a bulk stream on endpoint 0x81.
 */
static const struct dvb_usb_device_properties rtl28xxu_props = {
	.driver_name = KBUILD_MODNAME,
	.owner = THIS_MODULE,
	.adapter_nr = adapter_nr,
	.size_of_priv = sizeof(struct rtl28xxu_dev),

	.identify_state = rtl28xxu_identify_state,
	.power_ctrl = rtl28xxu_power_ctrl,
	.frontend_ctrl = rtl28xxu_frontend_ctrl,
	.i2c_algo = &rtl28xxu_i2c_algo,
	.read_config = rtl28xxu_read_config,
	.frontend_attach = rtl28xxu_frontend_attach,
	.frontend_detach = rtl28xxu_frontend_detach,
	.tuner_attach = rtl28xxu_tuner_attach,
	.tuner_detach = rtl28xxu_tuner_detach,
	.init = rtl28xxu_init,
	.get_rc_config = rtl28xxu_get_rc_config,

	.num_adapters = 1,
	.adapter = {
		{
			.caps = DVB_USB_ADAP_HAS_PID_FILTER |
				DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF,

			.pid_filter_count = 32,
			.pid_filter_ctrl = rtl28xxu_pid_filter_ctrl,
			.pid_filter = rtl28xxu_pid_filter,

			.stream = DVB_USB_STREAM_BULK(0x81, 6, 8 * 512),
		},
	},
};

/*
 * USB IDs handled by this driver. Every entry uses rtl28xxu_props; the
 * last DVB_USB_DEVICE() argument is the default remote controller keymap,
 * or NULL when the entry does not name one.
 */
static const struct usb_device_id rtl28xxu_id_table[] = {
	/* RTL2831U devices: */
	{ DVB_USB_DEVICE(USB_VID_REALTEK, USB_PID_REALTEK_RTL2831U,
		&rtl28xxu_props, "Realtek RTL2831U reference design", NULL) },
	{ DVB_USB_DEVICE(USB_VID_WIDEVIEW, USB_PID_FREECOM_DVBT,
		&rtl28xxu_props, "Freecom USB2.0 DVB-T", NULL) },
	{ DVB_USB_DEVICE(USB_VID_WIDEVIEW, USB_PID_FREECOM_DVBT_2,
		&rtl28xxu_props, "Freecom USB2.0 DVB-T", NULL) },

	/* RTL2832U devices: */
	{ DVB_USB_DEVICE(USB_VID_REALTEK, 0x2832,
		&rtl28xxu_props, "Realtek RTL2832U reference design", NULL) },
	{ DVB_USB_DEVICE(USB_VID_REALTEK, 0x2838,
		&rtl28xxu_props, "Realtek RTL2832U reference design", NULL) },
	{ DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_CINERGY_T_STICK_BLACK_REV1,
		&rtl28xxu_props, "TerraTec Cinergy T Stick Black", RC_MAP_TERRATEC_SLIM) },
	{ DVB_USB_DEVICE(USB_VID_GTEK, USB_PID_DELOCK_USB2_DVBT,
		&rtl28xxu_props, "G-Tek Electronics Group Lifeview LV5TDLX DVB-T", NULL) },
	{ DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_NOXON_DAB_STICK,
		&rtl28xxu_props, "TerraTec NOXON DAB Stick", NULL) },
	{ DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_NOXON_DAB_STICK_REV2,
		&rtl28xxu_props, "TerraTec NOXON DAB Stick (rev 2)", NULL) },
	{ DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_NOXON_DAB_STICK_REV3,
		&rtl28xxu_props, "TerraTec NOXON DAB Stick (rev 3)", NULL) },
	{ DVB_USB_DEVICE(USB_VID_GTEK, USB_PID_TREKSTOR_TERRES_2_0,
		&rtl28xxu_props, "Trekstor DVB-T Stick Terres 2.0", NULL) },
	{ DVB_USB_DEVICE(USB_VID_DEXATEK, 0x1101,
		&rtl28xxu_props, "Dexatek DK DVB-T Dongle", NULL) },
	{ DVB_USB_DEVICE(USB_VID_LEADTEK, 0x6680,
		&rtl28xxu_props, "DigitalNow Quad DVB-T Receiver", NULL) },
	{ DVB_USB_DEVICE(USB_VID_LEADTEK, USB_PID_WINFAST_DTV_DONGLE_MINID,
		&rtl28xxu_props, "Leadtek Winfast DTV Dongle Mini D", NULL) },
	{ DVB_USB_DEVICE(USB_VID_LEADTEK, USB_PID_WINFAST_DTV2000DS_PLUS,
		&rtl28xxu_props, "Leadtek WinFast DTV2000DS Plus", RC_MAP_LEADTEK_Y04G0051) },
	{ DVB_USB_DEVICE(USB_VID_TERRATEC, 0x00d3,
		&rtl28xxu_props, "TerraTec Cinergy T Stick RC (Rev. 3)", NULL) },
	{ DVB_USB_DEVICE(USB_VID_DEXATEK, 0x1102,
		&rtl28xxu_props, "Dexatek DK mini DVB-T Dongle", NULL) },
	{ DVB_USB_DEVICE(USB_VID_TERRATEC, 0x00d7,
		&rtl28xxu_props, "TerraTec Cinergy T Stick+", NULL) },
	{ DVB_USB_DEVICE(USB_VID_KWORLD_2, 0xd3a8,
		&rtl28xxu_props, "ASUS My Cinema-U3100Mini Plus V2", NULL) },
	{ DVB_USB_DEVICE(USB_VID_KWORLD_2, 0xd393,
		&rtl28xxu_props, "GIGABYTE U7300", NULL) },
	{ DVB_USB_DEVICE(USB_VID_DEXATEK, 0x1104,
		&rtl28xxu_props, "MSI DIGIVOX Micro HD", NULL) },
	{ DVB_USB_DEVICE(USB_VID_COMPRO, 0x0620,
		&rtl28xxu_props, "Compro VideoMate U620F", NULL) },
	{ DVB_USB_DEVICE(USB_VID_COMPRO, 0x0650,
		&rtl28xxu_props, "Compro VideoMate U650F", NULL) },
	{ DVB_USB_DEVICE(USB_VID_KWORLD_2, 0xd394,
		&rtl28xxu_props, "MaxMedia HU394-T", NULL) },
	{ DVB_USB_DEVICE(USB_VID_LEADTEK, 0x6a03,
		&rtl28xxu_props, "Leadtek WinFast DTV Dongle mini", NULL) },
	{ DVB_USB_DEVICE(USB_VID_GTEK, USB_PID_CPYTO_REDI_PC50A,
		&rtl28xxu_props, "Crypto ReDi PC 50 A", NULL) },
	{ DVB_USB_DEVICE(USB_VID_KYE, 0x707f,
		&rtl28xxu_props, "Genius TVGo DVB-T03", NULL) },
	{ DVB_USB_DEVICE(USB_VID_KWORLD_2, 0xd395,
		&rtl28xxu_props, "Peak DVB-T USB", NULL) },
	{ DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_SVEON_STV20_RTL2832U,
		&rtl28xxu_props, "Sveon STV20", NULL) },
	{ DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_SVEON_STV21,
		&rtl28xxu_props, "Sveon STV21", NULL) },
	{ DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_SVEON_STV27,
		&rtl28xxu_props, "Sveon STV27", NULL) },
	{ DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_TURBOX_DTT_2000,
		&rtl28xxu_props, "TURBO-X Pure TV Tuner DTT-2000", NULL) },
	{ DVB_USB_DEVICE(USB_VID_GTEK, USB_PID_PROLECTRIX_DV107669,
		&rtl28xxu_props, "PROlectrix DV107669", NULL) },

	/* RTL2832P devices: */
	{ DVB_USB_DEVICE(USB_VID_HANFTEK, 0x0131,
		&rtl28xxu_props, "Astrometa DVB-T2",
		RC_MAP_ASTROMETA_T2HYBRID) },
	{ DVB_USB_DEVICE(0x5654, 0xca42,
		&rtl28xxu_props, "GoTView MasterHD 3", NULL) },
	{ }
};
MODULE_DEVICE_TABLE(usb, rtl28xxu_id_table);

/*
 * USB driver glue: probe, disconnect and power management are all handled
 * by the generic dvb_usbv2_* entry points.
 */
static struct usb_driver rtl28xxu_usb_driver = {
	.name = KBUILD_MODNAME,
	.id_table = rtl28xxu_id_table,
	.probe = dvb_usbv2_probe,
	.disconnect = dvb_usbv2_disconnect,
	.suspend = dvb_usbv2_suspend,
	.resume = dvb_usbv2_resume,
	.reset_resume = dvb_usbv2_reset_resume,
	.no_dynamic_id = 1,
	.soft_unbind = 1,
};

module_usb_driver(rtl28xxu_usb_driver);

MODULE_DESCRIPTION("Realtek RTL28xxU DVB USB driver");
MODULE_AUTHOR("Antti Palosaari <crope@iki.fi>");
MODULE_AUTHOR("Thomas Mair <thomas.mair86@googlemail.com>");
MODULE_LICENSE("GPL");
|
| 4 2 4 5 4 5 5 3 5 4 2 1 5 3 3 5 4 4 2 3 6 5 5 5 3 4 3 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 | // SPDX-License-Identifier: GPL-2.0-only /* * TCP Westwood+: end-to-end bandwidth estimation for TCP * * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4 * * Support at http://c3lab.poliba.it/index.php/Westwood * Main references in literature: * * - Mascolo S, Casetti, M. Gerla et al. * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001 * * - A. Grieco, s. Mascolo * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer * Comm. Review, 2004 * * - A. Dell'Aera, L. Grieco, S. Mascolo. * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving : * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004 * * Westwood+ employs end-to-end bandwidth measurement to set cwnd and * ssthresh after packet loss. 
The probing phase is as the original Reno. */ #include <linux/mm.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/inet_diag.h> #include <net/tcp.h> /* TCP Westwood structure */ struct westwood { u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ u32 bw_est; /* bandwidth estimate */ u32 rtt_win_sx; /* here starts a new evaluation... */ u32 bk; u32 snd_una; /* used for evaluating the number of acked bytes */ u32 cumul_ack; u32 accounted; u32 rtt; u32 rtt_min; /* minimum observed RTT */ u8 first_ack; /* flag which infers that this is the first ack */ u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ }; /* TCP Westwood functions and constants */ #define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ #define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ /* * @tcp_westwood_create * This function initializes fields used in TCP Westwood+, * it is called after the initial SYN, so the sequence numbers * are correct but new passive connections we have no * information about RTTmin at this time so we simply set it to * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative * since in this way we're sure it will be updated in a consistent * way as soon as possible. It will reasonably happen within the first * RTT period of the connection lifetime. */ static void tcp_westwood_init(struct sock *sk) { struct westwood *w = inet_csk_ca(sk); w->bk = 0; w->bw_ns_est = 0; w->bw_est = 0; w->accounted = 0; w->cumul_ack = 0; w->reset_rtt_min = 1; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_jiffies32; w->snd_una = tcp_sk(sk)->snd_una; w->first_ack = 1; } /* * @westwood_do_filter * Low-pass filter. Implemented using constant coefficients. 
*/ static inline u32 westwood_do_filter(u32 a, u32 b) { return ((7 * a) + b) >> 3; } static void westwood_filter(struct westwood *w, u32 delta) { /* If the filter is empty fill it with the first sample of bandwidth */ if (w->bw_ns_est == 0 && w->bw_est == 0) { w->bw_ns_est = w->bk / delta; w->bw_est = w->bw_ns_est; } else { w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); } } /* * @westwood_pkts_acked * Called after processing group of packets. * but all westwood needs is the last sample of srtt. */ static void tcp_westwood_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct westwood *w = inet_csk_ca(sk); if (sample->rtt_us > 0) w->rtt = usecs_to_jiffies(sample->rtt_us); } /* * @westwood_update_window * It updates RTT evaluation window if it is the right moment to do * it. If so it calls filter for evaluating bandwidth. */ static void westwood_update_window(struct sock *sk) { struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_jiffies32 - w->rtt_win_sx; /* Initialize w->snd_una with the first acked sequence number in order * to fix mismatch between tp->snd_una and w->snd_una for the first * bandwidth sample */ if (w->first_ack) { w->snd_una = tcp_sk(sk)->snd_una; w->first_ack = 0; } /* * See if a RTT-window has passed. * Be careful since if RTT is less than * 50ms we don't filter but we continue 'building the sample'. * This minimum limit was chosen since an estimation on small * time intervals is better to avoid... 
* Obviously on a LAN we reasonably will always have * right_bound = left_bound + WESTWOOD_RTT_MIN */ if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) { westwood_filter(w, delta); w->bk = 0; w->rtt_win_sx = tcp_jiffies32; } } static inline void update_rtt_min(struct westwood *w) { if (w->reset_rtt_min) { w->rtt_min = w->rtt; w->reset_rtt_min = 0; } else w->rtt_min = min(w->rtt, w->rtt_min); } /* * @westwood_fast_bw * It is called when we are in fast path. In particular it is called when * header prediction is successful. In such case in fact update is * straight forward and doesn't need any particular care. */ static inline void westwood_fast_bw(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); westwood_update_window(sk); w->bk += tp->snd_una - w->snd_una; w->snd_una = tp->snd_una; update_rtt_min(w); } /* * @westwood_acked_count * This function evaluates cumul_ack for evaluating bk in case of * delayed or partial acks. */ static inline u32 westwood_acked_count(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); w->cumul_ack = tp->snd_una - w->snd_una; /* If cumul_ack is 0 this is a dupack since it's not moving * tp->snd_una. */ if (!w->cumul_ack) { w->accounted += tp->mss_cache; w->cumul_ack = tp->mss_cache; } if (w->cumul_ack > tp->mss_cache) { /* Partial or delayed ack */ if (w->accounted >= w->cumul_ack) { w->accounted -= w->cumul_ack; w->cumul_ack = tp->mss_cache; } else { w->cumul_ack -= w->accounted; w->accounted = 0; } } w->snd_una = tp->snd_una; return w->cumul_ack; } /* * TCP Westwood * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 * so avoids ever returning 0. 
*/ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct westwood *w = inet_csk_ca(sk); return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) { if (ack_flags & CA_ACK_SLOWPATH) { struct westwood *w = inet_csk_ca(sk); westwood_update_window(sk); w->bk += westwood_acked_count(sk); update_rtt_min(w); return; } westwood_fast_bw(sk); } static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); switch (event) { case CA_EVENT_COMPLETE_CWR: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); tcp_snd_cwnd_set(tp, tp->snd_ssthresh); break; case CA_EVENT_LOSS: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); /* Update RTT_min when next ack arrives */ w->reset_rtt_min = 1; break; default: /* don't care */ break; } } /* Extract info for Tcp socket info provided via netlink. */ static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { const struct westwood *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { info->vegas.tcpv_enabled = 1; info->vegas.tcpv_rttcnt = 0; info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt); info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); *attr = INET_DIAG_VEGASINFO; return sizeof(struct tcpvegas_info); } return 0; } static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = tcp_reno_undo_cwnd, .cwnd_event = tcp_westwood_event, .in_ack_event = tcp_westwood_ack, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, .owner = THIS_MODULE, .name = "westwood" }; static int __init tcp_westwood_register(void) { BUILD_BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_westwood); } static void __exit 
tcp_westwood_unregister(void) { tcp_unregister_congestion_control(&tcp_westwood); } module_init(tcp_westwood_register); module_exit(tcp_westwood_unregister); MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP Westwood+"); |
| 6 6 6 4 4 4 6 6 6 6 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 | // SPDX-License-Identifier: GPL-2.0-or-later /* * usbusx2y.c - ALSA USB US-428 Driver * 2005-04-14 Karsten Wiese Version 0.8.7.2: Call snd_card_free() instead of 
snd_card_free_in_thread() to prevent oops with dead keyboard symptom. Tested ok with kernel 2.6.12-rc2. 2004-12-14 Karsten Wiese Version 0.8.7.1: snd_pcm_open for rawusb pcm-devices now returns -EBUSY if called without rawusb's hwdep device being open. 2004-12-02 Karsten Wiese Version 0.8.7: Use macro usb_maxpacket() for portability. 2004-10-26 Karsten Wiese Version 0.8.6: wake_up() process waiting in usx2y_urbs_start() on error. 2004-10-21 Karsten Wiese Version 0.8.5: nrpacks is runtime or compiletime configurable now with tested values from 1 to 4. 2004-10-03 Karsten Wiese Version 0.8.2: Avoid any possible racing while in prepare callback. 2004-09-30 Karsten Wiese Version 0.8.0: Simplified things and made ohci work again. 2004-09-20 Karsten Wiese Version 0.7.3: Use usb_kill_urb() instead of deprecated (kernel 2.6.9) usb_unlink_urb(). 2004-07-13 Karsten Wiese Version 0.7.1: Don't sleep in START/STOP callbacks anymore. us428 channels C/D not handled just for this version, sorry. 2004-06-21 Karsten Wiese Version 0.6.4: Temporarely suspend midi input to sanely call usb_set_interface() when setting format. 2004-06-12 Karsten Wiese Version 0.6.3: Made it thus the following rule is enforced: "All pcm substreams of one usx2y have to operate at the same rate & format." 2004-04-06 Karsten Wiese Version 0.6.0: Runs on 2.6.5 kernel without any "--with-debug=" things. us224 reported running. 2004-01-14 Karsten Wiese Version 0.5.1: Runs with 2.6.1 kernel. 2003-12-30 Karsten Wiese Version 0.4.1: Fix 24Bit 4Channel capturing for the us428. 2003-11-27 Karsten Wiese, Martin Langer Version 0.4: us122 support. us224 could be tested by uncommenting the sections containing USB_ID_US224 2003-11-03 Karsten Wiese Version 0.3: 24Bit support. "arecord -D hw:1 -c 2 -r 48000 -M -f S24_3LE|aplay -D hw:1 -c 2 -r 48000 -M -f S24_3LE" works. 2003-08-22 Karsten Wiese Version 0.0.8: Removed EZUSB Firmware. First Stage Firmwaredownload is now done by tascam-firmware downloader. 
See: http://usb-midi-fw.sourceforge.net/tascam-firmware.tar.gz 2003-06-18 Karsten Wiese Version 0.0.5: changed to compile with kernel 2.4.21 and alsa 0.9.4 2002-10-16 Karsten Wiese Version 0.0.4: compiles again with alsa-current. USB_ISO_ASAP not used anymore (most of the time), instead urb->start_frame is calculated here now, some calls inside usb-driver don't need to happen anymore. To get the best out of this: Disable APM-support in the kernel as APM-BIOS calls (once each second) hard disable interrupt for many precious milliseconds. This helped me much on my slowish PII 400 & PIII 500. ACPI yet untested but might cause the same bad behaviour. Use a kernel with lowlatency and preemptiv patches applied. To autoload snd-usb-midi append a line post-install snd-usb-us428 modprobe snd-usb-midi to /etc/modules.conf. known problems: sliders, knobs, lights not yet handled except MASTER Volume slider. "pcm -c 2" doesn't work. "pcm -c 2 -m direct_interleaved" does. KDE3: "Enable full duplex operation" deadlocks. 2002-08-31 Karsten Wiese Version 0.0.3: audio also simplex; simplifying: iso urbs only 1 packet, melted structs. ASYNC_UNLINK not used anymore: no more crashes so far..... for alsa 0.9 rc3. 2002-08-09 Karsten Wiese Version 0.0.2: midi works with snd-usb-midi, audio (only fullduplex now) with i.e. bristol. The firmware has been sniffed from win2k us-428 driver 3.09. 
* Copyright (c) 2002 - 2004 Karsten Wiese */

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/usb.h>
#include <sound/core.h>
#include <sound/initval.h>
#include <sound/pcm.h>
#include <sound/rawmidi.h>
#include "usx2y.h"
#include "usbusx2y.h"
#include "usX2Yhwdep.h"

MODULE_AUTHOR("Karsten Wiese <annabellesgarden@yahoo.de>");
MODULE_DESCRIPTION("TASCAM "NAME_ALLCAPS" Version 0.8.7.2");
MODULE_LICENSE("GPL");

/* Per-card module parameters, one slot per possible sound card. */
static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-max */
static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* Id for this card */
static bool enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP; /* Enable this card */

/* All three are read-only (0444) once the module is loaded. */
module_param_array(index, int, NULL, 0444);
MODULE_PARM_DESC(index, "Index value for "NAME_ALLCAPS".");
module_param_array(id, charp, NULL, 0444);
MODULE_PARM_DESC(id, "ID string for "NAME_ALLCAPS".");
module_param_array(enable, bool, NULL, 0444);
MODULE_PARM_DESC(enable, "Enable "NAME_ALLCAPS".");

/*
 * Slot-in-use flags: usx2y_create_card() sets an entry to 1 when it
 * claims that slot, snd_usx2y_card_private_free() clears it again.
 */
static int snd_usx2y_card_used[SNDRV_CARDS];

/* Forward declarations for helpers defined further down in this file. */
static void snd_usx2y_card_private_free(struct snd_card *card);
static void usx2y_unlinkseq(struct snd_usx2y_async_seq *s);

#ifdef USX2Y_NRPACKS_VARIABLE
/* Not static; snd_usx2y_probe() rejects values outside 0..USX2Y_NRPACKS_MAX. */
int nrpacks = USX2Y_NRPACKS; /* number of packets per urb */
module_param(nrpacks, int, 0444);
MODULE_PARM_DESC(nrpacks, "Number of packets per URB.");
#endif

/*
 * pipe 4 is used for switching the lamps, setting samplerate, volumes ....
*/ static void i_usx2y_out04_int(struct urb *urb) { #ifdef CONFIG_SND_DEBUG if (urb->status) { int i; struct usx2ydev *usx2y = urb->context; for (i = 0; i < 10 && usx2y->as04.urb[i] != urb; i++) ; dev_dbg(&urb->dev->dev, "%s urb %i status=%i\n", __func__, i, urb->status); } #endif } static void i_usx2y_in04_int(struct urb *urb) { int err = 0; struct usx2ydev *usx2y = urb->context; struct us428ctls_sharedmem *us428ctls = usx2y->us428ctls_sharedmem; struct us428_p4out *p4out; int i, j, n, diff, send; usx2y->in04_int_calls++; if (urb->status) { dev_dbg(&urb->dev->dev, "Interrupt Pipe 4 came back with status=%i\n", urb->status); return; } if (us428ctls) { diff = -1; if (us428ctls->ctl_snapshot_last == -2) { diff = 0; memcpy(usx2y->in04_last, usx2y->in04_buf, sizeof(usx2y->in04_last)); us428ctls->ctl_snapshot_last = -1; } else { for (i = 0; i < 21; i++) { if (usx2y->in04_last[i] != ((char *)usx2y->in04_buf)[i]) { if (diff < 0) diff = i; usx2y->in04_last[i] = ((char *)usx2y->in04_buf)[i]; } } } if (diff >= 0) { n = us428ctls->ctl_snapshot_last + 1; if (n >= N_US428_CTL_BUFS || n < 0) n = 0; memcpy(us428ctls->ctl_snapshot + n, usx2y->in04_buf, sizeof(us428ctls->ctl_snapshot[0])); us428ctls->ctl_snapshot_differs_at[n] = diff; us428ctls->ctl_snapshot_last = n; wake_up(&usx2y->us428ctls_wait_queue_head); } } if (usx2y->us04) { if (!usx2y->us04->submitted) { do { err = usb_submit_urb(usx2y->us04->urb[usx2y->us04->submitted++], GFP_ATOMIC); } while (!err && usx2y->us04->submitted < usx2y->us04->len); } } else { if (us428ctls && us428ctls->p4out_last >= 0 && us428ctls->p4out_last < N_US428_P4OUT_BUFS) { if (us428ctls->p4out_last != us428ctls->p4out_sent) { send = us428ctls->p4out_sent + 1; if (send >= N_US428_P4OUT_BUFS) send = 0; for (j = 0; j < URBS_ASYNC_SEQ && !err; ++j) { if (!usx2y->as04.urb[j]->status) { p4out = us428ctls->p4out + send; // FIXME if more than 1 p4out is new, 1 gets lost. 
usb_fill_bulk_urb(usx2y->as04.urb[j], usx2y->dev, usb_sndbulkpipe(usx2y->dev, 0x04), &p4out->val.vol, p4out->type == ELT_LIGHT ? sizeof(struct us428_lights) : 5, i_usx2y_out04_int, usx2y); err = usb_submit_urb(usx2y->as04.urb[j], GFP_ATOMIC); us428ctls->p4out_sent = send; break; } } } } } if (err) dev_err(&urb->dev->dev, "in04_int() usb_submit_urb err=%i\n", err); urb->dev = usx2y->dev; usb_submit_urb(urb, GFP_ATOMIC); } /* * Prepare some urbs */ int usx2y_async_seq04_init(struct usx2ydev *usx2y) { int err = 0, i; if (WARN_ON(usx2y->as04.buffer)) return -EBUSY; usx2y->as04.buffer = kmalloc_array(URBS_ASYNC_SEQ, URB_DATA_LEN_ASYNC_SEQ, GFP_KERNEL); if (!usx2y->as04.buffer) { err = -ENOMEM; } else { for (i = 0; i < URBS_ASYNC_SEQ; ++i) { usx2y->as04.urb[i] = usb_alloc_urb(0, GFP_KERNEL); if (!usx2y->as04.urb[i]) { err = -ENOMEM; break; } usb_fill_bulk_urb(usx2y->as04.urb[i], usx2y->dev, usb_sndbulkpipe(usx2y->dev, 0x04), usx2y->as04.buffer + URB_DATA_LEN_ASYNC_SEQ * i, 0, i_usx2y_out04_int, usx2y); err = usb_urb_ep_type_check(usx2y->as04.urb[i]); if (err < 0) break; } } if (err) usx2y_unlinkseq(&usx2y->as04); return err; } int usx2y_in04_init(struct usx2ydev *usx2y) { int err; if (WARN_ON(usx2y->in04_urb)) return -EBUSY; usx2y->in04_urb = usb_alloc_urb(0, GFP_KERNEL); if (!usx2y->in04_urb) { err = -ENOMEM; goto error; } usx2y->in04_buf = kmalloc(21, GFP_KERNEL); if (!usx2y->in04_buf) { err = -ENOMEM; goto error; } init_waitqueue_head(&usx2y->in04_wait_queue); usb_fill_int_urb(usx2y->in04_urb, usx2y->dev, usb_rcvintpipe(usx2y->dev, 0x4), usx2y->in04_buf, 21, i_usx2y_in04_int, usx2y, 10); if (usb_urb_ep_type_check(usx2y->in04_urb)) { err = -EINVAL; goto error; } return usb_submit_urb(usx2y->in04_urb, GFP_KERNEL); error: kfree(usx2y->in04_buf); usb_free_urb(usx2y->in04_urb); usx2y->in04_buf = NULL; usx2y->in04_urb = NULL; return err; } static void usx2y_unlinkseq(struct snd_usx2y_async_seq *s) { int i; for (i = 0; i < URBS_ASYNC_SEQ; ++i) { if (!s->urb[i]) continue; 
usb_kill_urb(s->urb[i]); usb_free_urb(s->urb[i]); s->urb[i] = NULL; } kfree(s->buffer); s->buffer = NULL; } static const struct usb_device_id snd_usx2y_usb_id_table[] = { { .match_flags = USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x1604, .idProduct = USB_ID_US428 }, { .match_flags = USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x1604, .idProduct = USB_ID_US122 }, { .match_flags = USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x1604, .idProduct = USB_ID_US224 }, { /* terminator */ } }; MODULE_DEVICE_TABLE(usb, snd_usx2y_usb_id_table); static int usx2y_create_card(struct usb_device *device, struct usb_interface *intf, struct snd_card **cardp) { int dev; struct snd_card *card; int err; for (dev = 0; dev < SNDRV_CARDS; ++dev) if (enable[dev] && !snd_usx2y_card_used[dev]) break; if (dev >= SNDRV_CARDS) return -ENODEV; err = snd_card_new(&intf->dev, index[dev], id[dev], THIS_MODULE, sizeof(struct usx2ydev), &card); if (err < 0) return err; snd_usx2y_card_used[usx2y(card)->card_index = dev] = 1; card->private_free = snd_usx2y_card_private_free; usx2y(card)->dev = device; init_waitqueue_head(&usx2y(card)->prepare_wait_queue); init_waitqueue_head(&usx2y(card)->us428ctls_wait_queue_head); mutex_init(&usx2y(card)->pcm_mutex); INIT_LIST_HEAD(&usx2y(card)->midi_list); strscpy(card->driver, "USB "NAME_ALLCAPS""); sprintf(card->shortname, "TASCAM "NAME_ALLCAPS""); sprintf(card->longname, "%s (%x:%x if %d at %03d/%03d)", card->shortname, le16_to_cpu(device->descriptor.idVendor), le16_to_cpu(device->descriptor.idProduct), 0,//us428(card)->usbmidi.ifnum, usx2y(card)->dev->bus->busnum, usx2y(card)->dev->devnum); *cardp = card; return 0; } static void snd_usx2y_card_private_free(struct snd_card *card) { struct usx2ydev *usx2y = usx2y(card); kfree(usx2y->in04_buf); usb_free_urb(usx2y->in04_urb); if (usx2y->us428ctls_sharedmem) free_pages_exact(usx2y->us428ctls_sharedmem, US428_SHAREDMEM_PAGES); if (usx2y->card_index >= 0 && usx2y->card_index < SNDRV_CARDS) snd_usx2y_card_used[usx2y->card_index] = 
0; } static void snd_usx2y_disconnect(struct usb_interface *intf) { struct snd_card *card; struct usx2ydev *usx2y; struct list_head *p; card = usb_get_intfdata(intf); if (!card) return; usx2y = usx2y(card); usx2y->chip_status = USX2Y_STAT_CHIP_HUP; usx2y_unlinkseq(&usx2y->as04); usb_kill_urb(usx2y->in04_urb); snd_card_disconnect(card); /* release the midi resources */ list_for_each(p, &usx2y->midi_list) { snd_usbmidi_disconnect(p); } if (usx2y->us428ctls_sharedmem) wake_up(&usx2y->us428ctls_wait_queue_head); snd_card_free_when_closed(card); } static int snd_usx2y_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *device = interface_to_usbdev(intf); struct snd_card *card; int err; #ifdef USX2Y_NRPACKS_VARIABLE if (nrpacks < 0 || nrpacks > USX2Y_NRPACKS_MAX) return -EINVAL; #endif if (le16_to_cpu(device->descriptor.idVendor) != 0x1604 || (le16_to_cpu(device->descriptor.idProduct) != USB_ID_US122 && le16_to_cpu(device->descriptor.idProduct) != USB_ID_US224 && le16_to_cpu(device->descriptor.idProduct) != USB_ID_US428)) return -EINVAL; err = usx2y_create_card(device, intf, &card); if (err < 0) return err; err = usx2y_hwdep_new(card, device); if (err < 0) goto error; err = snd_card_register(card); if (err < 0) goto error; dev_set_drvdata(&intf->dev, card); return 0; error: snd_card_free(card); return err; } static struct usb_driver snd_usx2y_usb_driver = { .name = "snd-usb-usx2y", .probe = snd_usx2y_probe, .disconnect = snd_usx2y_disconnect, .id_table = snd_usx2y_usb_id_table, }; module_usb_driver(snd_usx2y_usb_driver); |
| 4 4 4 4 4 4 4 4 1 3 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 
1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 | // SPDX-License-Identifier: GPL-2.0-or-later /* * DVB USB Linux driver for Afatech AF9015 DVB-T USB2.0 receiver * * Copyright (C) 2007 Antti Palosaari <crope@iki.fi> * * Thanks to Afatech who kindly provided information. */ #include "af9015.h" static int dvb_usb_af9015_remote; module_param_named(remote, dvb_usb_af9015_remote, int, 0644); MODULE_PARM_DESC(remote, "select remote"); DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); static int af9015_ctrl_msg(struct dvb_usb_device *d, struct req_t *req) { #define REQ_HDR_LEN 8 /* send header size */ #define ACK_HDR_LEN 2 /* rece header size */ struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret, wlen, rlen; u8 write = 1; mutex_lock(&d->usb_mutex); state->buf[0] = req->cmd; state->buf[1] = state->seq++; state->buf[2] = req->i2c_addr << 1; state->buf[3] = req->addr >> 8; state->buf[4] = req->addr & 0xff; state->buf[5] = req->mbox; state->buf[6] = req->addr_len; state->buf[7] = req->data_len; switch (req->cmd) { case GET_CONFIG: case READ_MEMORY: case RECONNECT_USB: write = 0; break; case READ_I2C: write = 0; state->buf[2] |= 0x01; /* set I2C direction */ fallthrough; case WRITE_I2C: state->buf[0] = READ_WRITE_I2C; break; case WRITE_MEMORY: if (((req->addr & 0xff00) == 0xff00) || ((req->addr & 
0xff00) == 0xae00)) state->buf[0] = WRITE_VIRTUAL_MEMORY; break; case WRITE_VIRTUAL_MEMORY: case COPY_FIRMWARE: case DOWNLOAD_FIRMWARE: case BOOT: break; default: dev_err(&intf->dev, "unknown cmd %d\n", req->cmd); ret = -EIO; goto error; } /* Buffer overflow check */ if ((write && (req->data_len > BUF_LEN - REQ_HDR_LEN)) || (!write && (req->data_len > BUF_LEN - ACK_HDR_LEN))) { dev_err(&intf->dev, "too much data, cmd %u, len %u\n", req->cmd, req->data_len); ret = -EINVAL; goto error; } /* * Write receives seq + status = 2 bytes * Read receives seq + status + data = 2 + N bytes */ wlen = REQ_HDR_LEN; rlen = ACK_HDR_LEN; if (write) { wlen += req->data_len; memcpy(&state->buf[REQ_HDR_LEN], req->data, req->data_len); } else { rlen += req->data_len; } /* no ack for these packets */ if (req->cmd == DOWNLOAD_FIRMWARE || req->cmd == RECONNECT_USB) rlen = 0; ret = dvb_usbv2_generic_rw_locked(d, state->buf, wlen, state->buf, rlen); if (ret) goto error; /* check status */ if (rlen && state->buf[1]) { dev_err(&intf->dev, "cmd failed %u\n", state->buf[1]); ret = -EIO; goto error; } /* read request, copy returned data to return buf */ if (!write) memcpy(req->data, &state->buf[ACK_HDR_LEN], req->data_len); error: mutex_unlock(&d->usb_mutex); return ret; } static int af9015_write_reg_i2c(struct dvb_usb_device *d, u8 addr, u16 reg, u8 val) { struct af9015_state *state = d_to_priv(d); struct req_t req = {WRITE_I2C, addr, reg, 1, 1, 1, &val}; if (addr == state->af9013_i2c_addr[0] || addr == state->af9013_i2c_addr[1]) req.addr_len = 3; return af9015_ctrl_msg(d, &req); } static int af9015_read_reg_i2c(struct dvb_usb_device *d, u8 addr, u16 reg, u8 *val) { struct af9015_state *state = d_to_priv(d); struct req_t req = {READ_I2C, addr, reg, 0, 1, 1, val}; if (addr == state->af9013_i2c_addr[0] || addr == state->af9013_i2c_addr[1]) req.addr_len = 3; return af9015_ctrl_msg(d, &req); } static int af9015_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device 
*d = i2c_get_adapdata(adap); struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; u16 addr; u8 mbox, addr_len; struct req_t req; /* * I2C multiplexing: * There could be two tuners, both using same I2C address. Demodulator * I2C-gate is only possibility to select correct tuner. * * ........................................... * . AF9015 integrates AF9013 demodulator . * . ____________ ____________ . ____________ * .| USB IF | | demod |. | tuner | * .|------------| |------------|. |------------| * .| AF9015 | | AF9013 |. | MXL5003 | * .| |--+--I2C-----|-----/ -----|.----I2C-----| | * .| | | | addr 0x1c |. | addr 0x63 | * .|____________| | |____________|. |____________| * .................|......................... * | ____________ ____________ * | | demod | | tuner | * | |------------| |------------| * | | AF9013 | | MXL5003 | * +--I2C-----|-----/ -----|-----I2C-----| | * | addr 0x1d | | addr 0x63 | * |____________| |____________| */ if (msg[0].len == 0 || msg[0].flags & I2C_M_RD) { addr = 0x0000; mbox = 0; addr_len = 0; } else if (msg[0].len == 1) { addr = msg[0].buf[0]; mbox = 0; addr_len = 1; } else if (msg[0].len == 2) { addr = msg[0].buf[0] << 8 | msg[0].buf[1] << 0; mbox = 0; addr_len = 2; } else { addr = msg[0].buf[0] << 8 | msg[0].buf[1] << 0; mbox = msg[0].buf[2]; addr_len = 3; } if (num == 1 && !(msg[0].flags & I2C_M_RD)) { /* i2c write */ if (msg[0].len > 21) { ret = -EOPNOTSUPP; goto err; } if (msg[0].addr == state->af9013_i2c_addr[0]) req.cmd = WRITE_MEMORY; else req.cmd = WRITE_I2C; req.i2c_addr = msg[0].addr; req.addr = addr; req.mbox = mbox; req.addr_len = addr_len; req.data_len = msg[0].len - addr_len; req.data = &msg[0].buf[addr_len]; ret = af9015_ctrl_msg(d, &req); } else if (num == 2 && !(msg[0].flags & I2C_M_RD) && (msg[1].flags & I2C_M_RD)) { /* i2c write + read */ if (msg[0].len > 3 || msg[1].len > 61) { ret = -EOPNOTSUPP; goto err; } if (msg[0].addr == state->af9013_i2c_addr[0]) req.cmd = READ_MEMORY; else 
req.cmd = READ_I2C; req.i2c_addr = msg[0].addr; req.addr = addr; req.mbox = mbox; req.addr_len = addr_len; req.data_len = msg[1].len; req.data = &msg[1].buf[0]; ret = af9015_ctrl_msg(d, &req); } else if (num == 1 && (msg[0].flags & I2C_M_RD)) { /* i2c read */ if (msg[0].len > 61) { ret = -EOPNOTSUPP; goto err; } if (msg[0].addr == state->af9013_i2c_addr[0]) { ret = -EINVAL; goto err; } req.cmd = READ_I2C; req.i2c_addr = msg[0].addr; req.addr = addr; req.mbox = mbox; req.addr_len = addr_len; req.data_len = msg[0].len; req.data = &msg[0].buf[0]; ret = af9015_ctrl_msg(d, &req); } else { ret = -EOPNOTSUPP; dev_dbg(&intf->dev, "unknown msg, num %u\n", num); } if (ret) goto err; return num; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static u32 af9015_i2c_func(struct i2c_adapter *adapter) { return I2C_FUNC_I2C; } static const struct i2c_algorithm af9015_i2c_algo = { .master_xfer = af9015_i2c_xfer, .functionality = af9015_i2c_func, }; static int af9015_identify_state(struct dvb_usb_device *d, const char **name) { struct usb_interface *intf = d->intf; int ret; u8 reply; struct req_t req = {GET_CONFIG, 0, 0, 0, 0, 1, &reply}; ret = af9015_ctrl_msg(d, &req); if (ret) return ret; dev_dbg(&intf->dev, "reply %02x\n", reply); if (reply == 0x02) ret = WARM; else ret = COLD; return ret; } static int af9015_download_firmware(struct dvb_usb_device *d, const struct firmware *firmware) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret, i, rem; struct req_t req = {DOWNLOAD_FIRMWARE, 0, 0, 0, 0, 0, NULL}; u16 checksum; dev_dbg(&intf->dev, "\n"); /* Calc checksum, we need it when copy firmware to slave demod */ for (i = 0, checksum = 0; i < firmware->size; i++) checksum += firmware->data[i]; state->firmware_size = firmware->size; state->firmware_checksum = checksum; #define LEN_MAX (BUF_LEN - REQ_HDR_LEN) /* Max payload size */ for (rem = firmware->size; rem > 0; rem -= LEN_MAX) { req.data_len = min(LEN_MAX, rem); req.data = (u8 
*)&firmware->data[firmware->size - rem]; req.addr = 0x5100 + firmware->size - rem; ret = af9015_ctrl_msg(d, &req); if (ret) { dev_err(&intf->dev, "firmware download failed %d\n", ret); goto err; } } req.cmd = BOOT; req.data_len = 0; ret = af9015_ctrl_msg(d, &req); if (ret) { dev_err(&intf->dev, "firmware boot failed %d\n", ret); goto err; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } #define AF9015_EEPROM_SIZE 256 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ #define GOLDEN_RATIO_PRIME_32 0x9e370001UL /* hash (and dump) eeprom */ static int af9015_eeprom_hash(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret, i; u8 buf[AF9015_EEPROM_SIZE]; struct req_t req = {READ_I2C, AF9015_I2C_EEPROM, 0, 0, 1, 1, NULL}; /* read eeprom */ for (i = 0; i < AF9015_EEPROM_SIZE; i++) { req.addr = i; req.data = &buf[i]; ret = af9015_ctrl_msg(d, &req); if (ret < 0) goto err; } /* calculate checksum */ for (i = 0; i < AF9015_EEPROM_SIZE / sizeof(u32); i++) { state->eeprom_sum *= GOLDEN_RATIO_PRIME_32; state->eeprom_sum += le32_to_cpu(((__le32 *)buf)[i]); } for (i = 0; i < AF9015_EEPROM_SIZE; i += 16) dev_dbg(&intf->dev, "%*ph\n", 16, buf + i); dev_dbg(&intf->dev, "eeprom sum %.8x\n", state->eeprom_sum); return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_read_config(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; u8 val, i, offset = 0; struct req_t req = {READ_I2C, AF9015_I2C_EEPROM, 0, 0, 1, 1, &val}; dev_dbg(&intf->dev, "\n"); /* IR remote controller */ req.addr = AF9015_EEPROM_IR_MODE; /* first message will timeout often due to possible hw bug */ for (i = 0; i < 4; i++) { ret = af9015_ctrl_msg(d, &req); if (!ret) break; } if (ret) goto error; ret = af9015_eeprom_hash(d); if (ret) goto error; state->ir_mode = val; dev_dbg(&intf->dev, "ir mode %02x\n", val); /* TS mode - one or two 
receivers */ req.addr = AF9015_EEPROM_TS_MODE; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->dual_mode = val; dev_dbg(&intf->dev, "ts mode %02x\n", state->dual_mode); state->af9013_i2c_addr[0] = AF9015_I2C_DEMOD; if (state->dual_mode) { /* read 2nd demodulator I2C address */ req.addr = AF9015_EEPROM_DEMOD2_I2C; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->af9013_i2c_addr[1] = val >> 1; } for (i = 0; i < state->dual_mode + 1; i++) { if (i == 1) offset = AF9015_EEPROM_OFFSET; /* xtal */ req.addr = AF9015_EEPROM_XTAL_TYPE1 + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; switch (val) { case 0: state->af9013_pdata[i].clk = 28800000; break; case 1: state->af9013_pdata[i].clk = 20480000; break; case 2: state->af9013_pdata[i].clk = 28000000; break; case 3: state->af9013_pdata[i].clk = 25000000; break; } dev_dbg(&intf->dev, "[%d] xtal %02x, clk %u\n", i, val, state->af9013_pdata[i].clk); /* IF frequency */ req.addr = AF9015_EEPROM_IF1H + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->af9013_pdata[i].if_frequency = val << 8; req.addr = AF9015_EEPROM_IF1L + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->af9013_pdata[i].if_frequency += val; state->af9013_pdata[i].if_frequency *= 1000; dev_dbg(&intf->dev, "[%d] if frequency %u\n", i, state->af9013_pdata[i].if_frequency); /* MT2060 IF1 */ req.addr = AF9015_EEPROM_MT2060_IF1H + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->mt2060_if1[i] = val << 8; req.addr = AF9015_EEPROM_MT2060_IF1L + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->mt2060_if1[i] += val; dev_dbg(&intf->dev, "[%d] MT2060 IF1 %u\n", i, state->mt2060_if1[i]); /* tuner */ req.addr = AF9015_EEPROM_TUNER_ID1 + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; switch (val) { case AF9013_TUNER_ENV77H11D5: case AF9013_TUNER_MT2060: case AF9013_TUNER_QT1010: case AF9013_TUNER_UNKNOWN: case AF9013_TUNER_MT2060_2: case 
AF9013_TUNER_TDA18271: case AF9013_TUNER_QT1010A: case AF9013_TUNER_TDA18218: state->af9013_pdata[i].spec_inv = 1; break; case AF9013_TUNER_MXL5003D: case AF9013_TUNER_MXL5005D: case AF9013_TUNER_MXL5005R: case AF9013_TUNER_MXL5007T: state->af9013_pdata[i].spec_inv = 0; break; case AF9013_TUNER_MC44S803: state->af9013_pdata[i].gpio[1] = AF9013_GPIO_LO; state->af9013_pdata[i].spec_inv = 1; break; default: dev_err(&intf->dev, "tuner id %02x not supported, please report!\n", val); return -ENODEV; } state->af9013_pdata[i].tuner = val; dev_dbg(&intf->dev, "[%d] tuner id %02x\n", i, val); } error: if (ret) dev_err(&intf->dev, "eeprom read failed %d\n", ret); /* * AverMedia AVerTV Volar Black HD (A850) device have bad EEPROM * content :-( Override some wrong values here. Ditto for the * AVerTV Red HD+ (A850T) device. */ if (le16_to_cpu(d->udev->descriptor.idVendor) == USB_VID_AVERMEDIA && ((le16_to_cpu(d->udev->descriptor.idProduct) == USB_PID_AVERMEDIA_A850) || (le16_to_cpu(d->udev->descriptor.idProduct) == USB_PID_AVERMEDIA_A850T))) { dev_dbg(&intf->dev, "AverMedia A850: overriding config\n"); /* disable dual mode */ state->dual_mode = 0; /* set correct IF */ state->af9013_pdata[0].if_frequency = 4570000; } return ret; } static int af9015_get_stream_config(struct dvb_frontend *fe, u8 *ts_type, struct usb_data_stream_properties *stream) { struct dvb_usb_device *d = fe_to_d(fe); struct usb_interface *intf = d->intf; dev_dbg(&intf->dev, "adap %u\n", fe_to_adap(fe)->id); if (d->udev->speed == USB_SPEED_FULL) stream->u.bulk.buffersize = 5 * 188; return 0; } static int af9015_streaming_ctrl(struct dvb_frontend *fe, int onoff) { struct dvb_usb_device *d = fe_to_d(fe); struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; unsigned int utmp1, utmp2, reg1, reg2; u8 buf[2]; const unsigned int adap_id = fe_to_adap(fe)->id; dev_dbg(&intf->dev, "adap id %d, onoff %d\n", adap_id, onoff); if (!state->usb_ts_if_configured[adap_id]) { 
dev_dbg(&intf->dev, "set usb and ts interface\n"); /* USB IF stream settings */ utmp1 = (d->udev->speed == USB_SPEED_FULL ? 5 : 87) * 188 / 4; utmp2 = (d->udev->speed == USB_SPEED_FULL ? 64 : 512) / 4; buf[0] = (utmp1 >> 0) & 0xff; buf[1] = (utmp1 >> 8) & 0xff; if (adap_id == 0) { /* 1st USB IF (EP4) stream settings */ reg1 = 0xdd88; reg2 = 0xdd0c; } else { /* 2nd USB IF (EP5) stream settings */ reg1 = 0xdd8a; reg2 = 0xdd0d; } ret = regmap_bulk_write(state->regmap, reg1, buf, 2); if (ret) goto err; ret = regmap_write(state->regmap, reg2, utmp2); if (ret) goto err; /* TS IF settings */ if (state->dual_mode) { utmp1 = 0x01; utmp2 = 0x10; } else { utmp1 = 0x00; utmp2 = 0x00; } ret = regmap_update_bits(state->regmap, 0xd50b, 0x01, utmp1); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xd520, 0x10, utmp2); if (ret) goto err; state->usb_ts_if_configured[adap_id] = true; } if (adap_id == 0 && onoff) { /* Adapter 0 stream on. EP4: clear NAK, enable, clear reset */ ret = regmap_update_bits(state->regmap, 0xdd13, 0x20, 0x00); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd11, 0x20, 0x20); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xd507, 0x04, 0x00); if (ret) goto err; } else if (adap_id == 1 && onoff) { /* Adapter 1 stream on. EP5: clear NAK, enable, clear reset */ ret = regmap_update_bits(state->regmap, 0xdd13, 0x40, 0x00); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd11, 0x40, 0x40); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xd50b, 0x02, 0x00); if (ret) goto err; } else if (adap_id == 0 && !onoff) { /* Adapter 0 stream off. EP4: set reset, disable, set NAK */ ret = regmap_update_bits(state->regmap, 0xd507, 0x04, 0x04); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd11, 0x20, 0x00); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd13, 0x20, 0x20); if (ret) goto err; } else if (adap_id == 1 && !onoff) { /* Adapter 1 stream off. 
EP5: set reset, disable, set NAK */ ret = regmap_update_bits(state->regmap, 0xd50b, 0x02, 0x02); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd11, 0x40, 0x00); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd13, 0x40, 0x40); if (ret) goto err; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_get_adapter_count(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); return state->dual_mode + 1; } /* override demod callbacks for resource locking */ static int af9015_af9013_set_frontend(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->set_frontend[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } /* override demod callbacks for resource locking */ static int af9015_af9013_read_status(struct dvb_frontend *fe, enum fe_status *status) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->read_status[fe_to_adap(fe)->id](fe, status); mutex_unlock(&state->fe_mutex); return ret; } /* override demod callbacks for resource locking */ static int af9015_af9013_init(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->init[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } /* override demod callbacks for resource locking */ static int af9015_af9013_sleep(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->sleep[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } /* override tuner callbacks for resource locking */ static int af9015_tuner_init(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if 
(mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->tuner_init[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } /* override tuner callbacks for resource locking */ static int af9015_tuner_sleep(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->tuner_sleep[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } static int af9015_copy_firmware(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; unsigned long timeout; u8 val, firmware_info[4]; struct req_t req = {COPY_FIRMWARE, 0, 0x5100, 0, 0, 4, firmware_info}; dev_dbg(&intf->dev, "\n"); firmware_info[0] = (state->firmware_size >> 8) & 0xff; firmware_info[1] = (state->firmware_size >> 0) & 0xff; firmware_info[2] = (state->firmware_checksum >> 8) & 0xff; firmware_info[3] = (state->firmware_checksum >> 0) & 0xff; /* Check whether firmware is already running */ ret = af9015_read_reg_i2c(d, state->af9013_i2c_addr[1], 0x98be, &val); if (ret) goto err; dev_dbg(&intf->dev, "firmware status %02x\n", val); if (val == 0x0c) return 0; /* Set i2c clock to 625kHz to speed up firmware copy */ ret = regmap_write(state->regmap, 0xd416, 0x04); if (ret) goto err; /* Copy firmware from master demod to slave demod */ ret = af9015_ctrl_msg(d, &req); if (ret) { dev_err(&intf->dev, "firmware copy cmd failed %d\n", ret); goto err; } /* Set i2c clock to 125kHz */ ret = regmap_write(state->regmap, 0xd416, 0x14); if (ret) goto err; /* Boot firmware */ ret = af9015_write_reg_i2c(d, state->af9013_i2c_addr[1], 0xe205, 0x01); if (ret) goto err; /* Poll firmware ready */ for (val = 0x00, timeout = jiffies + msecs_to_jiffies(1000); !time_after(jiffies, timeout) && val != 0x0c && val != 0x04;) { msleep(20); /* Check firmware status. 
0c=OK, 04=fail */ ret = af9015_read_reg_i2c(d, state->af9013_i2c_addr[1], 0x98be, &val); if (ret) goto err; dev_dbg(&intf->dev, "firmware status %02x\n", val); } dev_dbg(&intf->dev, "firmware boot took %u ms\n", jiffies_to_msecs(jiffies) - (jiffies_to_msecs(timeout) - 1000)); if (val == 0x04) { ret = -ENODEV; dev_err(&intf->dev, "firmware did not run\n"); goto err; } else if (val != 0x0c) { ret = -ETIMEDOUT; dev_err(&intf->dev, "firmware boot timeout\n"); goto err; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_af9013_frontend_attach(struct dvb_usb_adapter *adap) { struct af9015_state *state = adap_to_priv(adap); struct dvb_usb_device *d = adap_to_d(adap); struct usb_interface *intf = d->intf; struct i2c_client *client; int ret; dev_dbg(&intf->dev, "adap id %u\n", adap->id); if (adap->id == 0) { state->af9013_pdata[0].ts_mode = AF9013_TS_MODE_USB; memcpy(state->af9013_pdata[0].api_version, "\x0\x1\x9\x0", 4); state->af9013_pdata[0].gpio[0] = AF9013_GPIO_HI; state->af9013_pdata[0].gpio[3] = AF9013_GPIO_TUNER_ON; } else if (adap->id == 1) { state->af9013_pdata[1].ts_mode = AF9013_TS_MODE_SERIAL; state->af9013_pdata[1].ts_output_pin = 7; memcpy(state->af9013_pdata[1].api_version, "\x0\x1\x9\x0", 4); state->af9013_pdata[1].gpio[0] = AF9013_GPIO_TUNER_ON; state->af9013_pdata[1].gpio[1] = AF9013_GPIO_LO; /* copy firmware to 2nd demodulator */ if (state->dual_mode) { /* Wait 2nd demodulator ready */ msleep(100); ret = af9015_copy_firmware(adap_to_d(adap)); if (ret) { dev_err(&intf->dev, "firmware copy to 2nd frontend failed, will disable it\n"); state->dual_mode = 0; goto err; } } else { ret = -ENODEV; goto err; } } /* Add I2C demod */ client = dvb_module_probe("af9013", NULL, &d->i2c_adap, state->af9013_i2c_addr[adap->id], &state->af9013_pdata[adap->id]); if (!client) { ret = -ENODEV; goto err; } adap->fe[0] = state->af9013_pdata[adap->id].get_dvb_frontend(client); state->demod_i2c_client[adap->id] = client; /* * AF9015 firmware 
does not like if it gets interrupted by I2C adapter * request on some critical phases. During normal operation I2C adapter * is used only 2nd demodulator and tuner on dual tuner devices. * Override demodulator callbacks and use mutex for limit access to * those "critical" paths to keep AF9015 happy. */ if (adap->fe[0]) { state->set_frontend[adap->id] = adap->fe[0]->ops.set_frontend; adap->fe[0]->ops.set_frontend = af9015_af9013_set_frontend; state->read_status[adap->id] = adap->fe[0]->ops.read_status; adap->fe[0]->ops.read_status = af9015_af9013_read_status; state->init[adap->id] = adap->fe[0]->ops.init; adap->fe[0]->ops.init = af9015_af9013_init; state->sleep[adap->id] = adap->fe[0]->ops.sleep; adap->fe[0]->ops.sleep = af9015_af9013_sleep; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_frontend_detach(struct dvb_usb_adapter *adap) { struct af9015_state *state = adap_to_priv(adap); struct dvb_usb_device *d = adap_to_d(adap); struct usb_interface *intf = d->intf; struct i2c_client *client; dev_dbg(&intf->dev, "adap id %u\n", adap->id); /* Remove I2C demod */ client = state->demod_i2c_client[adap->id]; dvb_module_release(client); return 0; } static struct mt2060_config af9015_mt2060_config = { .i2c_address = 0x60, .clock_out = 0, }; static struct qt1010_config af9015_qt1010_config = { .i2c_address = 0x62, }; static struct tda18271_config af9015_tda18271_config = { .gate = TDA18271_GATE_DIGITAL, .small_i2c = TDA18271_16_BYTE_CHUNK_INIT, }; static struct mxl5005s_config af9015_mxl5003_config = { .i2c_address = 0x63, .if_freq = IF_FREQ_4570000HZ, .xtal_freq = CRYSTAL_FREQ_16000000HZ, .agc_mode = MXL_SINGLE_AGC, .tracking_filter = MXL_TF_DEFAULT, .rssi_enable = MXL_RSSI_ENABLE, .cap_select = MXL_CAP_SEL_ENABLE, .div_out = MXL_DIV_OUT_4, .clock_out = MXL_CLOCK_OUT_DISABLE, .output_load = MXL5005S_IF_OUTPUT_LOAD_200_OHM, .top = MXL5005S_TOP_25P2, .mod_mode = MXL_DIGITAL_MODE, .if_mode = MXL_ZERO_IF, .AgcMasterByte = 0x00, }; 
static struct mxl5005s_config af9015_mxl5005_config = { .i2c_address = 0x63, .if_freq = IF_FREQ_4570000HZ, .xtal_freq = CRYSTAL_FREQ_16000000HZ, .agc_mode = MXL_SINGLE_AGC, .tracking_filter = MXL_TF_OFF, .rssi_enable = MXL_RSSI_ENABLE, .cap_select = MXL_CAP_SEL_ENABLE, .div_out = MXL_DIV_OUT_4, .clock_out = MXL_CLOCK_OUT_DISABLE, .output_load = MXL5005S_IF_OUTPUT_LOAD_200_OHM, .top = MXL5005S_TOP_25P2, .mod_mode = MXL_DIGITAL_MODE, .if_mode = MXL_ZERO_IF, .AgcMasterByte = 0x00, }; static struct mc44s803_config af9015_mc44s803_config = { .i2c_address = 0x60, .dig_out = 1, }; static struct tda18218_config af9015_tda18218_config = { .i2c_address = 0x60, .i2c_wr_max = 21, /* max wr bytes AF9015 I2C adap can handle at once */ }; static struct mxl5007t_config af9015_mxl5007t_config = { .xtal_freq_hz = MxL_XTAL_24_MHZ, .if_freq_hz = MxL_IF_4_57_MHZ, }; static int af9015_tuner_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap_to_d(adap); struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; struct i2c_client *client; struct i2c_adapter *adapter; int ret; dev_dbg(&intf->dev, "adap id %u\n", adap->id); client = state->demod_i2c_client[adap->id]; adapter = state->af9013_pdata[adap->id].get_i2c_adapter(client); switch (state->af9013_pdata[adap->id].tuner) { case AF9013_TUNER_MT2060: case AF9013_TUNER_MT2060_2: ret = dvb_attach(mt2060_attach, adap->fe[0], adapter, &af9015_mt2060_config, state->mt2060_if1[adap->id]) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_QT1010: case AF9013_TUNER_QT1010A: ret = dvb_attach(qt1010_attach, adap->fe[0], adapter, &af9015_qt1010_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_TDA18271: ret = dvb_attach(tda18271_attach, adap->fe[0], 0x60, adapter, &af9015_tda18271_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_TDA18218: ret = dvb_attach(tda18218_attach, adap->fe[0], adapter, &af9015_tda18218_config) == NULL ? 
-ENODEV : 0; break; case AF9013_TUNER_MXL5003D: ret = dvb_attach(mxl5005s_attach, adap->fe[0], adapter, &af9015_mxl5003_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_MXL5005D: case AF9013_TUNER_MXL5005R: ret = dvb_attach(mxl5005s_attach, adap->fe[0], adapter, &af9015_mxl5005_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_ENV77H11D5: ret = dvb_attach(dvb_pll_attach, adap->fe[0], 0x60, adapter, DVB_PLL_TDA665X) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_MC44S803: ret = dvb_attach(mc44s803_attach, adap->fe[0], adapter, &af9015_mc44s803_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_MXL5007T: ret = dvb_attach(mxl5007t_attach, adap->fe[0], adapter, 0x60, &af9015_mxl5007t_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_UNKNOWN: default: dev_err(&intf->dev, "unknown tuner, tuner id %02x\n", state->af9013_pdata[adap->id].tuner); ret = -ENODEV; } if (adap->fe[0]->ops.tuner_ops.init) { state->tuner_init[adap->id] = adap->fe[0]->ops.tuner_ops.init; adap->fe[0]->ops.tuner_ops.init = af9015_tuner_init; } if (adap->fe[0]->ops.tuner_ops.sleep) { state->tuner_sleep[adap->id] = adap->fe[0]->ops.tuner_ops.sleep; adap->fe[0]->ops.tuner_ops.sleep = af9015_tuner_sleep; } return ret; } static int af9015_pid_filter_ctrl(struct dvb_usb_adapter *adap, int onoff) { struct af9015_state *state = adap_to_priv(adap); struct af9013_platform_data *pdata = &state->af9013_pdata[adap->id]; int ret; mutex_lock(&state->fe_mutex); ret = pdata->pid_filter_ctrl(adap->fe[0], onoff); mutex_unlock(&state->fe_mutex); return ret; } static int af9015_pid_filter(struct dvb_usb_adapter *adap, int index, u16 pid, int onoff) { struct af9015_state *state = adap_to_priv(adap); struct af9013_platform_data *pdata = &state->af9013_pdata[adap->id]; int ret; mutex_lock(&state->fe_mutex); ret = pdata->pid_filter(adap->fe[0], index, pid, onoff); mutex_unlock(&state->fe_mutex); return ret; } static int af9015_init(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); 
struct usb_interface *intf = d->intf; int ret; dev_dbg(&intf->dev, "\n"); mutex_init(&state->fe_mutex); /* init RC canary */ ret = regmap_write(state->regmap, 0x98e9, 0xff); if (ret) goto error; error: return ret; } #if IS_ENABLED(CONFIG_RC_CORE) struct af9015_rc_setup { unsigned int id; char *rc_codes; }; static char *af9015_rc_setup_match(unsigned int id, const struct af9015_rc_setup *table) { for (; table->rc_codes; table++) if (table->id == id) return table->rc_codes; return NULL; } static const struct af9015_rc_setup af9015_rc_setup_modparam[] = { { AF9015_REMOTE_A_LINK_DTU_M, RC_MAP_ALINK_DTU_M }, { AF9015_REMOTE_MSI_DIGIVOX_MINI_II_V3, RC_MAP_MSI_DIGIVOX_II }, { AF9015_REMOTE_MYGICTV_U718, RC_MAP_TOTAL_MEDIA_IN_HAND }, { AF9015_REMOTE_DIGITTRADE_DVB_T, RC_MAP_DIGITTRADE }, { AF9015_REMOTE_AVERMEDIA_KS, RC_MAP_AVERMEDIA_RM_KS }, { } }; static const struct af9015_rc_setup af9015_rc_setup_hashes[] = { { 0xb8feb708, RC_MAP_MSI_DIGIVOX_II }, { 0xa3703d00, RC_MAP_ALINK_DTU_M }, { 0x9b7dc64e, RC_MAP_TOTAL_MEDIA_IN_HAND }, /* MYGICTV U718 */ { 0x5d49e3db, RC_MAP_DIGITTRADE }, /* LC-Power LC-USB-DVBT */ { } }; static int af9015_rc_query(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; u8 buf[17]; /* read registers needed to detect remote controller code */ ret = regmap_bulk_read(state->regmap, 0x98d9, buf, sizeof(buf)); if (ret) goto error; /* If any of these are non-zero, assume invalid data */ if (buf[1] || buf[2] || buf[3]) { dev_dbg(&intf->dev, "invalid data\n"); return 0; } /* Check for repeat of previous code */ if ((state->rc_repeat != buf[6] || buf[0]) && !memcmp(&buf[12], state->rc_last, 4)) { dev_dbg(&intf->dev, "key repeated\n"); rc_repeat(d->rc_dev); state->rc_repeat = buf[6]; return 0; } /* Only process key if canary killed */ if (buf[16] != 0xff && buf[0] != 0x01) { enum rc_proto proto; dev_dbg(&intf->dev, "key pressed %*ph\n", 4, buf + 12); /* Reset the canary */ ret = 
regmap_write(state->regmap, 0x98e9, 0xff); if (ret) goto error; /* Remember this key */ memcpy(state->rc_last, &buf[12], 4); if (buf[14] == (u8)~buf[15]) { if (buf[12] == (u8)~buf[13]) { /* NEC */ state->rc_keycode = RC_SCANCODE_NEC(buf[12], buf[14]); proto = RC_PROTO_NEC; } else { /* NEC extended*/ state->rc_keycode = RC_SCANCODE_NECX(buf[12] << 8 | buf[13], buf[14]); proto = RC_PROTO_NECX; } } else { /* 32 bit NEC */ state->rc_keycode = RC_SCANCODE_NEC32(buf[12] << 24 | buf[13] << 16 | buf[14] << 8 | buf[15]); proto = RC_PROTO_NEC32; } rc_keydown(d->rc_dev, proto, state->rc_keycode, 0); } else { dev_dbg(&intf->dev, "no key press\n"); /* Invalidate last keypress */ /* Not really needed, but helps with debug */ state->rc_last[2] = state->rc_last[3]; } state->rc_repeat = buf[6]; state->rc_failed = false; error: if (ret) { dev_warn(&intf->dev, "rc query failed %d\n", ret); /* allow random errors as dvb-usb will stop polling on error */ if (!state->rc_failed) ret = 0; state->rc_failed = true; } return ret; } static int af9015_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) { struct af9015_state *state = d_to_priv(d); u16 vid = le16_to_cpu(d->udev->descriptor.idVendor); if (state->ir_mode == AF9015_IR_MODE_DISABLED) return 0; /* try to load remote based module param */ if (!rc->map_name) rc->map_name = af9015_rc_setup_match(dvb_usb_af9015_remote, af9015_rc_setup_modparam); /* try to load remote based eeprom hash */ if (!rc->map_name) rc->map_name = af9015_rc_setup_match(state->eeprom_sum, af9015_rc_setup_hashes); /* try to load remote based USB iManufacturer string */ if (!rc->map_name && vid == USB_VID_AFATECH) { /* * Check USB manufacturer and product strings and try * to determine correct remote in case of chip vendor * reference IDs are used. * DO NOT ADD ANYTHING NEW HERE. Use hashes instead. 
*/ char manufacturer[10]; memset(manufacturer, 0, sizeof(manufacturer)); usb_string(d->udev, d->udev->descriptor.iManufacturer, manufacturer, sizeof(manufacturer)); if (!strcmp("MSI", manufacturer)) { /* * iManufacturer 1 MSI * iProduct 2 MSI K-VOX */ rc->map_name = af9015_rc_setup_match(AF9015_REMOTE_MSI_DIGIVOX_MINI_II_V3, af9015_rc_setup_modparam); } } /* load empty to enable rc */ if (!rc->map_name) rc->map_name = RC_MAP_EMPTY; rc->allowed_protos = RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32; rc->query = af9015_rc_query; rc->interval = 500; return 0; } #else #define af9015_get_rc_config NULL #endif static int af9015_regmap_write(void *context, const void *data, size_t count) { struct dvb_usb_device *d = context; struct usb_interface *intf = d->intf; int ret; u16 reg = ((u8 *)data)[0] << 8 | ((u8 *)data)[1] << 0; u8 *val = &((u8 *)data)[2]; const unsigned int len = count - 2; struct req_t req = {WRITE_MEMORY, 0, reg, 0, 0, len, val}; ret = af9015_ctrl_msg(d, &req); if (ret) goto err; return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_regmap_read(void *context, const void *reg_buf, size_t reg_size, void *val_buf, size_t val_size) { struct dvb_usb_device *d = context; struct usb_interface *intf = d->intf; int ret; u16 reg = ((u8 *)reg_buf)[0] << 8 | ((u8 *)reg_buf)[1] << 0; u8 *val = &((u8 *)val_buf)[0]; const unsigned int len = val_size; struct req_t req = {READ_MEMORY, 0, reg, 0, 0, len, val}; ret = af9015_ctrl_msg(d, &req); if (ret) goto err; return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_probe(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; struct usb_device *udev = interface_to_usbdev(intf); int ret; char manufacturer[sizeof("ITE Technologies, Inc.")]; static const struct regmap_config regmap_config = { .reg_bits = 16, .val_bits = 8, }; static const struct regmap_bus regmap_bus = { .read = af9015_regmap_read, 
.write = af9015_regmap_write, }; dev_dbg(&intf->dev, "\n"); memset(manufacturer, 0, sizeof(manufacturer)); usb_string(udev, udev->descriptor.iManufacturer, manufacturer, sizeof(manufacturer)); /* * There is two devices having same ID but different chipset. One uses * AF9015 and the other IT9135 chipset. Only difference seen on lsusb * is iManufacturer string. * * idVendor 0x0ccd TerraTec Electronic GmbH * idProduct 0x0099 * bcdDevice 2.00 * iManufacturer 1 Afatech * iProduct 2 DVB-T 2 * * idVendor 0x0ccd TerraTec Electronic GmbH * idProduct 0x0099 * bcdDevice 2.00 * iManufacturer 1 ITE Technologies, Inc. * iProduct 2 DVB-T TV Stick */ if ((le16_to_cpu(udev->descriptor.idVendor) == USB_VID_TERRATEC) && (le16_to_cpu(udev->descriptor.idProduct) == 0x0099)) { if (!strcmp("ITE Technologies, Inc.", manufacturer)) { ret = -ENODEV; dev_dbg(&intf->dev, "rejecting device\n"); goto err; } } state->regmap = regmap_init(&intf->dev, &regmap_bus, d, &regmap_config); if (IS_ERR(state->regmap)) { ret = PTR_ERR(state->regmap); goto err; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static void af9015_disconnect(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; dev_dbg(&intf->dev, "\n"); regmap_exit(state->regmap); } /* * Interface 0 is used by DVB-T receiver and * interface 1 is for remote controller (HID) */ static const struct dvb_usb_device_properties af9015_props = { .driver_name = KBUILD_MODNAME, .owner = THIS_MODULE, .adapter_nr = adapter_nr, .size_of_priv = sizeof(struct af9015_state), .generic_bulk_ctrl_endpoint = 0x02, .generic_bulk_ctrl_endpoint_response = 0x81, .probe = af9015_probe, .disconnect = af9015_disconnect, .identify_state = af9015_identify_state, .firmware = AF9015_FIRMWARE, .download_firmware = af9015_download_firmware, .i2c_algo = &af9015_i2c_algo, .read_config = af9015_read_config, .frontend_attach = af9015_af9013_frontend_attach, .frontend_detach = af9015_frontend_detach, 
.tuner_attach = af9015_tuner_attach,
	.init = af9015_init,
	.get_rc_config = af9015_get_rc_config,
	.get_stream_config = af9015_get_stream_config,
	.streaming_ctrl = af9015_streaming_ctrl,

	.get_adapter_count = af9015_get_adapter_count,
	/*
	 * Two adapter descriptions that differ only in the first argument
	 * of DVB_USB_STREAM_BULK() (0x84 vs 0x85).
	 */
	.adapter = {
		{
			.caps = DVB_USB_ADAP_HAS_PID_FILTER |
				DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF,
			.pid_filter_count = 32,
			.pid_filter = af9015_pid_filter,
			.pid_filter_ctrl = af9015_pid_filter_ctrl,

			.stream = DVB_USB_STREAM_BULK(0x84, 6, 87 * 188),
		}, {
			.caps = DVB_USB_ADAP_HAS_PID_FILTER |
				DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF,
			.pid_filter_count = 32,
			.pid_filter = af9015_pid_filter,
			.pid_filter_ctrl = af9015_pid_filter_ctrl,

			.stream = DVB_USB_STREAM_BULK(0x85, 6, 87 * 188),
		},
	},
};

/*
 * USB IDs bound to this driver. Every entry passes af9015_props and a
 * human-readable device name; the last argument is either an RC_MAP_*
 * name or NULL.
 * NOTE(review): the last argument is presumably the default remote-control
 * keymap - confirm against the DVB_USB_DEVICE() definition.
 */
static const struct usb_device_id af9015_id_table[] = {
	{ DVB_USB_DEVICE(USB_VID_AFATECH, USB_PID_AFATECH_AF9015_9015,
		&af9015_props, "Afatech AF9015 reference design", NULL) },
	{ DVB_USB_DEVICE(USB_VID_AFATECH, USB_PID_AFATECH_AF9015_9016,
		&af9015_props, "Afatech AF9015 reference design", NULL) },
	{ DVB_USB_DEVICE(USB_VID_LEADTEK, USB_PID_WINFAST_DTV_DONGLE_GOLD,
		&af9015_props, "Leadtek WinFast DTV Dongle Gold", RC_MAP_LEADTEK_Y04G0051) },
	{ DVB_USB_DEVICE(USB_VID_PINNACLE, USB_PID_PINNACLE_PCTV71E,
		&af9015_props, "Pinnacle PCTV 71e", NULL) },
	{ DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_399U,
		&af9015_props, "KWorld PlusTV Dual DVB-T Stick (DVB-T 399U)", NULL) },
	{ DVB_USB_DEVICE(USB_VID_VISIONPLUS, USB_PID_TINYTWIN,
		&af9015_props, "DigitalNow TinyTwin", RC_MAP_AZUREWAVE_AD_TU700) },
	{ DVB_USB_DEVICE(USB_VID_VISIONPLUS, USB_PID_AZUREWAVE_AD_TU700,
		&af9015_props, "TwinHan AzureWave AD-TU700(704J)", RC_MAP_AZUREWAVE_AD_TU700) },
	{ DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_CINERGY_T_USB_XE_REV2,
		&af9015_props, "TerraTec Cinergy T USB XE", NULL) },
	{ DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_PC160_2T,
		&af9015_props, "KWorld PlusTV Dual DVB-T PCI (DVB-T PC160-2T)", NULL) },
	{ DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_VOLAR_X,
&af9015_props, "AVerMedia AVerTV DVB-T Volar X", RC_MAP_AVERMEDIA_M135A) }, { DVB_USB_DEVICE(USB_VID_XTENSIONS, USB_PID_XTENSIONS_XD_380, &af9015_props, "Xtensions XD-380", NULL) }, { DVB_USB_DEVICE(USB_VID_MSI_2, USB_PID_MSI_DIGIVOX_DUO, &af9015_props, "MSI DIGIVOX Duo", RC_MAP_MSI_DIGIVOX_III) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_VOLAR_X_2, &af9015_props, "Fujitsu-Siemens Slim Mobile USB DVB-T", NULL) }, { DVB_USB_DEVICE(USB_VID_TELESTAR, USB_PID_TELESTAR_STARSTICK_2, &af9015_props, "Telestar Starstick 2", NULL) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A309, &af9015_props, "AVerMedia A309", NULL) }, { DVB_USB_DEVICE(USB_VID_MSI_2, USB_PID_MSI_DIGI_VOX_MINI_III, &af9015_props, "MSI Digi VOX mini III", RC_MAP_MSI_DIGIVOX_III) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_395U, &af9015_props, "KWorld USB DVB-T TV Stick II (VS-DVB-T 395U)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_395U_2, &af9015_props, "KWorld USB DVB-T TV Stick II (VS-DVB-T 395U)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_395U_3, &af9015_props, "KWorld USB DVB-T TV Stick II (VS-DVB-T 395U)", NULL) }, { DVB_USB_DEVICE(USB_VID_AFATECH, USB_PID_TREKSTOR_DVBT, &af9015_props, "TrekStor DVB-T USB Stick", RC_MAP_TREKSTOR) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A850, &af9015_props, "AverMedia AVerTV Volar Black HD (A850)", NULL) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A805, &af9015_props, "AverMedia AVerTV Volar GPS 805 (A805)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_CONCEPTRONIC_CTVDIGRCU, &af9015_props, "Conceptronic USB2.0 DVB-T CTVDIGRCU V3.0", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_MC810, &af9015_props, "KWorld Digital MC-810", NULL) }, { DVB_USB_DEVICE(USB_VID_KYE, USB_PID_GENIUS_TVGO_DVB_T03, &af9015_props, "Genius TVGo DVB-T03", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_399U_2, &af9015_props, "KWorld PlusTV Dual DVB-T Stick (DVB-T 
399U)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_PC160_T, &af9015_props, "KWorld PlusTV DVB-T PCI Pro Card (DVB-T PC160-T)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_SVEON_STV20, &af9015_props, "Sveon STV20 Tuner USB DVB-T HDTV", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_TINYTWIN_2, &af9015_props, "DigitalNow TinyTwin v2", RC_MAP_DIGITALNOW_TINYTWIN) }, { DVB_USB_DEVICE(USB_VID_LEADTEK, USB_PID_WINFAST_DTV2000DS, &af9015_props, "Leadtek WinFast DTV2000DS", RC_MAP_LEADTEK_Y04G0051) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_UB383_T, &af9015_props, "KWorld USB DVB-T Stick Mobile (UB383-T)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_395U_4, &af9015_props, "KWorld USB DVB-T TV Stick II (VS-DVB-T 395U)", NULL) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A815M, &af9015_props, "AverMedia AVerTV Volar M (A815Mac)", NULL) }, { DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_CINERGY_T_STICK_RC, &af9015_props, "TerraTec Cinergy T Stick RC", RC_MAP_TERRATEC_SLIM_2) }, /* XXX: that same ID [0ccd:0099] is used by af9035 driver too */ { DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_CINERGY_T_STICK_DUAL_RC, &af9015_props, "TerraTec Cinergy T Stick Dual RC", RC_MAP_TERRATEC_SLIM) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A850T, &af9015_props, "AverMedia AVerTV Red HD+ (A850T)", NULL) }, { DVB_USB_DEVICE(USB_VID_GTEK, USB_PID_TINYTWIN_3, &af9015_props, "DigitalNow TinyTwin v3", RC_MAP_DIGITALNOW_TINYTWIN) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_SVEON_STV22, &af9015_props, "Sveon STV22 Dual USB DVB-T Tuner HDTV", RC_MAP_MSI_DIGIVOX_III) }, { } }; MODULE_DEVICE_TABLE(usb, af9015_id_table); /* usb specific object needed to register this driver with the usb subsystem */ static struct usb_driver af9015_usb_driver = { .name = KBUILD_MODNAME, .id_table = af9015_id_table, .probe = dvb_usbv2_probe, .disconnect = dvb_usbv2_disconnect, .suspend = dvb_usbv2_suspend, .resume = 
dvb_usbv2_resume,
	.reset_resume = dvb_usbv2_reset_resume,
	.no_dynamic_id = 1,
	.soft_unbind = 1,
};

/* Driver registration and module metadata */
module_usb_driver(af9015_usb_driver);

MODULE_AUTHOR("Antti Palosaari <crope@iki.fi>");
MODULE_DESCRIPTION("Afatech AF9015 driver");
MODULE_LICENSE("GPL");
MODULE_FIRMWARE(AF9015_FIRMWARE);
|
| 4 1 11 5 5 1 1 5 5 5 5 10 9 1 10 10 10 45 45 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 
1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 
1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 | /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 */ /* * Handle hardware traps and faults. 
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/context_tracking.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/kmsan.h>
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kgdb.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/kexec.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/static_call.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
#include <linux/iommu.h>
#include <linux/ubsan.h>

#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/realmode.h>
#include <asm/text-patching.h>
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/fred.h>
#include <asm/fpu/api.h>
#include <asm/cpu.h>
#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
#include <asm/alternative.h>
#include <asm/fpu/xstate.h>
#include <asm/vm86.h>
#include <asm/umip.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/vdso.h>
#include <asm/tdx.h>
#include <asm/cfi.h>
#include <asm/msr.h>

#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
#endif

#include <asm/proto.h>

/* Bitmap with one bit per interrupt vector (NR_VECTORS bits) */
DECLARE_BITMAP(system_vectors, NR_VECTORS);

/*
 * Return non-zero when @addr is at or above TASK_SIZE_MAX and the 16-bit
 * value stored at @addr equals INSN_UD2; return 0 for any address below
 * TASK_SIZE_MAX without reading it.
 */
__always_inline int is_valid_bugaddr(unsigned long addr)
{
	if (addr < TASK_SIZE_MAX)
		return 0;

	/*
	 * We got #UD, if the text isn't readable we'd have gotten
	 * a different exception.
	 */
	return *(unsigned short *)addr == INSN_UD2;
}

/*
 * Check for UD1 or UD2, accounting for Address Size Override Prefixes.
* If it's a UD1, further decode to determine its use:
 *
 * FineIBT:      d6                udb
 * FineIBT:      f0 75 f9          lock jne . - 6
 * UBSan{0}:     67 0f b9 00       ud1    (%eax),%eax
 * UBSan{10}:    67 0f b9 40 10    ud1    0x10(%eax),%eax
 * static_call:  0f b9 cc          ud1    %esp,%ecx
 * __WARN_trap:  67 48 0f b9 3a    ud1    (%edx),%reg
 *
 * Notably, since __WARN_trap can use all registers, the distinction between
 * UD1 users is through R/M.
 *
 * @addr: address of the instruction to decode; addresses below
 *        TASK_SIZE_MAX yield BUG_NONE without being read
 * @imm:  set to 0, the decoded displacement, or (for the (%edx) form) the
 *        ModRM reg number once a UD1 has been recognised; not written on
 *        the other return paths
 * @len:  set to the number of bytes consumed on every return except
 *        BUG_NONE
 *
 * Returns one of BUG_NONE, BUG_LOCK, BUG_UDB, BUG_UD2, BUG_UD1,
 * BUG_UD1_UBSAN or BUG_UD1_WARN.
 */
__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
{
	unsigned long start = addr;
	u8 v, reg, rm, rex = 0;
	int type = BUG_UD1;
	bool lock = false;

	if (addr < TASK_SIZE_MAX)
		return BUG_NONE;

	/*
	 * Skip prefix bytes: address-size override, LOCK (remembered in
	 * @lock) and 0x4x bytes (the last one is kept in @rex).
	 */
	for (;;) {
		v = *(u8 *)(addr++);
		if (v == INSN_ASOP)
			continue;

		if (v == INSN_LOCK) {
			lock = true;
			continue;
		}

		if ((v & 0xf0) == 0x40) {
			rex = v;
			continue;
		}

		break;
	}

	/* First non-prefix byte */
	switch (v) {
	case 0x70 ... 0x7f: /* Jcc.d8 */
		addr += 1; /* d8 */
		*len = addr - start;
		WARN_ON_ONCE(!lock);
		return BUG_LOCK;

	case 0xd6:
		*len = addr - start;
		return BUG_UDB;

	case OPCODE_ESCAPE:
		break;

	default:
		return BUG_NONE;
	}

	/* Second opcode byte: UD2 or UD1, anything else is not ours */
	v = *(u8 *)(addr++);
	if (v == SECOND_BYTE_OPCODE_UD2) {
		*len = addr - start;
		return BUG_UD2;
	}

	if (v != SECOND_BYTE_OPCODE_UD1)
		return BUG_NONE;

	*imm = 0;
	v = *(u8 *)(addr++);		/* ModRM */

	if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4)
		addr++;			/* SIB */

	/* Extend the 3-bit ModRM fields with the REX.R / REX.B bits */
	reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex);
	rm  = X86_MODRM_RM(v)  + 8*!!X86_REX_B(rex);

	/* Decode immediate, if present */
	switch (X86_MODRM_MOD(v)) {
	case 0:	if (X86_MODRM_RM(v) == 5)
			addr += 4; /* RIP + disp32 */

		if (rm == 0)	/* (%eax) */
			type = BUG_UD1_UBSAN;

		if (rm == 2) {	/* (%edx) */
			*imm = reg;
			type = BUG_UD1_WARN;
		}
		break;

	case 1: *imm = *(s8 *)addr;
		addr += 1;
		if (rm == 0)	/* (%eax) */
			type = BUG_UD1_UBSAN;
		break;

	case 2: *imm = *(s32 *)addr;
		addr += 4;
		if (rm == 0)	/* (%eax) */
			type = BUG_UD1_UBSAN;
		break;

	case 3: break;
	}

	/* record instruction length */
	*len = addr - start;

	return type;
}

/*
 * Fetch the value of register number @nr from @regs, using
 * pt_regs_offset() to locate it. A negative offset triggers a one-time
 * warning and yields 0 ("-0" is just 0).
 */
static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr)
{
	int offset = pt_regs_offset(regs, nr);
	if (WARN_ON_ONCE(offset < -0))
return 0;
	/* read one unsigned long at byte offset @offset inside @regs */
	return *((unsigned long *)((void *)regs + offset));
}

#ifdef HAVE_ARCH_BUG_FORMAT_ARGS
DEFINE_STATIC_CALL(WARN_trap, __WARN_trap);
EXPORT_STATIC_CALL_TRAMP(WARN_trap);

/*
 * Create a va_list from an exception context.
 *
 * @args: storage that receives both the register save area and the
 *        va_list bookkeeping
 * @regs: register state captured at the exception
 *
 * Returns a pointer to the va_list embedded in @args.
 */
void *__warn_args(struct arch_va_list *args, struct pt_regs *regs)
{
	/*
	 * Register save area; populate with function call argument registers
	 */
	args->regs[0] = regs->di;
	args->regs[1] = regs->si;
	args->regs[2] = regs->dx;
	args->regs[3] = regs->cx;
	args->regs[4] = regs->r8;
	args->regs[5] = regs->r9;

	/*
	 * From the ABI document:
	 *
	 * @gp_offset - the element holds the offset in bytes from
	 * reg_save_area to the place where the next available general purpose
	 * argument register is saved. In case all argument registers have
	 * been exhausted, it is set to the value 48 (6*8).
	 *
	 * @fp_offset - the element holds the offset in bytes from
	 * reg_save_area to the place where the next available floating point
	 * argument is saved. In case all argument registers have been
	 * exhausted, it is set to the value 176 (6*8 + 8*16)
	 *
	 * @overflow_arg_area - this pointer is used to fetch arguments passed
	 * on the stack. It is initialized with the address of the first
	 * argument passed on the stack, if any, and then always updated to
	 * point to the start of the next argument on the stack.
	 *
	 * @reg_save_area - the element points to the start of the register
	 * save area.
	 *
	 * Notably the vararg starts with the second argument and there are no
	 * floating point arguments in the kernel.
	 */
	args->args.gp_offset = 1*8;		/* skip the first argument register */
	args->args.fp_offset = 6*8 + 8*16;	/* "exhausted": no FP arguments */
	args->args.reg_save_area = &args->regs;
	args->args.overflow_arg_area = (void *)regs->sp;

	/*
	 * If the exception came from __WARN_trap, there is a return
	 * address on the stack, skip that. This is why any __WARN_trap()
	 * caller must inhibit tail-call optimization.
*/
	if ((void *)regs->ip == &__WARN_trap)
		args->args.overflow_arg_area += 8;

	return &args->args;
}
#endif /* HAVE_ARCH_BUG_FORMAT_ARGS */

/*
 * Handle the cases of a trap that do not end in a signal.
 *
 * Returns 0 when the trap was fully dealt with here (forwarded to vm86,
 * or fixed up through the exception / vDSO fixup tables). Returns -1 when
 * the caller still has to deliver a signal; in that case
 * tsk->thread.error_code and tsk->thread.trap_nr have been filled in.
 * An unfixable trap outside user mode and vm86 mode calls die().
 */
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
		  struct pt_regs *regs,	long error_code)
{
	if (v8086_mode(regs)) {
		/*
		 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
		 * On nmi (interrupt 2), do_trap should not be called.
		 */
		if (trapnr < X86_TRAP_UD) {
			if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
						error_code, trapnr))
				return 0;
		}
	} else if (!user_mode(regs)) {
		if (fixup_exception(regs, trapnr, error_code, 0))
			return 0;

		tsk->thread.error_code = error_code;
		tsk->thread.trap_nr = trapnr;
		die(str, regs, error_code);
	} else {
		if (fixup_vdso_exception(regs, trapnr, error_code, 0))
			return 0;
	}

	/*
	 * We want error_code and trap_nr set for userspace faults and
	 * kernelspace faults which result in die(), but not
	 * kernelspace faults which are fixed up.  die() gives the
	 * process no chance to handle the signal and notice the
	 * kernel fault information, so that won't result in polluting
	 * the information about previously queued, but not yet
	 * delivered, faults.  See also exc_general_protection below.
*/
	tsk->thread.error_code = error_code;
	tsk->thread.trap_nr = trapnr;

	return -1;
}

/*
 * Log one line describing a signal about to be sent to @tsk. Printed only
 * when show_unhandled_signals is set, unhandled_signal() reports the
 * signal as unhandled for @tsk, and the printk rate limit allows it.
 */
static void show_signal(struct task_struct *tsk, int signr,
			const char *type, const char *desc,
			struct pt_regs *regs, long error_code)
{
	if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
	    printk_ratelimit()) {
		pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx",
			tsk->comm, task_pid_nr(tsk), type, desc,
			regs->ip, regs->sp, error_code);
		print_vma_addr(KERN_CONT " in ", regs->ip);
		pr_cont("\n");
	}
}

/*
 * Common trap tail for the current task: give do_trap_no_signal() a
 * chance to settle the trap, otherwise log it and force @signr.
 * A zero @sicode selects force_sig(); any other value selects
 * force_sig_fault() with @sicode and @addr.
 */
static void
do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
	long error_code, int sicode, void __user *addr)
{
	struct task_struct *tsk = current;

	if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
		return;

	show_signal(tsk, signr, "trap ", str, regs, error_code);

	if (!sicode)
		force_sig(signr);
	else
		force_sig_fault(signr, sicode, addr);
}
NOKPROBE_SYMBOL(do_trap);

/*
 * Run the DIE_TRAP notifier chain and, unless a notifier returns
 * NOTIFY_STOP, call do_trap() bracketed by cond_local_irq_enable() /
 * cond_local_irq_disable().
 */
static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
	unsigned long trapnr, int signr, int sicode, void __user *addr)
{
	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");

	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
			NOTIFY_STOP) {
		cond_local_irq_enable(regs);
		do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
		cond_local_irq_disable(regs);
	}
}

/*
 * Posix requires to provide the address of the faulting instruction for
 * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t.
 *
 * This address is usually regs->ip, but when an uprobe moved the code out
 * of line then regs->ip points to the XOL code which would confuse
 * anything which analyzes the fault address vs. the unmodified binary. If
 * a trap happened in XOL code then uprobe maps regs->ip back to the
 * original instruction address.
*/
static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs)
{
	return (void __user *)uprobe_get_trap_addr(regs);
}

/* Divide error: SIGFPE / FPE_INTDIV at the trapping instruction address */
DEFINE_IDTENTRY(exc_divide_error)
{
	do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE,
		      FPE_INTDIV, error_get_trap_addr(regs));
}

/* Overflow: SIGSEGV, sent without si_code/address (sicode is 0) */
DEFINE_IDTENTRY(exc_overflow)
{
	do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
}

/*
 * Invalid opcode: SIGILL / ILL_ILLOPN at the trapping instruction address.
 * The function has external linkage only when CONFIG_X86_F00F_BUG is set.
 */
#ifdef CONFIG_X86_F00F_BUG
void handle_invalid_op(struct pt_regs *regs)
#else
static inline void handle_invalid_op(struct pt_regs *regs)
#endif
{
	do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL,
		      ILL_ILLOPN, error_get_trap_addr(regs));
}

/*
 * Decode the instruction at regs->ip with decode_bug() and dispatch on the
 * result (WARN/BUG reporting, CFI failure handling, UBSAN trap message).
 *
 * Returns true when the event was handled and execution may continue,
 * false otherwise (including when the instruction is not a recognised
 * BUG encoding).
 */
noinstr bool handle_bug(struct pt_regs *regs)
{
	unsigned long addr = regs->ip;
	bool handled = false;
	int ud_type, ud_len;
	s32 ud_imm;

	ud_type = decode_bug(addr, &ud_imm, &ud_len);
	if (ud_type == BUG_NONE)
		return handled;

	/*
	 * All lies, just get the WARN/BUG out.
	 */
	instrumentation_begin();
	/*
	 * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
	 * is a rare case that uses @regs without passing them to
	 * irqentry_enter().
	 */
	kmsan_unpoison_entry_regs(regs);
	/*
	 * Since we're emulating a CALL with exceptions, restore the interrupt
	 * state to what it was at the exception site.
	 */
	if (regs->flags & X86_EFLAGS_IF)
		raw_local_irq_enable();

	switch (ud_type) {
	case BUG_UD1_WARN:
		/* ud_imm is a register number; that register holds the bug entry */
		if (report_bug_entry((void *)pt_regs_val(regs, ud_imm), regs) == BUG_TRAP_TYPE_WARN)
			handled = true;
		break;

	case BUG_UD2:
		if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
			handled = true;
			break;
		}
		fallthrough;

	case BUG_UDB:
	case BUG_LOCK:
		if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
			handled = true;
			break;
		}
		break;

	case BUG_UD1_UBSAN:
		/* message only; "handled" stays false on this path */
		if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
			pr_crit("%s at %pS\n",
				report_ubsan_failure(ud_imm),
				(void *)regs->ip);
		}
		break;

	default:
		break;
	}

	/*
	 * When continuing, and regs->ip hasn't changed, move it to the next
	 * instruction. When not continuing execution, restore the instruction
	 * pointer.
*/
	if (handled) {
		if (regs->ip == addr)
			regs->ip += ud_len;
	} else {
		regs->ip = addr;
	}

	/* undo the raw_local_irq_enable() done before the switch */
	if (regs->flags & X86_EFLAGS_IF)
		raw_local_irq_disable();
	instrumentation_end();

	return handled;
}

/*
 * #UD entry point: kernel-mode faults are first offered to handle_bug();
 * everything else goes through the regular invalid-opcode path.
 */
DEFINE_IDTENTRY_RAW(exc_invalid_op)
{
	irqentry_state_t state;

	/*
	 * We use UD2 as a short encoding for 'CALL __WARN', as such
	 * handle it before exception entry to avoid recursive WARN
	 * in case exception entry is the one triggering WARNs.
	 */
	if (!user_mode(regs) && handle_bug(regs))
		return;

	state = irqentry_enter(regs);
	instrumentation_begin();
	handle_invalid_op(regs);
	instrumentation_end();
	irqentry_exit(regs, state);
}

/* Coprocessor segment overrun: SIGFPE */
DEFINE_IDTENTRY(exc_coproc_segment_overrun)
{
	do_error_trap(regs, 0, "coprocessor segment overrun",
		      X86_TRAP_OLD_MF, SIGFPE, 0, NULL);
}

/* Invalid TSS: SIGSEGV */
DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss)
{
	do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV,
		      0, NULL);
}

/* Segment not present: SIGBUS */
DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present)
{
	do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP,
		      SIGBUS, 0, NULL);
}

/* Stack segment fault: SIGBUS */
DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment)
{
	do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS,
		      0, NULL);
}

/*
 * Alignment check: after the DIE_TRAP notifiers, a fault outside user mode
 * calls die() with a split-lock message; a user-mode fault is first offered
 * to handle_user_split_lock() and otherwise raises SIGBUS / BUS_ADRALN.
 */
DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
{
	char *str = "alignment check";

	if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
		return;

	if (!user_mode(regs))
		die("Split lock detected\n", regs, error_code);

	local_irq_enable();

	if (handle_user_split_lock(regs, error_code))
		goto out;

	do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
		error_code, BUS_ADRALN, NULL);

out:
	local_irq_disable();
}

#ifdef CONFIG_VMAP_STACK
/*
 * Report that a stack guard page was hit at @fault_address, naming the
 * stack described by @info, then die(). Declared __noreturn.
 */
__visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
						unsigned long fault_address,
						struct stack_info *info)
{
	const char *name = stack_type_name(info->type);

	printk(KERN_EMERG "BUG: %s stack guard page was hit at %px (stack is %px..%px)\n",
	       name, (void *)fault_address, info->begin, info->end);

	die("stack guard page", regs, 0);

	/* Be absolutely certain we don't return.
*/
	panic("%s stack guard hit", name);
}
#endif

/*
 * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64
 * version of exc_double_fault() as noreturn.  Otherwise the noreturn mismatch
 * between configs triggers objtool warnings.
 *
 * This is a temporary hack until we have compiler or plugin support for
 * annotating noreturns.
 */
#ifdef CONFIG_X86_ESPFIX64
#define always_true() true
#else
/* weak out-of-line definition; always returns true */
bool always_true(void);
bool __weak always_true(void) { return true; }
#endif

/*
 * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
 *
 * On x86_64, this is more or less a normal kernel entry.  Notwithstanding the
 * SDM's warnings about double faults being unrecoverable, returning works as
 * expected.  Presumably what the SDM actually means is that the CPU may get
 * the register state wrong on entry, so returning could be a bad idea.
 *
 * Various CPU engineers have promised that double faults due to an IRET fault
 * while the stack is read-only are, in fact, recoverable.
 *
 * On x86_32, this is entered through a task gate, and regs are synthesized
 * from the TSS.  Returning is, in principle, okay, but changes to regs will
 * be lost.  If, for some reason, we need to return to a context with modified
 * regs, the shim code could be adjusted to synchronize the registers.
 *
 * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs
 * to be read before doing anything else.
 */
DEFINE_IDTENTRY_DF(exc_double_fault)
{
	static const char str[] = "double fault";
	struct task_struct *tsk = current;

#ifdef CONFIG_VMAP_STACK
	/* CR2 is sampled right here in the declaration, ahead of all other work */
	unsigned long address = read_cr2();
	struct stack_info info;
#endif

#ifdef CONFIG_X86_ESPFIX64
	extern unsigned char native_irq_return_iret[];

	/*
	 * If IRET takes a non-IST fault on the espfix64 stack, then we
	 * end up promoting it to a doublefault.  In that case, take
	 * advantage of the fact that we're not using the normal (TSS.sp0)
	 * stack right now.
We can write a fake #GP(0) frame at TSS.sp0
	 * and then modify our own IRET frame so that, when we return,
	 * we land directly at the #GP(0) vector with the stack already
	 * set up according to its expectations.
	 *
	 * The net result is that our #GP handler will think that we
	 * entered from usermode with the bad user context.
	 *
	 * No need for nmi_enter() here because we don't use RCU.
	 */
	if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
		regs->cs == __KERNEL_CS &&
		regs->ip == (unsigned long)native_irq_return_iret)
	{
		/* one struct pt_regs below the address stored in TSS.sp0 */
		struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
		unsigned long *p = (unsigned long *)regs->sp;

		/*
		 * regs->sp points to the failing IRET frame on the
		 * ESPFIX64 stack.  Copy it to the entry stack.  This fills
		 * in gpregs->ss through gpregs->ip.
		 */
		gpregs->ip	= p[0];
		gpregs->cs	= p[1];
		gpregs->flags	= p[2];
		gpregs->sp	= p[3];
		gpregs->ss	= p[4];
		gpregs->orig_ax = 0;  /* Missing (lost) #GP error code */

		/*
		 * Adjust our frame so that we return straight to the #GP
		 * vector with the expected RSP value.  This is safe because
		 * we won't enable interrupts or schedule before we invoke
		 * general_protection, so nothing will clobber the stack
		 * frame we just set up.
		 *
		 * We will enter general_protection with kernel GSBASE,
		 * which is what the stub expects, given that the faulting
		 * RIP will be the IRET instruction.
		 */
		regs->ip = (unsigned long)asm_exc_general_protection;
		regs->sp = (unsigned long)&gpregs->orig_ax;

		return;
	}
#endif

	/*
	 * Reached when the espfix64 redirect above did not apply (or is
	 * compiled out): notify, record the fault in the task and report it.
	 */
	irqentry_nmi_enter(regs);
	instrumentation_begin();
	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);

	tsk->thread.error_code = error_code;
	tsk->thread.trap_nr = X86_TRAP_DF;

#ifdef CONFIG_VMAP_STACK
	/*
	 * If we overflow the stack into a guard page, the CPU will fail
	 * to deliver #PF and will send #DF instead.
Similarly, if we
	 * take any non-IST exception while too close to the bottom of
	 * the stack, the processor will get a page fault while
	 * delivering the exception and will generate a double fault.
	 *
	 * According to the SDM (footnote in 6.15 under "Interrupt 14 -
	 * Page-Fault Exception (#PF)"):
	 *
	 *   Processors update CR2 whenever a page fault is detected. If a
	 *   second page fault occurs while an earlier page fault is being
	 *   delivered, the faulting linear address of the second fault will
	 *   overwrite the contents of CR2 (replacing the previous
	 *   address). These updates to CR2 occur even if the page fault
	 *   results in a double fault or occurs during the delivery of a
	 *   double fault.
	 *
	 * The logic below has a small possibility of incorrectly diagnosing
	 * some errors as stack overflows.  For example, if the IDT or GDT
	 * gets corrupted such that #GP delivery fails due to a bad descriptor
	 * causing #GP and we hit this condition while CR2 coincidentally
	 * points to the stack guard page, we'll think we overflowed the
	 * stack.  Given that we're going to panic one way or another
	 * if this happens, this isn't necessarily worth fixing.
	 *
	 * If necessary, we could improve the test by only diagnosing
	 * a stack overflow if the saved RSP points within 47 bytes of
	 * the bottom of the stack: if RSP == tsk_stack + 48 and we
	 * take an exception, the stack is already aligned and there
	 * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
	 * possible error code, so a stack overflow would *not* double
	 * fault.  With any less space left, exception delivery could
	 * fail, and, as a practical matter, we've overflowed the
	 * stack even if the actual trigger for the double fault was
	 * something else.
*/ if (get_stack_guard_info((void *)address, &info)) handle_stack_overflow(regs, address, &info); #endif pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); if (always_true()) panic("Machine halted."); instrumentation_end(); } DEFINE_IDTENTRY(exc_bounds) { if (notify_die(DIE_TRAP, "bounds", regs, 0, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; cond_local_irq_enable(regs); if (!user_mode(regs)) die("bounds", regs, 0); do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL); cond_local_irq_disable(regs); } enum kernel_gp_hint { GP_NO_HINT, GP_NON_CANONICAL, GP_CANONICAL, GP_LASS_VIOLATION, GP_NULL_POINTER, }; static const char * const kernel_gp_hint_help[] = { [GP_NON_CANONICAL] = "probably for non-canonical address", [GP_CANONICAL] = "maybe for address", [GP_LASS_VIOLATION] = "probably LASS violation for address", [GP_NULL_POINTER] = "kernel NULL pointer dereference", }; /* * When an uncaught #GP occurs, try to determine the memory address accessed by * the instruction and return that address to the caller. Also, try to figure * out whether any part of the access to that address was non-canonical or * across privilege levels. */ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr) { u8 insn_buf[MAX_INSN_SIZE]; struct insn insn; int ret; if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return GP_NO_HINT; ret = insn_decode_kernel(&insn, insn_buf); if (ret < 0) return GP_NO_HINT; *addr = (unsigned long)insn_get_addr_ref(&insn, regs); if (*addr == -1UL) return GP_NO_HINT; #ifdef CONFIG_X86_64 /* Operand is in the kernel half */ if (*addr >= ~__VIRTUAL_MASK) return GP_CANONICAL; /* The last byte of the operand is not in the user canonical half */ if (*addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) return GP_NON_CANONICAL; /* * A NULL pointer dereference usually causes a #PF. However, it * can result in a #GP when LASS is active. 
Provide the same * hint in the rare case that the condition is hit without LASS. */ if (*addr < PAGE_SIZE) return GP_NULL_POINTER; /* * Assume that LASS caused the exception, because the address is * canonical and in the user half. */ if (cpu_feature_enabled(X86_FEATURE_LASS)) return GP_LASS_VIOLATION; #endif return GP_CANONICAL; } #define GPFSTR "general protection fault" static bool fixup_iopl_exception(struct pt_regs *regs) { struct thread_struct *t = ¤t->thread; unsigned char byte; unsigned long ip; if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3) return false; if (insn_get_effective_ip(regs, &ip)) return false; if (get_user(byte, (const char __user *)ip)) return false; if (byte != 0xfa && byte != 0xfb) return false; if (!t->iopl_warn && printk_ratelimit()) { pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx", current->comm, task_pid_nr(current), ip); print_vma_addr(KERN_CONT " in ", ip); pr_cont("\n"); t->iopl_warn = 1; } regs->ip += 1; return true; } /* * The unprivileged ENQCMD instruction generates #GPs if the * IA32_PASID MSR has not been populated. If possible, populate * the MSR from a PASID previously allocated to the mm. */ static bool try_fixup_enqcmd_gp(void) { #ifdef CONFIG_ARCH_HAS_CPU_PASID u32 pasid; /* * MSR_IA32_PASID is managed using XSAVE. Directly * writing to the MSR is only possible when fpregs * are valid and the fpstate is not. This is * guaranteed when handling a userspace exception * in *before* interrupts are re-enabled. */ lockdep_assert_irqs_disabled(); /* * Hardware without ENQCMD will not generate * #GPs that can be fixed up here. */ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) return false; /* * If the mm has not been allocated a * PASID, the #GP can not be fixed up. */ if (!mm_valid_pasid(current->mm)) return false; pasid = mm_get_enqcmd_pasid(current->mm); /* * Did this thread already have its PASID activated? * If so, the #GP must be from something else. 
*/ if (current->pasid_activated) return false; wrmsrq(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); current->pasid_activated = 1; return true; #else return false; #endif } static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, unsigned long error_code, const char *str, unsigned long address) { if (fixup_exception(regs, trapnr, error_code, address)) return true; current->thread.error_code = error_code; current->thread.trap_nr = trapnr; /* * To be potentially processing a kprobe fault and to trust the result * from kprobe_running(), we have to be non-preemptible. */ if (!preemptible() && kprobe_running() && kprobe_fault_handler(regs, trapnr)) return true; return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; } static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, unsigned long error_code, const char *str) { current->thread.error_code = error_code; current->thread.trap_nr = trapnr; show_signal(current, SIGSEGV, "", str, regs, error_code); force_sig(SIGSEGV); } DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; unsigned long gp_addr; if (user_mode(regs) && try_fixup_enqcmd_gp()) return; cond_local_irq_enable(regs); if (static_cpu_has(X86_FEATURE_UMIP)) { if (user_mode(regs) && fixup_umip_exception(regs)) goto exit; } if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); local_irq_disable(); return; } if (user_mode(regs)) { if (fixup_iopl_exception(regs)) goto exit; if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) goto exit; gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); goto exit; } if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0)) goto exit; if (error_code) snprintf(desc, sizeof(desc), "segment-related " GPFSTR); else hint = get_kernel_gp_address(regs, &gp_addr); if (hint != GP_NO_HINT) snprintf(desc, 
sizeof(desc), GPFSTR ", %s 0x%lx", kernel_gp_hint_help[hint], gp_addr); /* * KASAN is interested only in the non-canonical case, clear it * otherwise. */ if (hint != GP_NON_CANONICAL) gp_addr = 0; die_addr(desc, regs, error_code, gp_addr); exit: cond_local_irq_disable(regs); } static bool do_int3(struct pt_regs *regs) { int res; #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) return true; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ #ifdef CONFIG_KPROBES if (kprobe_int3_handler(regs)) return true; #endif res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP); return res == NOTIFY_STOP; } NOKPROBE_SYMBOL(do_int3); static void do_int3_user(struct pt_regs *regs) { if (do_int3(regs)) return; cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); cond_local_irq_disable(regs); } DEFINE_IDTENTRY_RAW(exc_int3) { /* * smp_text_poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ if (smp_text_poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. */ if (user_mode(regs)) { irqentry_enter_from_user_mode(regs); instrumentation_begin(); do_int3_user(regs); instrumentation_end(); irqentry_exit_to_user_mode(regs); } else { irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); if (!do_int3(regs)) die("int3", regs, 0); instrumentation_end(); irqentry_nmi_exit(regs, irq_state); } } #ifdef CONFIG_X86_64 /* * Help handler running on a per-cpu (IST or entry trampoline) stack * to switch to the normal thread stack if the interrupted code was in * user mode. 
The actual stack switch is done in entry_64.S */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1; if (regs != eregs) *regs = *eregs; return regs; } #ifdef CONFIG_AMD_MEM_ENCRYPT asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs) { unsigned long sp, *stack; struct stack_info info; struct pt_regs *regs_ret; /* * In the SYSCALL entry path the RSP value comes from user-space - don't * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { sp = current_top_of_stack(); goto sync; } /* * From here on the RSP value is trusted. Now check whether entry * happened from a safe stack. Not safe are the entry or unknown stacks, * use the fall-back stack instead in this case. */ sp = regs->sp; stack = (unsigned long *)sp; if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || info.type > STACK_TYPE_EXCEPTION_LAST) sp = __this_cpu_ist_top_va(VC2); sync: /* * Found a safe stack - switch to it as if the entry didn't happen via * IST stack. The code below only copies pt_regs, the real switch happens * in assembly code. */ sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret); regs_ret = (struct pt_regs *)sp; *regs_ret = *regs; return regs_ret; } #endif asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) { struct pt_regs tmp, *new_stack; /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault * correctly, we want to move our stack frame to where it would * be had we entered directly on the entry stack (rather than * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the temporary storage. 
*/ __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); /* Copy the remainder of the stack from the current stack. */ __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); /* Update the entry stack */ __memcpy(new_stack, &tmp, sizeof(tmp)); BUG_ON(!user_mode(new_stack)); return new_stack; } #endif static bool is_sysenter_singlestep(struct pt_regs *regs) { /* * We don't try for precision here. If we're anywhere in the region of * code that can be single-stepped in the SYSENTER entry path, then * assume that this is a useless single-step trap due to SYSENTER * being invoked with TF set. (We don't know in advance exactly * which instructions will be hit because BTF could plausibly * be set.) */ #ifdef CONFIG_X86_32 return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < (unsigned long)__end_SYSENTER_singlestep_region - (unsigned long)__begin_SYSENTER_singlestep_region; #elif defined(CONFIG_IA32_EMULATION) return (regs->ip - (unsigned long)entry_SYSENTER_compat) < (unsigned long)__end_entry_SYSENTER_compat - (unsigned long)entry_SYSENTER_compat; #else return false; #endif } static __always_inline unsigned long debug_read_reset_dr6(void) { unsigned long dr6; get_debugreg(dr6, 6); dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ /* * The Intel SDM says: * * Certain debug exceptions may clear bits 0-3 of DR6. * * BLD induced #DB clears DR6.BLD and any other debug * exception doesn't modify DR6.BLD. * * RTM induced #DB clears DR6.RTM and any other debug * exception sets DR6.RTM. * * To avoid confusion in identifying debug exceptions, * debug handlers should set DR6.BLD and DR6.RTM, and * clear other DR6 bits before returning. * * Keep it simple: write DR6 with its architectural reset * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately. */ set_debugreg(DR6_RESERVED, 6); return dr6; } /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. 
Therefore * it is possible to get a watchpoint trap here from inside the kernel. * However, the code in ./ptrace.c has ensured that the user can * only set watchpoints on userspace addresses. Therefore the in-kernel * watchpoint trap can only occur in code which is reading/writing * from user space. Such code must not hold kernel locks (since it * can equally take a page fault), therefore it is safe to call * force_sig_info even though that claims and releases locks. * * Code in ./signal.c ensures that the debug control register * is restored before we deliver any signal, and therefore that * user code runs with the correct debug control register even though * we clear it here. * * Being careful here means that we don't have to be as careful in a * lot of more complicated places (task switching can be a bit lazy * about restoring all the debug state, and ptrace doesn't have to * find every occurrence of the TF bit that could be saved away even * by user code) * * May run on IST stack. */ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) { /* * Notifiers will clear bits in @dr6 to indicate the event has been * consumed - hw_breakpoint_handler(), single_stop_cont(). * * Notifiers will set bits in @virtual_dr6 to indicate the desire * for signals - ptrace_triggered(), kgdb_hw_overflow_handler(). */ if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP) return true; return false; } static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { /* * Disable breakpoints during exception handling; recursive exceptions * are exceedingly 'fun'. * * Since this function is NOKPROBE, and that also applies to * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a * HW_BREAKPOINT_W on our stack) * * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. * * For FRED, nested #DB should just work fine. 
But when a watchpoint or * breakpoint is set in the code path which is executed by #DB handler, * it results in an endless recursion and stack overflow. Thus we stay * with the IDT approach, i.e., save DR7 and disable #DB. */ unsigned long dr7 = local_db_save(); irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); /* * If something gets miswired and we end up here for a user mode * #DB, we will malfunction. */ WARN_ON_ONCE(user_mode(regs)); if (test_thread_flag(TIF_BLOCKSTEP)) { /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." but PTRACE_BLOCKSTEP requested * it for userspace, but we just took a kernel #DB, so re-set * BTF. */ unsigned long debugctl; rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= DEBUGCTLMSR_BTF; wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. */ if (!cpu_feature_enabled(X86_FEATURE_FRED) && (dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; /* * The kernel doesn't use INT1 */ if (!dr6) goto out; if (notify_debug(regs, &dr6)) goto out; /* * The kernel doesn't use TF single-step outside of: * * - Kprobes, consumed through kprobe_debug_handler() * - KGDB, consumed through notify_debug() * * So if we get here with DR_STEP set, something is wonky. * * A known way to trigger this is through QEMU's GDB stub, * which leaks #DB into the guest and causes IST recursion. */ if (WARN_ON_ONCE(dr6 & DR_STEP)) regs->flags &= ~X86_EFLAGS_TF; out: instrumentation_end(); irqentry_nmi_exit(regs, irq_state); local_db_restore(dr7); } static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { bool icebp; /* * If something gets miswired and we end up here for a kernel mode * #DB, we will malfunction. 
*/ WARN_ON_ONCE(!user_mode(regs)); /* * NB: We can't easily clear DR7 here because * irqentry_exit_to_usermode() can invoke ptrace, schedule, access * user memory, etc. This means that a recursive #DB is possible. If * this happens, that #DB will hit exc_debug_kernel() and clear DR7. * Since we're not on the IST stack right now, everything will be * fine. */ irqentry_enter_from_user_mode(regs); instrumentation_begin(); /* * Start the virtual/ptrace DR6 value with just the DR_STEP mask * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits. * * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6) * even if it is not the result of PTRACE_SINGLESTEP. */ current->thread.virtual_dr6 = (dr6 & DR_STEP); /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." Clear TIF_BLOCKSTEP to keep * TIF_BLOCKSTEP in sync with the hardware BTF flag. */ clear_thread_flag(TIF_BLOCKSTEP); /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. */ icebp = !dr6; if (notify_debug(regs, &dr6)) goto out; /* It's safe to allow irq's after DR6 has been saved */ local_irq_enable(); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB); goto out_irq; } /* #DB for bus lock can only be triggered from userspace. */ if (dr6 & DR_BUS_LOCK) handle_bus_lock(regs); /* Add the virtual_dr6 bits for signals. 
*/ dr6 |= current->thread.virtual_dr6; if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp) send_sigtrap(regs, 0, get_si_code(dr6)); out_irq: local_irq_disable(); out: instrumentation_end(); irqentry_exit_to_user_mode(regs); } #ifdef CONFIG_X86_64 /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { exc_debug_kernel(regs, debug_read_reset_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { exc_debug_user(regs, debug_read_reset_dr6()); } #ifdef CONFIG_X86_FRED /* * When occurred on different ring level, i.e., from user or kernel * context, #DB needs to be handled on different stack: User #DB on * current task stack, while kernel #DB on a dedicated stack. * * This is exactly how FRED event delivery invokes an exception * handler: ring 3 event on level 0 stack, i.e., current task stack; * ring 0 event on the #DB dedicated stack specified in the * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception * entry stub doesn't do stack switch. */ DEFINE_FREDENTRY_DEBUG(exc_debug) { /* * FRED #DB stores DR6 on the stack in the format which * debug_read_reset_dr6() returns for the IDT entry points. */ unsigned long dr6 = fred_event_data(regs); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); } #endif /* CONFIG_X86_FRED */ #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { unsigned long dr6 = debug_read_reset_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); } #endif /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; struct fpu *fpu = x86_task_fpu(task); int si_code; char *str = (trapnr == X86_TRAP_MF) ? 
"fpu exception" : "simd exception"; cond_local_irq_enable(regs); if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, 0, 0)) goto exit; task->thread.error_code = 0; task->thread.trap_nr = trapnr; if (notify_die(DIE_TRAP, str, regs, 0, trapnr, SIGFPE) != NOTIFY_STOP) die(str, regs, 0); goto exit; } /* * Synchronize the FPU register state to the memory register state * if necessary. This allows the exception handler to inspect it. */ fpu_sync_fpstate(fpu); task->thread.trap_nr = trapnr; task->thread.error_code = 0; si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ if (!si_code) goto exit; if (fixup_vdso_exception(regs, trapnr, 0, 0)) goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); exit: cond_local_irq_disable(regs); } DEFINE_IDTENTRY(exc_coprocessor_error) { math_error(regs, X86_TRAP_MF); } DEFINE_IDTENTRY(exc_simd_coprocessor_error) { if (IS_ENABLED(CONFIG_X86_INVD_BUG)) { /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */ if (!static_cpu_has(X86_FEATURE_XMM)) { __exc_general_protection(regs, 0); return; } } math_error(regs, X86_TRAP_XF); } DEFINE_IDTENTRY(exc_spurious_interrupt_bug) { /* * This addresses a Pentium Pro Erratum: * * PROBLEM: If the APIC subsystem is configured in mixed mode with * Virtual Wire mode implemented through the local APIC, an * interrupt vector of 0Fh (Intel reserved encoding) may be * generated by the local APIC (Int 15). This vector may be * generated upon receipt of a spurious interrupt (an interrupt * which is removed before the system receives the INTA sequence) * instead of the programmed 8259 spurious interrupt vector. * * IMPLICATION: The spurious interrupt vector programmed in the * 8259 is normally handled by an operating system's spurious * interrupt handler. However, a vector of 0Fh is unknown to some * operating systems, which would crash if this erratum occurred. 
* * In theory this could be limited to 32bit, but the handler is not * hurting and who knows which other CPUs suffer from this. */ } static bool handle_xfd_event(struct pt_regs *regs) { u64 xfd_err; int err; if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) return false; rdmsrq(MSR_IA32_XFD_ERR, xfd_err); if (!xfd_err) return false; wrmsrq(MSR_IA32_XFD_ERR, 0); /* Die if that happens in kernel space */ if (WARN_ON(!user_mode(regs))) return false; local_irq_enable(); err = xfd_enable_feature(xfd_err); switch (err) { case -EPERM: force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); break; case -EFAULT: force_sig(SIGSEGV); break; } local_irq_disable(); return true; } DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); if (handle_xfd_event(regs)) return; #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; cond_local_irq_enable(regs); info.regs = regs; math_emulate(&info); cond_local_irq_disable(regs); return; } #endif /* This should not happen. */ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { /* Try to fix it up and carry on. */ write_cr0(cr0 & ~X86_CR0_TS); } else { /* * Something terrible happened, and we're better off trying * to kill the task than getting stuck in a never-ending * loop of #NM faults. 
*/ die("unexpected #NM exception", regs, 0); } } #ifdef CONFIG_INTEL_TDX_GUEST #define VE_FAULT_STR "VE fault" static void ve_raise_fault(struct pt_regs *regs, long error_code, unsigned long address) { if (user_mode(regs)) { gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); return; } if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR, address)) { return; } die_addr(VE_FAULT_STR, regs, error_code, address); } /* * Virtualization Exceptions (#VE) are delivered to TDX guests due to * specific guest actions which may happen in either user space or the * kernel: * * * Specific instructions (WBINVD, for example) * * Specific MSR accesses * * Specific CPUID leaf accesses * * Access to specific guest physical addresses * * In the settings that Linux will run in, virtualization exceptions are * never generated on accesses to normal, TD-private memory that has been * accepted (by BIOS or with tdx_enc_status_changed()). * * Syscall entry code has a critical window where the kernel stack is not * yet set up. Any exception in this window leads to hard to debug issues * and can be exploited for privilege escalation. Exceptions in the NMI * entry code also cause issues. Returning from the exception handler with * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. * * For these reasons, the kernel avoids #VEs during the syscall gap and * the NMI entry code. Entry code paths do not access TD-shared memory, * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves * that might generate #VE. VMM can remove memory from TD at any point, * but access to unaccepted (or missing) private memory leads to VM * termination, not to #VE. * * Similarly to page faults and breakpoints, #VEs are allowed in NMI * handlers once the kernel is ready to deal with nested NMIs. * * During #VE delivery, all interrupts, including NMIs, are blocked until * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads * the VE info. 
* * If a guest kernel action which would normally cause a #VE occurs in * the interrupt-disabled region before TDGETVEINFO, a #DF (fault * exception) is delivered to the guest which will result in an oops. * * The entry code has been audited carefully for following these expectations. * Changes in the entry code have to be audited for correctness vs. this * aspect. Similarly to #PF, #VE in these places will expose kernel to * privilege escalation or may lead to random crashes. */ DEFINE_IDTENTRY(exc_virtualization_exception) { struct ve_info ve; /* * NMIs/Machine-checks/Interrupts will be in a disabled state * till TDGETVEINFO TDCALL is executed. This ensures that VE * info cannot be overwritten by a nested #VE. */ tdx_get_ve_info(&ve); cond_local_irq_enable(regs); /* * If tdx_handle_virt_exception() could not process * it successfully, treat it as #GP(0) and handle it. */ if (!tdx_handle_virt_exception(regs, &ve)) ve_raise_fault(regs, 0, ve.gla); cond_local_irq_disable(regs); } #endif #ifdef CONFIG_X86_32 DEFINE_IDTENTRY_SW(iret_error) { local_irq_enable(); if (notify_die(DIE_TRAP, "iret exception", regs, 0, X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0, ILL_BADSTK, (void __user *)NULL); } local_irq_disable(); } #endif void __init trap_init(void) { /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); /* Init GHCB memory pages when running as an SEV-ES guest */ sev_es_init_vc_handling(); /* Initialize TSS before setting up traps so ISTs work */ cpu_init_exception_handling(true); /* Setup traps as cpu_init() might #GP */ if (!cpu_feature_enabled(X86_FEATURE_FRED)) idt_setup_traps(); cpu_init(); } |
| 5 4 2 9 9 9 1 1 1 1 1 1 1 1 1 47 3 16 5 11 11 2 9 9 3 8 8 8 1 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 | // SPDX-License-Identifier: GPL-2.0 /* * Support for async notification of waitid */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "cancel.h" #include "waitid.h" #include "../kernel/exit.h" static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw); #define IO_WAITID_CANCEL_FLAG BIT(31) #define IO_WAITID_REF_MASK GENMASK(30, 0) struct io_waitid { struct file *file; int which; pid_t upid; int options; atomic_t refs; struct wait_queue_head *head; struct siginfo 
__user *infop; struct waitid_info info; }; static void io_waitid_free(struct io_kiocb *req) { struct io_waitid_async *iwa = req->async_data; put_pid(iwa->wo.wo_pid); io_req_async_data_free(req); } static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) { struct compat_siginfo __user *infop; bool ret; infop = (struct compat_siginfo __user *) iw->infop; if (!user_write_access_begin(infop, sizeof(*infop))) return false; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(iw->info.cause, &infop->si_code, Efault); unsafe_put_user(iw->info.pid, &infop->si_pid, Efault); unsafe_put_user(iw->info.uid, &infop->si_uid, Efault); unsafe_put_user(iw->info.status, &infop->si_status, Efault); ret = true; done: user_write_access_end(); return ret; Efault: ret = false; goto done; } static bool io_waitid_copy_si(struct io_kiocb *req, int signo) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); bool ret; if (!iw->infop) return true; if (io_is_compat(req->ctx)) return io_waitid_compat_copy_si(iw, signo); if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) return false; unsafe_put_user(signo, &iw->infop->si_signo, Efault); unsafe_put_user(0, &iw->infop->si_errno, Efault); unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault); unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault); unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault); unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault); ret = true; done: user_write_access_end(); return ret; Efault: ret = false; goto done; } static int io_waitid_finish(struct io_kiocb *req, int ret) { int signo = 0; if (ret > 0) { signo = SIGCHLD; ret = 0; } if (!io_waitid_copy_si(req, signo)) ret = -EFAULT; io_waitid_free(req); return ret; } static void io_waitid_remove_wq(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct wait_queue_head *head; head = smp_load_acquire(&iw->head); if 
(head) { struct io_waitid_async *iwa = req->async_data; smp_store_release(&iw->head, NULL); spin_lock_irq(&head->lock); list_del_init(&iwa->wo.child_wait.entry); spin_unlock_irq(&head->lock); } } static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); lockdep_assert_held(&req->ctx->uring_lock); hlist_del_init(&req->hash_node); io_waitid_remove_wq(req); ret = io_waitid_finish(req, ret); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); } static bool __io_waitid_cancel(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); lockdep_assert_held(&req->ctx->uring_lock); /* * Mark us canceled regardless of ownership. This will prevent a * potential retry from a spurious wakeup. */ atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs); /* claim ownership */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return false; io_waitid_complete(req, -ECANCELED); io_req_queue_tw_complete(req, -ECANCELED); return true; } int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { return io_cancel_remove(ctx, cd, issue_flags, &ctx->waitid_list, __io_waitid_cancel); } bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { return io_cancel_remove_all(ctx, tctx, &ctx->waitid_list, cancel_all, __io_waitid_cancel); } static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); if (!atomic_sub_return(1, &iw->refs)) return false; io_waitid_remove_wq(req); /* * Wakeup triggered, racing with us. It was prevented from * completing because of that, queue up the tw to do that. 
*/ req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); return true; } static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw) { struct io_kiocb *req = tw_req.req; struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; io_tw_lock(ctx, tw); ret = __do_wait(&iwa->wo); /* * If we get -ERESTARTSYS here, we need to re-arm and check again * to ensure we get another callback. If the retry works, then we can * just remove ourselves from the waitqueue again and finish the * request. */ if (unlikely(ret == -ERESTARTSYS)) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); /* Don't retry if cancel found it meanwhile */ ret = -ECANCELED; if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) { iw->head = ¤t->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); ret = __do_wait(&iwa->wo); if (ret == -ERESTARTSYS) { /* retry armed, drop our ref */ io_waitid_drop_issue_ref(req); return; } /* fall through to complete, will kill waitqueue */ } } io_waitid_complete(req, ret); io_req_task_complete(tw_req, tw); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo); struct io_kiocb *req = iwa->req; struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct task_struct *p = key; if (!pid_child_should_wake(wo, p)) return 0; list_del_init(&wait->entry); smp_store_release(&iw->head, NULL); /* cancel is in progress */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return 1; req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); return 1; } int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa; if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) return -EINVAL; 
iwa = io_uring_alloc_async_data(NULL, req); if (unlikely(!iwa)) return -ENOMEM; iwa->req = req; iw->which = READ_ONCE(sqe->len); iw->upid = READ_ONCE(sqe->fd); iw->options = READ_ONCE(sqe->file_index); iw->head = NULL; iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); return 0; } int io_waitid(struct io_kiocb *req, unsigned int issue_flags) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, iw->options, NULL); if (ret) goto done; /* * Mark the request as busy upfront, in case we're racing with the * wakeup. If we are, then we'll notice when we drop this initial * reference again after arming. */ atomic_set(&iw->refs, 1); /* * Cancel must hold the ctx lock, so there's no risk of cancelation * finding us until a) we remain on the list, and b) the lock is * dropped. We only need to worry about racing with the wakeup * callback. */ io_ring_submit_lock(ctx, issue_flags); /* * iw->head is valid under the ring lock, and as long as the request * is on the waitid_list where cancelations may find it. */ iw->head = ¤t->signal->wait_chldexit; hlist_add_head(&req->hash_node, &ctx->waitid_list); init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); iwa->wo.child_wait.private = req->tctx->task; add_wait_queue(iw->head, &iwa->wo.child_wait); ret = __do_wait(&iwa->wo); if (ret == -ERESTARTSYS) { /* * Nobody else grabbed a reference, it'll complete when we get * a waitqueue callback, or if someone cancels it. */ if (!io_waitid_drop_issue_ref(req)) { io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } /* * Wakeup triggered, racing with us. It was prevented from * completing because of that, queue up the tw to do that. 
*/ io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } hlist_del_init(&req->hash_node); io_waitid_remove_wq(req); ret = io_waitid_finish(req, ret); io_ring_submit_unlock(ctx, issue_flags); done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_COMPLETE; } |
| 14 14 9 5 6 6 6 8 8 14 14 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/sched.h> #include <media/frame_vector.h> /** * get_vaddr_frames() - map virtual addresses to pfns * @start: starting user address * @nr_frames: number of pages / pfns from start to map * @write: the mapped address has write permission * @vec: structure which receives pages / pfns of the addresses mapped. * It should have space for at least nr_frames entries. * * This function maps virtual addresses from @start and fills @vec structure * with page frame numbers or page pointers to corresponding pages (choice * depends on the type of the vma underlying the virtual address). If @start * belongs to a normal vma, the function grabs reference to each of the pages * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't * touch page structures and the caller must make sure pfns aren't reused for * anything else while he is using them. * * The function returns number of pages mapped which may be less than * @nr_frames. 
In particular we stop mapping if there are more vmas of * different type underlying the specified range of virtual addresses. * When the function isn't able to map a single page, it returns error. * * Note that get_vaddr_frames() cannot follow VM_IO mappings. It used * to be able to do that, but that could (racily) return non-refcounted * pfns. * * This function takes care of grabbing mmap_lock as necessary. */ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, bool write, struct frame_vector *vec) { int ret; unsigned int gup_flags = FOLL_LONGTERM; if (nr_frames == 0) return 0; if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) nr_frames = vec->nr_allocated; start = untagged_addr(start); if (write) gup_flags |= FOLL_WRITE; ret = pin_user_pages_fast(start, nr_frames, gup_flags, (struct page **)(vec->ptrs)); vec->got_ref = true; vec->is_pfns = false; vec->nr_frames = ret; if (likely(ret > 0)) return ret; vec->nr_frames = 0; return ret ? ret : -EFAULT; } EXPORT_SYMBOL(get_vaddr_frames); /** * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired * them * @vec: frame vector to put * * Drop references to pages if get_vaddr_frames() acquired them. We also * invalidate the frame vector so that it is prepared for the next call into * get_vaddr_frames(). */ void put_vaddr_frames(struct frame_vector *vec) { struct page **pages; if (!vec->got_ref) goto out; pages = frame_vector_pages(vec); /* * frame_vector_pages() might needed to do a conversion when * get_vaddr_frames() got pages but vec was later converted to pfns. * But it shouldn't really fail to convert pfns back... */ if (WARN_ON(IS_ERR(pages))) goto out; unpin_user_pages(pages, vec->nr_frames); vec->got_ref = false; out: vec->nr_frames = 0; } EXPORT_SYMBOL(put_vaddr_frames); /** * frame_vector_to_pages - convert frame vector to contain page pointers * @vec: frame vector to convert * * Convert @vec to contain array of page pointers. If the conversion is * successful, return 0. 
Otherwise return an error. Note that we do not grab * page references for the page structures. */ int frame_vector_to_pages(struct frame_vector *vec) { int i; unsigned long *nums; struct page **pages; if (!vec->is_pfns) return 0; nums = frame_vector_pfns(vec); for (i = 0; i < vec->nr_frames; i++) if (!pfn_valid(nums[i])) return -EINVAL; pages = (struct page **)nums; for (i = 0; i < vec->nr_frames; i++) pages[i] = pfn_to_page(nums[i]); vec->is_pfns = false; return 0; } EXPORT_SYMBOL(frame_vector_to_pages); /** * frame_vector_to_pfns - convert frame vector to contain pfns * @vec: frame vector to convert * * Convert @vec to contain array of pfns. */ void frame_vector_to_pfns(struct frame_vector *vec) { int i; unsigned long *nums; struct page **pages; if (vec->is_pfns) return; pages = (struct page **)(vec->ptrs); nums = (unsigned long *)pages; for (i = 0; i < vec->nr_frames; i++) nums[i] = page_to_pfn(pages[i]); vec->is_pfns = true; } EXPORT_SYMBOL(frame_vector_to_pfns); /** * frame_vector_create() - allocate & initialize structure for pinned pfns * @nr_frames: number of pfns slots we should reserve * * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns * pfns. */ struct frame_vector *frame_vector_create(unsigned int nr_frames) { struct frame_vector *vec; int size = struct_size(vec, ptrs, nr_frames); if (WARN_ON_ONCE(nr_frames == 0)) return NULL; /* * This is absurdly high. It's here just to avoid strange effects when * arithmetics overflows. */ if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2)) return NULL; /* * Avoid higher order allocations, use vmalloc instead. It should * be rare anyway. */ vec = kvmalloc(size, GFP_KERNEL); if (!vec) return NULL; vec->nr_allocated = nr_frames; vec->nr_frames = 0; return vec; } EXPORT_SYMBOL(frame_vector_create); /** * frame_vector_destroy() - free memory allocated to carry frame vector * @vec: Frame vector to free * * Free structure allocated by frame_vector_create() to carry frames. 
*/ void frame_vector_destroy(struct frame_vector *vec) { /* Make sure put_vaddr_frames() got called properly... */ VM_BUG_ON(vec->nr_frames > 0); kvfree(vec); } EXPORT_SYMBOL(frame_vector_destroy); |
| 17 17 17 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ #include <linux/gfp.h> #include <rdma/ib_verbs.h> #include <rdma/ib_umem.h> #include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/resource.h> #include "siw.h" #include "siw_mem.h" /* Stag lookup is based on its index part only (24 bits). 
*/ #define SIW_STAG_MAX_INDEX 0x00ffffff /* * siw_mem_id2obj() * * resolves memory from stag given by id. might be called from: * o process context before sending out of sgl, or * o in softirq when resolving target memory */ struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index) { struct siw_mem *mem; rcu_read_lock(); mem = xa_load(&sdev->mem_xa, stag_index); if (likely(mem && kref_get_unless_zero(&mem->ref))) { rcu_read_unlock(); return mem; } rcu_read_unlock(); return NULL; } void siw_umem_release(struct siw_umem *umem) { int i, num_pages = umem->num_pages; if (umem->base_mem) ib_umem_release(umem->base_mem); for (i = 0; num_pages > 0; i++) { kfree(umem->page_chunk[i].plist); num_pages -= PAGES_PER_CHUNK; } kfree(umem->page_chunk); kfree(umem); } int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj, u64 start, u64 len, int rights) { struct siw_device *sdev = to_siw_dev(pd->device); struct siw_mem *mem = kzalloc_obj(*mem); struct xa_limit limit = XA_LIMIT(1, SIW_STAG_MAX_INDEX); u32 id, next; if (!mem) return -ENOMEM; mem->mem_obj = mem_obj; mem->stag_valid = 0; mem->sdev = sdev; mem->va = start; mem->len = len; mem->pd = pd; mem->perms = rights & IWARP_ACCESS_MASK; kref_init(&mem->ref); get_random_bytes(&next, 4); next &= SIW_STAG_MAX_INDEX; if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next, GFP_KERNEL) < 0) { kfree(mem); return -ENOMEM; } mr->mem = mem; /* Set the STag index part */ mem->stag = id << 8; mr->base_mr.lkey = mr->base_mr.rkey = mem->stag; return 0; } void siw_mr_drop_mem(struct siw_mr *mr) { struct siw_mem *mem = mr->mem, *found; mem->stag_valid = 0; /* make STag invalid visible asap */ smp_mb(); found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8); WARN_ON(found != mem); siw_mem_put(mem); } void siw_free_mem(struct kref *ref) { struct siw_mem *mem = container_of(ref, struct siw_mem, ref); siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? 
"y" : "n"); if (!mem->is_mw && mem->mem_obj) { if (mem->is_pbl == 0) siw_umem_release(mem->umem); else kfree(mem->pbl); } kfree(mem); } /* * siw_check_mem() * * Check protection domain, STAG state, access permissions and * address range for memory object. * * @pd: Protection Domain memory should belong to * @mem: memory to be checked * @addr: starting addr of mem * @perms: requested access permissions * @len: len of memory interval to be checked * */ int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, enum ib_access_flags perms, int len) { if (!mem->stag_valid) { siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag); return -E_STAG_INVALID; } if (mem->pd != pd) { siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag); return -E_PD_MISMATCH; } /* * check access permissions */ if ((mem->perms & perms) < perms) { siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n", mem->perms, perms); return -E_ACCESS_PERM; } /* * Check if access falls into valid memory interval. */ if (addr < mem->va || addr + len > mem->va + mem->len) { siw_dbg_pd(pd, "MEM interval len %d\n", len); siw_dbg_pd(pd, "[0x%p, 0x%p] out of bounds\n", (void *)(uintptr_t)addr, (void *)(uintptr_t)(addr + len)); siw_dbg_pd(pd, "[0x%p, 0x%p] STag=0x%08x\n", (void *)(uintptr_t)mem->va, (void *)(uintptr_t)(mem->va + mem->len), mem->stag); return -E_BASE_BOUNDS; } return E_ACCESS_OK; } /* * siw_check_sge() * * Check SGE for access rights in given interval * * @pd: Protection Domain memory should belong to * @sge: SGE to be checked * @mem: location of memory reference within array * @perms: requested access permissions * @off: starting offset in SGE * @len: len of memory interval to be checked * * NOTE: Function references SGE's memory object (mem->obj) * if not yet done. New reference is kept if check went ok and * released if check failed. If mem->obj is already valid, no new * lookup is being done and mem is not released it check fails. 
*/ int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[], enum ib_access_flags perms, u32 off, int len) { struct siw_device *sdev = to_siw_dev(pd->device); struct siw_mem *new = NULL; int rv = E_ACCESS_OK; if (len + off > sge->length) { rv = -E_BASE_BOUNDS; goto fail; } if (*mem == NULL) { new = siw_mem_id2obj(sdev, sge->lkey >> 8); if (unlikely(!new)) { siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey); rv = -E_STAG_INVALID; goto fail; } *mem = new; } /* Check if user re-registered with different STag key */ if (unlikely((*mem)->stag != sge->lkey)) { siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey); rv = -E_STAG_INVALID; goto fail; } rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len); if (unlikely(rv)) goto fail; return 0; fail: if (new) { *mem = NULL; siw_mem_put(new); } return rv; } void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op) { switch (op) { case SIW_OP_SEND: case SIW_OP_WRITE: case SIW_OP_SEND_WITH_IMM: case SIW_OP_SEND_REMOTE_INV: case SIW_OP_READ: case SIW_OP_READ_LOCAL_INV: if (!(wqe->sqe.flags & SIW_WQE_INLINE)) siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge); break; case SIW_OP_RECEIVE: siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge); break; case SIW_OP_READ_RESPONSE: siw_unref_mem_sgl(wqe->mem, 1); break; default: /* * SIW_OP_INVAL_STAG and SIW_OP_REG_MR * do not hold memory references */ break; } } int siw_invalidate_stag(struct ib_pd *pd, u32 stag) { struct siw_device *sdev = to_siw_dev(pd->device); struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8); int rv = 0; if (unlikely(!mem)) { siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag); return -EINVAL; } if (unlikely(mem->pd != pd)) { siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag); rv = -EACCES; goto out; } /* * Per RDMA verbs definition, an STag may already be in invalid * state if invalidation is requested. So no state check here. 
*/ mem->stag_valid = 0; siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag); out: siw_mem_put(mem); return rv; } /* * Gets physical address backed by PBL element. Address is referenced * by linear byte offset into list of variably sized PB elements. * Optionally, provides remaining len within current element, and * current PBL index for later resume at same element. */ dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx) { int i = idx ? *idx : 0; while (i < pbl->num_buf) { struct siw_pble *pble = &pbl->pbe[i]; if (pble->pbl_off + pble->size > off) { u64 pble_off = off - pble->pbl_off; if (len) *len = pble->size - pble_off; if (idx) *idx = i; return pble->addr + pble_off; } i++; } if (len) *len = 0; return 0; } struct siw_pbl *siw_pbl_alloc(u32 num_buf) { struct siw_pbl *pbl; if (num_buf == 0) return ERR_PTR(-EINVAL); pbl = kzalloc_flex(*pbl, pbe, num_buf); if (!pbl) return ERR_PTR(-ENOMEM); pbl->max_buf = num_buf; return pbl; } struct siw_umem *siw_umem_get(struct ib_device *base_dev, u64 start, u64 len, int rights) { struct siw_umem *umem; struct ib_umem *base_mem; struct sg_page_iter sg_iter; struct sg_table *sgt; u64 first_page_va; int num_pages, num_chunks, i, rv = 0; if (!len) return ERR_PTR(-EINVAL); first_page_va = start & PAGE_MASK; num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT; num_chunks = (num_pages >> CHUNK_SHIFT) + 1; umem = kzalloc_obj(*umem); if (!umem) return ERR_PTR(-ENOMEM); umem->page_chunk = kzalloc_objs(struct siw_page_chunk, num_chunks); if (!umem->page_chunk) { rv = -ENOMEM; goto err_out; } base_mem = ib_umem_get(base_dev, start, len, rights); if (IS_ERR(base_mem)) { rv = PTR_ERR(base_mem); siw_dbg(base_dev, "Cannot pin user memory: %d\n", rv); goto err_out; } umem->fp_addr = first_page_va; umem->base_mem = base_mem; sgt = &base_mem->sgt_append.sgt; __sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0); if (!__sg_page_iter_next(&sg_iter)) { rv = -EINVAL; goto err_out; } for (i = 0; num_pages 
> 0; i++) { int nents = min_t(int, num_pages, PAGES_PER_CHUNK); struct page **plist = kzalloc_objs(struct page *, nents); if (!plist) { rv = -ENOMEM; goto err_out; } umem->page_chunk[i].plist = plist; while (nents--) { *plist = sg_page_iter_page(&sg_iter); umem->num_pages++; num_pages--; plist++; if (!__sg_page_iter_next(&sg_iter)) break; } } return umem; err_out: siw_umem_release(umem); return ERR_PTR(rv); } |
| 139 5 5 9 1 9 5 1 5 5 2 5 9 139 139 139 40 9 164 164 163 13 164 164 9 9 40 40 40 40 160 160 159 125 145 9 9 125 1 126 125 40 126 40 40 40 9 9 9 70 32 65 139 139 111 138 39 139 1 122 139 138 138 40 139 139 139 139 139 122 139 137 139 138 40 139 139 20 138 116 138 138 134 134 125 40 40 139 139 72 139 72 72 72 72 72 201 202 202 139 20 139 139 20 102 102 102 102 102 109 109 109 14 101 11 109 108 1 9 9 141 142 141 139 142 142 142 142 142 142 139 139 20 139 139 139 139 139 139 138 138 139 138 139 139 94 93 94 94 94 107 107 72 71 72 72 72 8 72 72 9 72 66 66 66 100 101 3 3 3 3 148 148 148 147 148 148 147 148 148 148 148 38 18 18 11 10 2 2 2 3 3 8 9 9 9 8 8 6942 2 17 6939 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 
359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 
859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 
1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 
1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 
2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 
2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 
2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 
3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 
3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swapfile.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie */ #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/shmem_fs.h> #include <linux/blk-cgroup.h> #include <linux/random.h> #include <linux/writeback.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/backing-dev.h> #include <linux/mutex.h> #include <linux/capability.h> #include <linux/syscalls.h> #include <linux/memcontrol.h> #include <linux/poll.h> #include <linux/oom.h> #include <linux/swapfile.h> #include <linux/export.h> #include <linux/sort.h> #include <linux/completion.h> #include <linux/suspend.h> #include <linux/zswap.h> #include <linux/plist.h> #include <asm/tlbflush.h> #include <linux/leafops.h> #include <linux/swap_cgroup.h> #include "swap_table.h" #include "internal.h" #include "swap_table.h" #include "swap.h" static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); static void move_cluster(struct swap_info_struct *si, struct 
swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags); /* * Protects the swap_info array, and the SWP_USED flag. swap_info contains * lazily allocated & freed swap device info struts, and SWP_USED indicates * which device is used, ~SWP_USED devices and can be reused. * * Also protects swap_active_head total_swap_pages, and the SWP_WRITEOK flag. */ static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; /* * Some modules use swappable objects and may try to swap them out under * memory pressure (via the shrinker). Before doing so, they may wish to * check to see if any swap space is available. */ EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; #define DEF_SWAP_PRIO -1 unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; #endif /* CONFIG_MIGRATION */ static const char Bad_file[] = "Bad swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; /* * all active swap_info_structs * protected with swap_lock, and ordered by priority. */ static PLIST_HEAD(swap_active_head); /* * all available (active, not full) swap_info_structs * protected with swap_avail_lock, ordered by priority. * This is used by folio_alloc_swap() instead of swap_active_head * because swap_active_head includes all swap_info_structs, * but folio_alloc_swap() doesn't need to look at full ones. * This uses its own lock instead of swap_lock because when a * swap_info_struct changes between not-full/full, it needs to * add/remove itself to/from this list, but the swap_info_struct->lock * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. 
*/ static PLIST_HEAD(swap_avail_head); static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; static struct kmem_cache *swap_table_cachep; /* Protects si->swap_file for /proc/swaps usage */ static DEFINE_MUTEX(swapon_mutex); static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); struct percpu_swap_cluster { struct swap_info_struct *si[SWAP_NR_ORDERS]; unsigned long offset[SWAP_NR_ORDERS]; local_lock_t lock; }; static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { .si = { NULL }, .offset = { SWAP_ENTRY_INVALID }, .lock = INIT_LOCAL_LOCK(), }; /* May return NULL on invalid type, caller must check for NULL return */ static struct swap_info_struct *swap_type_to_info(int type) { if (type >= MAX_SWAPFILES) return NULL; return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } /* May return NULL on invalid entry, caller must check for NULL return */ static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry) { return swap_type_to_info(swp_type(entry)); } /* * Use the second highest bit of inuse_pages counter as the indicator * if one swap device is on the available plist, so the atomic can * still be updated arithmetically while having special data embedded. * * inuse_pages counter is the only thing indicating if a device should * be on avail_lists or not (except swapon / swapoff). By embedding the * off-list bit in the atomic counter, updates no longer need any lock * to check the list status. * * This bit will be set if the device is not on the plist and not * usable, will be cleared if the device is on the plist. 
*/
#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)

/* Return si->inuse_pages with the off-list indicator bit masked out. */
static long swap_usage_in_pages(struct swap_info_struct *si)
{
	return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
}

/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY		0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
#define TTRS_UNMAPPED		0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL		0x4

/*
 * Check the @nr_pages swap table slots of @ci starting at @offset (taken
 * modulo SWAPFILE_CLUSTER). Returns false as soon as one slot reports a
 * non-zero count, true otherwise. Warns once if a slot is not a folio entry.
 */
static bool swap_only_has_cache(struct swap_cluster_info *ci,
				unsigned long offset, int nr_pages)
{
	unsigned int ci_off = offset % SWAPFILE_CLUSTER;
	unsigned int ci_end = ci_off + nr_pages;
	unsigned long swp_tb;

	do {
		swp_tb = __swap_table_get(ci, ci_off);
		VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb));
		if (swp_tb_get_count(swp_tb))
			return false;
	} while (++ci_off < ci_end);

	return true;
}

/*
 * Returns number of pages in the folio that backs the swap entry. If positive,
 * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
 * folio was associated with the swap entry.
 *
 * @flags is a mask of TTRS_* values selecting when reclaim is attempted.
 */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
				 unsigned long offset, unsigned long flags)
{
	const swp_entry_t entry = swp_entry(si->type, offset);
	struct swap_cluster_info *ci;
	struct folio *folio;
	int ret, nr_pages;
	bool need_reclaim;

again:
	folio = swap_cache_get_folio(entry);
	if (!folio)
		return 0;

	nr_pages = folio_nr_pages(folio);
	/* Default result: "folio present but not reclaimed". */
	ret = -nr_pages;

	/*
	 * We hold a folio lock here. We have to use trylock for
	 * avoiding deadlock. This is a special case and you should
	 * use folio_free_swap() with explicit folio_lock() in usual
	 * operations.
	 */
	if (!folio_trylock(folio))
		goto out;

	/*
	 * Offset could point to the middle of a large folio, or folio
	 * may no longer point to the expected offset before it's locked.
	 */
	if (!folio_matches_swap_entry(folio, entry)) {
		folio_unlock(folio);
		folio_put(folio);
		goto again;
	}
	/* From here on work with the folio's own (first) swap offset. */
	offset = swp_offset(folio->swap);

	need_reclaim = ((flags & TTRS_ANYWAY) ||
			((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
			((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
	if (!need_reclaim || !folio_swapcache_freeable(folio))
		goto out_unlock;

	/*
	 * It's safe to delete the folio from swap cache only if the folio
	 * is in swap cache with swap count == 0. The slots have no page table
	 * reference or pending writeback, and can't be allocated to others.
	 */
	ci = swap_cluster_lock(si, offset);
	need_reclaim = swap_only_has_cache(ci, offset, nr_pages);
	swap_cluster_unlock(ci);
	if (!need_reclaim)
		goto out_unlock;

	swap_cache_del_folio(folio);
	folio_set_dirty(folio);
	ret = nr_pages;
out_unlock:
	folio_unlock(folio);
out:
	folio_put(folio);
	return ret;
}

/*
 * Return the first extent in @sis->swap_extent_root.
 * NOTE(review): the rb_first() result is not checked for NULL here;
 * presumably callers only run with a populated tree - confirm.
 */
static inline struct swap_extent *first_se(struct swap_info_struct *sis)
{
	struct rb_node *rb = rb_first(&sis->swap_extent_root);
	return rb_entry(rb, struct swap_extent, rb_node);
}

/* Return the extent that follows @se in the rb-tree, or NULL if there is none. */
static inline struct swap_extent *next_se(struct swap_extent *se)
{
	struct rb_node *rb = rb_next(&se->rb_node);
	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
}

/*
 * swapon tell device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 *
 * Returns 0 or the first error reported by blkdev_issue_discard().
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	sector_t start_block;
	sector_t nr_blocks;
	int err = 0;

	/* Do not discard the swap header page! */
	se = first_se(si);
	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
	if (nr_blocks) {
		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL);
		if (err)
			return err;
		cond_resched();
	}

	/* Remaining extents are discarded in full. */
	for (se = next_se(se); se; se = next_se(se)) {
		start_block = se->start_block << (PAGE_SHIFT - 9);
		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}

/*
 * Walk @sis->swap_extent_root and return the extent whose page range
 * [start_page, start_page + nr_pages) contains @offset. Hits BUG() if no
 * extent covers @offset.
 */
static struct swap_extent *
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
{
	struct swap_extent *se;
	struct rb_node *rb;

	rb = sis->swap_extent_root.rb_node;
	while (rb) {
		se = rb_entry(rb, struct swap_extent, rb_node);
		if (offset < se->start_page)
			rb = rb->rb_left;
		else if (offset >= se->start_page + se->nr_pages)
			rb = rb->rb_right;
		else
			return se;
	}
	/* It *must* be present */
	BUG();
}

/*
 * Translate @folio->swap into a position on the swap device: the block of
 * the containing extent plus the page offset inside it, shifted left by
 * (PAGE_SHIFT - 9).
 */
sector_t swap_folio_sector(struct folio *folio)
{
	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
	struct swap_extent *se;
	sector_t sector;
	pgoff_t offset;

	offset = swp_offset(folio->swap);
	se = offset_to_swap_extent(sis, offset);
	sector = se->start_block + (offset - se->start_page);
	return sector << (PAGE_SHIFT - 9);
}

/*
 * swap allocation tell device that a cluster of swap can now be discarded,
 * to allow the swap device to optimize its wear-levelling.
*/
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = offset_to_swap_extent(si, start_page);

	while (nr_pages) {
		pgoff_t offset = start_page - se->start_page;
		sector_t start_block = se->start_block + offset;
		sector_t nr_blocks = se->nr_pages - offset;

		/* Clamp to what is still requested; the rest is in later extents. */
		if (nr_blocks > nr_pages)
			nr_blocks = nr_pages;
		start_page += nr_blocks;
		nr_pages -= nr_blocks;

		start_block <<= PAGE_SHIFT - 9;
		nr_blocks <<= PAGE_SHIFT - 9;
		/* Stop at the first discard error; the error is not reported. */
		if (blkdev_issue_discard(si->bdev, start_block,
					nr_blocks, GFP_NOIO))
			break;

		se = next_se(se);
	}
}

#define LATENCY_LIMIT			256

/* True when the cluster's usage count is zero. */
static inline bool cluster_is_empty(struct swap_cluster_info *info)
{
	return info->count == 0;
}

/* True when the cluster is flagged CLUSTER_FLAG_DISCARD. */
static inline bool cluster_is_discard(struct swap_cluster_info *info)
{
	return info->flags == CLUSTER_FLAG_DISCARD;
}

/* True when @ci has a swap table installed; lockdep expects ci->lock held. */
static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci)
{
	return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock));
}

/*
 * True when @ci can serve an allocation of @order: its flags are not above
 * CLUSTER_FLAG_USABLE, it has a table, and for non-zero @order it is either
 * empty or already dedicated to the same order.
 */
static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
{
	if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
		return false;
	if (!cluster_table_is_alloced(ci))
		return false;
	if (!order)
		return true;
	return cluster_is_empty(ci) || order == ci->order;
}

/* Index of @ci within si->cluster_info. */
static inline unsigned int cluster_index(struct swap_info_struct *si,
					 struct swap_cluster_info *ci)
{
	return ci - si->cluster_info;
}

/* Swap offset of the first slot of @ci. */
static inline unsigned int cluster_offset(struct swap_info_struct *si,
					  struct swap_cluster_info *ci)
{
	return cluster_index(si, ci) * SWAPFILE_CLUSTER;
}

/*
 * Allocate a zeroed swap table with @gfp: from swap_table_cachep when
 * SWP_TABLE_USE_PAGE is false, otherwise as one order-0 folio.
 * Returns NULL when the folio allocation fails.
 */
static struct swap_table *swap_table_alloc(gfp_t gfp)
{
	struct folio *folio;

	if (!SWP_TABLE_USE_PAGE)
		return kmem_cache_zalloc(swap_table_cachep, gfp);

	folio = folio_alloc(gfp | __GFP_ZERO, 0);
	if (folio)
		return folio_address(folio);
	return NULL;
}

/* RCU callback: drop the reference on the folio that backed a swap table. */
static void swap_table_free_folio_rcu_cb(struct rcu_head *head)
{
	struct folio *folio;

	folio = page_folio(container_of(head, struct page, rcu_head));
	folio_put(folio);
}

/*
 * Free a table obtained from swap_table_alloc(). The slab variant is freed
 * with kmem_cache_free(); the folio variant is released via call_rcu().
 */
static void swap_table_free(struct swap_table *table)
{
	if (!SWP_TABLE_USE_PAGE) {
		kmem_cache_free(swap_table_cachep, table);
		return;
	}

	call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head),
		 swap_table_free_folio_rcu_cb);
}

/*
 * Sanity check to ensure nothing leaked, and the specified range is empty.
 * One special case is that bad slots can't be freed, so check the number of
 * bad slots for swapoff, and non-swapoff path must never free bad slots.
 *
 * Does nothing unless CONFIG_DEBUG_VM is enabled or @swapoff is true.
 */
static void swap_cluster_assert_empty(struct swap_cluster_info *ci,
				      unsigned int ci_off, unsigned int nr,
				      bool swapoff)
{
	unsigned int ci_end = ci_off + nr;
	unsigned long swp_tb;
	int bad_slots = 0;

	if (!IS_ENABLED(CONFIG_DEBUG_VM) && !swapoff)
		return;

	do {
		swp_tb = __swap_table_get(ci, ci_off);
		if (swp_tb_is_bad(swp_tb))
			bad_slots++;
		else
			WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
	} while (++ci_off < ci_end);

	WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0));
	WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table);
}

/* Detach @ci's swap table and free it. Caller must hold ci->lock. */
static void swap_cluster_free_table(struct swap_cluster_info *ci)
{
	struct swap_table *table;

	/* Only an empty cluster's table is allowed to be freed */
	lockdep_assert_held(&ci->lock);
	table = (void *)rcu_dereference_protected(ci->table, true);
	rcu_assign_pointer(ci->table, NULL);

	swap_table_free(table);
}

/*
 * Allocate swap table for one cluster. Attempt an atomic allocation first,
 * then fallback to sleeping allocation.
 *
 * Returns @ci, still locked, once it has a table. If the sleeping
 * allocation fails too, @ci is moved back to si->free_clusters, ci->lock is
 * released and NULL is returned.
 */
static struct swap_cluster_info *
swap_cluster_alloc_table(struct swap_info_struct *si,
			 struct swap_cluster_info *ci)
{
	struct swap_table *table;

	/*
	 * Only cluster isolation from the allocator does table allocation.
	 * Swap allocator uses percpu clusters and holds the local lock.
	 */
	lockdep_assert_held(&ci->lock);
	lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);

	/* The cluster must be free and was just isolated from the free list. */
	VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));

	table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
	if (table) {
		rcu_assign_pointer(ci->table, table);
		return ci;
	}

	/*
	 * Try a sleep allocation. Each isolated free cluster may cause
	 * a sleep allocation, but there is a limited number of them, so
	 * the potential recursive allocation is limited.
	 */
	spin_unlock(&ci->lock);
	if (!(si->flags & SWP_SOLIDSTATE))
		spin_unlock(&si->global_cluster_lock);
	local_unlock(&percpu_swap_cluster.lock);

	table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL);

	/*
	 * Back to atomic context. We might have migrated to a new CPU with a
	 * usable percpu cluster. But just keep using the isolated cluster to
	 * make things easier. Migration indicates a slight change of workload
	 * so using a new free cluster might not be a bad idea, and the worst
	 * could happen with ignoring the percpu cluster is fragmentation,
	 * which is acceptable since this fallback and race is rare.
	 */
	local_lock(&percpu_swap_cluster.lock);
	if (!(si->flags & SWP_SOLIDSTATE))
		spin_lock(&si->global_cluster_lock);
	spin_lock(&ci->lock);

	/* Nothing except this helper should touch a dangling empty cluster. */
	if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) {
		if (table)
			swap_table_free(table);
		return ci;
	}

	if (!table) {
		move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
		spin_unlock(&ci->lock);
		return NULL;
	}

	rcu_assign_pointer(ci->table, table);
	return ci;
}

/*
 * Put @ci at the tail of @list and set its flags to @new_flags. An off-list
 * cluster (CLUSTER_FLAG_NONE) is added, any other is moved from its current
 * list. Caller holds ci->lock; si->lock is taken around the list update.
 */
static void move_cluster(struct swap_info_struct *si,
			 struct swap_cluster_info *ci, struct list_head *list,
			 enum swap_cluster_flags new_flags)
{
	VM_WARN_ON(ci->flags == new_flags);

	BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX);
	lockdep_assert_held(&ci->lock);

	spin_lock(&si->lock);
	if (ci->flags == CLUSTER_FLAG_NONE)
		list_add_tail(&ci->list, list);
	else
		list_move_tail(&ci->list, list);
	spin_unlock(&si->lock);
	ci->flags = new_flags;
}

/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		struct swap_cluster_info *ci)
{
	VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
	move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD);
	schedule_work(&si->discard_work);
}

/*
 * Return @ci to si->free_clusters: assert the whole cluster is empty, free
 * its swap table, move it to the free list and reset its order to 0.
 */
static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
	swap_cluster_assert_empty(ci, 0, SWAPFILE_CLUSTER, false);
	swap_cluster_free_table(ci);
	move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
	ci->order = 0;
}

/*
 * Isolate and lock the first cluster that is not contended on a list,
 * clean its flag before taken off-list. Cluster flag must be in sync
 * with list status, so cluster updaters can always know the cluster
 * list status without touching si lock.
 *
 * Note it's possible that all clusters on a list are contended so
 * this returns NULL for a non-empty list.
*/
static struct swap_cluster_info *isolate_lock_cluster(
		struct swap_info_struct *si, struct list_head *list)
{
	struct swap_cluster_info *ci, *found = NULL;

	spin_lock(&si->lock);
	list_for_each_entry(ci, list, list) {
		/* Skip clusters whose lock is currently held by someone else. */
		if (!spin_trylock(&ci->lock))
			continue;

		/* We may only isolate and clear flags of following lists */
		VM_BUG_ON(!ci->flags);
		VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
			  ci->flags != CLUSTER_FLAG_FULL);

		list_del(&ci->list);
		ci->flags = CLUSTER_FLAG_NONE;
		found = ci;
		break;
	}
	spin_unlock(&si->lock);

	if (found && !cluster_table_is_alloced(found)) {
		/* Only an empty free cluster's swap table can be freed. */
		VM_WARN_ON_ONCE(list != &si->free_clusters);
		VM_WARN_ON_ONCE(!cluster_is_empty(found));
		/* May return NULL if no table could be allocated. */
		return swap_cluster_alloc_table(si, found);
	}

	return found;
}

/*
 * Doing discard actually. After a cluster discard is finished, the cluster
 * will be added to free cluster list. Discard cluster is a bit special as
 * they don't participate in allocation or reclaim, so clusters marked as
 * CLUSTER_FLAG_DISCARD must remain off-list or on discard list.
 *
 * Returns true if at least one cluster was processed.
 */
static bool swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *ci;
	bool ret = false;
	unsigned int idx;

	spin_lock(&si->lock);
	while (!list_empty(&si->discard_clusters)) {
		ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
		/*
		 * Delete the cluster from list to prepare for discard, but keep
		 * the CLUSTER_FLAG_DISCARD flag, percpu_swap_cluster could be
		 * pointing to it, or ran into by relocate_cluster.
		 */
		list_del(&ci->list);
		idx = cluster_index(si, ci);
		/* si->lock is dropped while the discard is issued. */
		spin_unlock(&si->lock);
		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				SWAPFILE_CLUSTER);

		spin_lock(&ci->lock);
		/*
		 * Discard is done, clear its flags as it's off-list, then
		 * return the cluster to allocation list.
		 */
		ci->flags = CLUSTER_FLAG_NONE;
		__free_cluster(si, ci);
		spin_unlock(&ci->lock);
		ret = true;
		spin_lock(&si->lock);
	}
	spin_unlock(&si->lock);
	return ret;
}

/* Work item handler for si->discard_work: run the scheduled discards. */
static void swap_discard_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, discard_work);

	swap_do_scheduled_discard(si);
}

/* percpu_ref callback for si->users: signals si->comp. */
static void swap_users_ref_free(struct percpu_ref *ref)
{
	struct swap_info_struct *si;

	si = container_of(ref, struct swap_info_struct, users);
	complete(&si->comp);
}

/*
 * Must be called after freeing if ci->count == 0, moves the cluster to free
 * or discard list.
 */
static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
	VM_BUG_ON(ci->count != 0);
	VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
	lockdep_assert_held(&ci->lock);

	/*
	 * If the swap is discardable, prepare discard the cluster
	 * instead of free it immediately. The cluster will be freed
	 * after discard.
	 */
	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
		swap_cluster_schedule_discard(si, ci);
		return;
	}

	__free_cluster(si, ci);
}

/*
 * Must be called after freeing if ci->count != 0, moves the cluster to
 * nonfull list.
 */
static void partial_free_cluster(struct swap_info_struct *si,
				 struct swap_cluster_info *ci)
{
	VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER);
	lockdep_assert_held(&ci->lock);

	if (ci->flags != CLUSTER_FLAG_NONFULL)
		move_cluster(si, ci, &si->nonfull_clusters[ci->order],
			     CLUSTER_FLAG_NONFULL);
}

/*
 * Must be called after allocation, moves the cluster to full or frag list.
 * Note: allocation doesn't acquire si lock, and may drop the ci lock for
 * reclaim, so the cluster could be anywhere when called.
*/ static void relocate_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { lockdep_assert_held(&ci->lock); /* Discard cluster must remain off-list or on discard list */ if (cluster_is_discard(ci)) return; if (!ci->count) { if (ci->flags != CLUSTER_FLAG_FREE) free_cluster(si, ci); } else if (ci->count != SWAPFILE_CLUSTER) { if (ci->flags != CLUSTER_FLAG_FRAG) move_cluster(si, ci, &si->frag_clusters[ci->order], CLUSTER_FLAG_FRAG); } else { if (ci->flags != CLUSTER_FLAG_FULL) move_cluster(si, ci, &si->full_clusters, CLUSTER_FLAG_FULL); } } /* * The cluster corresponding to @offset will be accounted as having one bad * slot. The cluster will not be added to the free cluster list, and its * usage counter will be increased by 1. Only used for initialization. */ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, struct swap_cluster_info *cluster_info, unsigned int offset, bool mask) { unsigned int ci_off = offset % SWAPFILE_CLUSTER; unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; struct swap_table *table; int ret = 0; /* si->max may got shrunk by swap swap_activate() */ if (offset >= si->max && !mask) { pr_debug("Ignoring bad slot %u (max: %u)\n", offset, si->max); return 0; } /* * Account it, skip header slot: si->pages is initiated as * si->max - 1. Also skip the masking of last cluster, * si->pages doesn't include that part. */ if (offset && !mask) si->pages -= 1; if (!si->pages) { pr_warn("Empty swap-file\n"); return -EINVAL; } ci = cluster_info + idx; if (!ci->table) { table = swap_table_alloc(GFP_KERNEL); if (!table) return -ENOMEM; rcu_assign_pointer(ci->table, table); } spin_lock(&ci->lock); /* Check for duplicated bad swap slots. 
*/ if (__swap_table_xchg(ci, ci_off, SWP_TB_BAD) != SWP_TB_NULL) { pr_warn("Duplicated bad slot offset %d\n", offset); ret = -EINVAL; } else { ci->count++; } spin_unlock(&ci->lock); WARN_ON(ci->count > SWAPFILE_CLUSTER); WARN_ON(ci->flags); return ret; } /* * Reclaim drops the ci lock, so the cluster may become unusable (freed or * stolen by a lower order). @usable will be set to false if that happens. */ static bool cluster_reclaim_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long start, unsigned int order, bool *usable) { unsigned int nr_pages = 1 << order; unsigned long offset = start, end = start + nr_pages; unsigned long swp_tb; spin_unlock(&ci->lock); do { swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); if (swp_tb_get_count(swp_tb)) break; if (swp_tb_is_folio(swp_tb)) if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0) break; } while (++offset < end); spin_lock(&ci->lock); /* * We just dropped ci->lock so cluster could be used by another * order or got freed, check if it's still usable or empty. */ if (!cluster_is_usable(ci, order)) { *usable = false; return false; } *usable = true; /* Fast path, no need to scan if the whole cluster is empty */ if (cluster_is_empty(ci)) return true; /* * Recheck the range no matter reclaim succeeded or not, the slot * could have been be freed while we are not holding the lock. 
*/ for (offset = start; offset < end; offset++) { swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); if (!swp_tb_is_null(swp_tb)) return false; } return true; } static bool cluster_scan_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long offset, unsigned int nr_pages, bool *need_reclaim) { unsigned int ci_off = offset % SWAPFILE_CLUSTER; unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; do { swp_tb = __swap_table_get(ci, ci_off); if (swp_tb_is_null(swp_tb)) continue; if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { if (!vm_swap_full()) return false; *need_reclaim = true; continue; } /* Slot with zero count can only be NULL or folio */ VM_WARN_ON(!swp_tb_get_count(swp_tb)); return false; } while (++ci_off < ci_end); return true; } static bool __swap_cluster_alloc_entries(struct swap_info_struct *si, struct swap_cluster_info *ci, struct folio *folio, unsigned int ci_off) { unsigned int order; unsigned long nr_pages; lockdep_assert_held(&ci->lock); if (!(si->flags & SWP_WRITEOK)) return false; /* * All mm swap allocation starts with a folio (folio_alloc_swap), * it's also the only allocation path for large orders allocation. * Such swap slots starts with count == 0 and will be increased * upon folio unmap. * * Else, it's a exclusive order 0 allocation for hibernation. * The slot starts with count == 1 and never increases. 
*/
	if (likely(folio)) {
		order = folio_order(folio);
		nr_pages = 1 << order;
		swap_cluster_assert_empty(ci, ci_off, nr_pages, false);
		__swap_cache_add_folio(ci, folio, swp_entry(si->type,
							    ci_off + cluster_offset(si, ci)));
	} else if (IS_ENABLED(CONFIG_HIBERNATION)) {
		order = 0;
		nr_pages = 1;
		swap_cluster_assert_empty(ci, ci_off, 1, false);
		/* Sets a fake shadow as placeholder */
		__swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1));
	} else {
		/* Allocation without folio is only possible with hibernation */
		WARN_ON_ONCE(1);
		return false;
	}

	/*
	 * The first allocation in a cluster makes the
	 * cluster exclusive to this order
	 */
	if (cluster_is_empty(ci))
		ci->order = order;
	ci->count += nr_pages;
	swap_range_alloc(si, nr_pages);

	return true;
}

/*
 * Try use a new cluster for current CPU and allocate from it.
 *
 * Scans @ci in steps of (1 << order) slots, from @offset up to the end of the
 * cluster, for a range that can hold @folio (a single slot if @folio is NULL).
 * Must be called with ci->lock held; the cluster is relocated and unlocked
 * before returning, on every path.
 *
 * The "next" allocation hint of this order is always rewritten on exit: the
 * per-CPU hint for SWP_SOLIDSTATE devices, si->global_cluster otherwise. It is
 * SWAP_ENTRY_INVALID unless an allocation succeeded and the cluster still has
 * room after it.
 *
 * Return: offset of the allocated range, or SWAP_ENTRY_INVALID if nothing was
 * allocated.
 */
static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
					    struct swap_cluster_info *ci,
					    struct folio *folio,
					    unsigned long offset)
{
	unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
	unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
	unsigned int order = likely(folio) ? folio_order(folio) : 0;
	unsigned long end = start + SWAPFILE_CLUSTER;
	unsigned int nr_pages = 1 << order;
	bool need_reclaim, ret, usable;

	lockdep_assert_held(&ci->lock);
	VM_WARN_ON(!cluster_is_usable(ci, order));

	/* Bail out if the cluster cannot take nr_pages more slots */
	if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER)
		goto out;

	/* "end" becomes the last offset at which a full range still fits */
	for (end -= nr_pages; offset <= end; offset += nr_pages) {
		need_reclaim = false;
		if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim))
			continue;
		if (need_reclaim) {
			ret = cluster_reclaim_range(si, ci, offset, order, &usable);
			if (!usable)
				goto out;
			/* Cluster is empty now, restart from its first slot */
			if (cluster_is_empty(ci))
				offset = start;
			/* Reclaim failed but cluster is usable, try next */
			if (!ret)
				continue;
		}
		if (!__swap_cluster_alloc_entries(si, ci, folio, offset % SWAPFILE_CLUSTER))
			break;
		found = offset;
		offset += nr_pages;
		/* Only leave a hint if the cluster can serve another range */
		if (ci->count < SWAPFILE_CLUSTER && offset <= end)
			next = offset;
		break;
	}
out:
	relocate_cluster(si, ci);
	swap_cluster_unlock(ci);
	if (si->flags & SWP_SOLIDSTATE) {
		this_cpu_write(percpu_swap_cluster.offset[order], next);
		this_cpu_write(percpu_swap_cluster.si[order], si);
	} else {
		si->global_cluster->next[order] = next;
	}
	return found;
}

/*
 * Isolate clusters from @list one at a time and try to allocate from each.
 * Only the first isolated cluster is tried unless @scan_all is set, in which
 * case the loop continues until an allocation succeeds or no more clusters
 * can be isolated.
 *
 * Return: the allocated offset, or SWAP_ENTRY_INVALID.
 */
static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
					 struct list_head *list,
					 struct folio *folio,
					 bool scan_all)
{
	unsigned int found = SWAP_ENTRY_INVALID;

	do {
		struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
		unsigned long offset;

		if (!ci)
			break;
		offset = cluster_offset(si, ci);
		/* alloc_swap_scan_cluster() unlocks ci before returning */
		found = alloc_swap_scan_cluster(si, ci, folio, offset);
		if (found)
			break;
	} while (scan_all);

	return found;
}

/*
 * Walk clusters isolated from si->full_clusters and try to reclaim every slot
 * whose table entry is a folio with a zero swap count.
 *
 * @force: if true, the scan budget is the device usage in clusters
 *         (swap_usage_in_pages() / SWAPFILE_CLUSTER) instead of one cluster.
 *         At least one cluster is always scanned if one can be isolated.
 */
static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
	long to_scan = 1;
	unsigned long offset, end;
	struct swap_cluster_info *ci;
	unsigned long swp_tb;
	int nr_reclaim;

	if (force)
		to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;

	while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
		offset = cluster_offset(si, ci);
		end = min(si->max, offset + SWAPFILE_CLUSTER);
		to_scan--;

		while (offset < end) {
			swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
			if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) {
				/* ci->lock is dropped across the reclaim call */
				spin_unlock(&ci->lock);
				nr_reclaim = __try_to_reclaim_swap(si, offset,
								   TTRS_ANYWAY);
				spin_lock(&ci->lock);
				if (nr_reclaim) {
					/* Skip the whole range reported by reclaim */
					offset += abs(nr_reclaim);
					continue;
				}
			}
			offset++;
		}

		/* in case no swap cache is reclaimed */
		if (ci->flags == CLUSTER_FLAG_NONE)
			relocate_cluster(si, ci);

		swap_cluster_unlock(ci);
		if (to_scan <= 0)
			break;
	}
}

/* Work callback for si->reclaim_work: forced reclaim of full clusters. */
static void swap_reclaim_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, reclaim_work);

	swap_reclaim_full_clusters(si, true);
}

/*
 * Try to allocate swap entries with specified order and try set a new
 * cluster for current CPU too.
 *
 * For devices without SWP_SOLIDSTATE the whole attempt runs under
 * si->global_cluster_lock, and the device-wide "next" hint is tried first.
 * After that the cluster lists are tried in order: free (first if the device
 * has SWP_PAGE_DISCARD, otherwise after nonfull), nonfull, frag, and finally,
 * for order 0 only, the frag and nonfull lists of every higher order.
 *
 * Return: the allocated offset, or SWAP_ENTRY_INVALID if every list failed.
 * Returns 0 when a large order is requested on a device without SWP_BLKDEV.
 */
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
					      struct folio *folio)
{
	struct swap_cluster_info *ci;
	unsigned int order = likely(folio) ? folio_order(folio) : 0;
	unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;

	/*
	 * Swapfile is not block device so unable
	 * to allocate large entries.
	 */
	if (order && !(si->flags & SWP_BLKDEV))
		return 0;

	if (!(si->flags & SWP_SOLIDSTATE)) {
		/* Serialize HDD SWAP allocation for each device. */
		spin_lock(&si->global_cluster_lock);
		offset = si->global_cluster->next[order];
		if (offset == SWAP_ENTRY_INVALID)
			goto new_cluster;

		ci = swap_cluster_lock(si, offset);
		/* Cluster could have been used by another order */
		if (cluster_is_usable(ci, order)) {
			if (cluster_is_empty(ci))
				offset = cluster_offset(si, ci);
			found = alloc_swap_scan_cluster(si, ci, folio, offset);
		} else {
			swap_cluster_unlock(ci);
		}
		if (found)
			goto done;
	}

new_cluster:
	/*
	 * If the device need discard, prefer new cluster over nonfull
	 * to spread out the writes.
	 */
	if (si->flags & SWP_PAGE_DISCARD) {
		found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
		if (found)
			goto done;
	}

	if (order < PMD_ORDER) {
		found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], folio, true);
		if (found)
			goto done;
	}

	if (!(si->flags & SWP_PAGE_DISCARD)) {
		found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
		if (found)
			goto done;
	}

	/* Try reclaim full clusters if free and nonfull lists are drained */
	if (vm_swap_full())
		swap_reclaim_full_clusters(si, false);

	if (order < PMD_ORDER) {
		/*
		 * Scan only one fragment cluster is good enough. Order 0
		 * allocation will surely success, and large allocation
		 * failure is not critical. Scanning one cluster still
		 * keeps the list rotated and reclaimed (for clean swap cache).
		 */
		found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false);
		if (found)
			goto done;
	}

	/* Large allocations stop here; only order 0 may steal below */
	if (order)
		goto done;

	/* Order 0 stealing from higher order */
	for (int o = 1; o < SWAP_NR_ORDERS; o++) {
		/*
		 * Clusters here have at least one usable slots and can't fail order 0
		 * allocation, but reclaim may drop si->lock and race with another user.
		 */
		found = alloc_swap_scan_list(si, &si->frag_clusters[o], folio, true);
		if (found)
			goto done;

		found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], folio, true);
		if (found)
			goto done;
	}
done:
	if (!(si->flags & SWP_SOLIDSTATE))
		spin_unlock(&si->global_cluster_lock);

	return found;
}

/*
 * SWAP_USAGE_OFFLIST_BIT can only be set by this helper.
 *
 * @swapoff: true when called for swapoff (si->lock must be held): the device
 *           is removed unconditionally and SWP_WRITEOK is cleared. Otherwise
 *           the device is removed only if the cmpxchg below succeeds.
 */
static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
{
	unsigned long pages;

	spin_lock(&swap_avail_lock);

	if (swapoff) {
		/*
		 * Forcefully remove it. Clear the SWP_WRITEOK flags for
		 * swapoff here so it's synchronized by both si->lock and
		 * swap_avail_lock, to ensure the result can be seen by
		 * add_to_avail_list.
		 */
		lockdep_assert_held(&si->lock);
		si->flags &= ~SWP_WRITEOK;
		atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
	} else {
		/*
		 * If not called by swapoff, take it off-list only if it's
		 * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
		 * si->inuse_pages == pages), any concurrent slot freeing,
		 * or device already removed from plist by someone else
		 * will make this return false.
		 */
		pages = si->pages;
		if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
					     pages | SWAP_USAGE_OFFLIST_BIT))
			goto skip;
	}

	plist_del(&si->avail_list, &swap_avail_head);

skip:
	spin_unlock(&swap_avail_lock);
}

/*
 * SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper.
 *
 * @swapon: true when called for swapon (si->lock must be held): SWP_WRITEOK
 *          is set here. Otherwise nothing is done unless SWP_WRITEOK is
 *          already set.
 */
static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
{
	long val;
	unsigned long pages;

	spin_lock(&swap_avail_lock);

	/* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
	if (swapon) {
		lockdep_assert_held(&si->lock);
		si->flags |= SWP_WRITEOK;
	} else {
		if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
			goto skip;
	}

	/* Nothing to do if the off-list bit is not set */
	if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
		goto skip;

	val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);

	/*
	 * When device is full and device is on the plist, only one updater will
	 * see (inuse_pages == si->pages) and will call del_from_avail_list. If
	 * that updater happen to be here, just skip adding.
	 */
	pages = si->pages;
	if (val == pages) {
		/* Just like the cmpxchg in del_from_avail_list */
		if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
					    pages | SWAP_USAGE_OFFLIST_BIT))
			goto skip;
	}

	plist_add(&si->avail_list, &swap_avail_head);

skip:
	spin_unlock(&swap_avail_lock);
}

/*
 * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
 * within each cluster, so the total contribution to the global counter should
 * always be positive and cannot exceed the total number of usable slots.
*/
/*
 * Add @nr_entries to si->inuse_pages.
 *
 * Return: true if this update made the counter equal to si->pages, in which
 * case del_from_avail_list() has been called; false otherwise.
 */
static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
{
	long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);

	/*
	 * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
	 * remove it from the plist.
	 */
	if (unlikely(val == si->pages)) {
		del_from_avail_list(si, false);
		return true;
	}

	return false;
}

/* Subtract @nr_entries from si->inuse_pages, re-listing the device if needed. */
static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
{
	long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);

	/*
	 * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
	 * add it to the plist.
	 */
	if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
		add_to_avail_list(si, false);
}

/*
 * Account an allocation of @nr_entries slots on @si: bump the per-device
 * usage, schedule si->reclaim_work if that filled the device while
 * vm_swap_full() is true, and subtract the slots from nr_swap_pages.
 */
static void swap_range_alloc(struct swap_info_struct *si,
			     unsigned int nr_entries)
{
	if (swap_usage_add(si, nr_entries)) {
		if (vm_swap_full())
			schedule_work(&si->reclaim_work);
	}
	atomic_long_sub(nr_entries, &nr_swap_pages);
}

/*
 * Clean up and un-account @nr_entries freed slots starting at @offset:
 * clear their zeromap bits, invalidate zswap and arch state, notify the block
 * driver when it provides swap_slot_free_notify, then give the slots back to
 * nr_swap_pages and to the per-device usage counter.
 */
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
			    unsigned int nr_entries)
{
	unsigned long end = offset + nr_entries - 1;
	void (*swap_slot_free_notify)(struct block_device *, unsigned long);
	unsigned int i;

	/*
	 * Use atomic clear_bit operations only on zeromap instead of non-atomic
	 * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes.
	 */
	for (i = 0; i < nr_entries; i++) {
		clear_bit(offset + i, si->zeromap);
		zswap_invalidate(swp_entry(si->type, offset + i));
	}

	/* The notifier is only looked up for block devices */
	if (si->flags & SWP_BLKDEV)
		swap_slot_free_notify =
			si->bdev->bd_disk->fops->swap_slot_free_notify;
	else
		swap_slot_free_notify = NULL;
	while (offset <= end) {
		arch_swap_invalidate_page(si->type, offset);
		if (swap_slot_free_notify)
			swap_slot_free_notify(si->bdev, offset);
		offset++;
	}

	/*
	 * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
	 * only after the above cleanups are done.
	 */
	smp_wmb();
	atomic_long_add(nr_entries, &nr_swap_pages);
	swap_usage_sub(si, nr_entries);
}

/*
 * Try to take a live reference on si->users.
 *
 * Return: true if the reference was taken (drop it with put_swap_device()),
 * false if percpu_ref_tryget_live() failed.
 */
static bool get_swap_device_info(struct swap_info_struct *si)
{
	if (!percpu_ref_tryget_live(&si->users))
		return false;
	/*
	 * Guarantee the si->users are checked before accessing other
	 * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is
	 * up to dated.
	 *
	 * Paired with the spin_unlock() after setup_swap_info() in
	 * enable_swap_info(), and smp_wmb() in swapoff.
	 */
	smp_rmb();
	return true;
}

/*
 * Fast path try to get swap entries with specified order from current
 * CPU's swap entry pool (a cluster).
 *
 * Return: true if @folio is in the swap cache afterwards. false if there is
 * no usable per-CPU hint, the device could not be pinned, or the allocation
 * did not succeed.
 */
static bool swap_alloc_fast(struct folio *folio)
{
	unsigned int order = folio_order(folio);
	struct swap_cluster_info *ci;
	struct swap_info_struct *si;
	unsigned int offset;

	/*
	 * Once allocated, swap_info_struct will never be completely freed,
	 * so checking it's liveness by get_swap_device_info is enough.
	 */
	si = this_cpu_read(percpu_swap_cluster.si[order]);
	offset = this_cpu_read(percpu_swap_cluster.offset[order]);
	if (!si || !offset || !get_swap_device_info(si))
		return false;

	ci = swap_cluster_lock(si, offset);
	if (cluster_is_usable(ci, order)) {
		if (cluster_is_empty(ci))
			offset = cluster_offset(si, ci);
		/* Unlocks ci; the result is checked via the folio flag below */
		alloc_swap_scan_cluster(si, ci, folio, offset);
	} else {
		swap_cluster_unlock(ci);
	}

	put_swap_device(si);
	return folio_test_swapcache(folio);
}

/*
 * Rotate the device and switch to a new cluster.
 *
 * Walks swap_avail_head and tries each device in turn. Returns as soon as
 * @folio is in the swap cache, or after the first device that could be pinned
 * if @folio is large. The caller checks the folio's swap cache flag for the
 * outcome.
 */
static void swap_alloc_slow(struct folio *folio)
{
	struct swap_info_struct *si, *next;

	spin_lock(&swap_avail_lock);
start_over:
	plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
		/* Rotate the device and switch to a new cluster */
		plist_requeue(&si->avail_list, &swap_avail_head);
		spin_unlock(&swap_avail_lock);
		if (get_swap_device_info(si)) {
			cluster_alloc_swap_entry(si, folio);
			put_swap_device(si);
			if (folio_test_swapcache(folio))
				return;
			/* Large folios are not retried on other devices */
			if (folio_test_large(folio))
				return;
		}

		spin_lock(&swap_avail_lock);
		/*
		 * if we got here, it's likely that si was almost full before,
		 * multiple callers probably all tried to get a page from the
		 * same si and it filled up before we could get one; or, the si
		 * filled up between us dropping swap_avail_lock.
		 * Since we dropped the swap_avail_lock, the swap_avail_list
		 * may have been modified; so if next is still in the
		 * swap_avail_head list then try it, otherwise start over if we
		 * have not gotten any slots.
		 */
		if (plist_node_empty(&next->avail_list))
			goto start_over;
	}
	spin_unlock(&swap_avail_lock);
}

/*
 * Discard pending clusters in a synchronized way when under high pressure.
 * Stops at the first device for which swap_do_scheduled_discard() reports
 * success.
 * Return: true if any cluster is discarded.
 */
static bool swap_sync_discard(void)
{
	bool ret = false;
	struct swap_info_struct *si, *next;

	spin_lock(&swap_lock);
start_over:
	plist_for_each_entry_safe(si, next, &swap_active_head, list) {
		/* swap_lock is not held across the discard */
		spin_unlock(&swap_lock);
		if (get_swap_device_info(si)) {
			if (si->flags & SWP_PAGE_DISCARD)
				ret = swap_do_scheduled_discard(si);
			put_swap_device(si);
		}
		if (ret)
			return true;

		spin_lock(&swap_lock);
		/* The list may have changed while unlocked */
		if (plist_node_empty(&next->list))
			goto start_over;
	}
	spin_unlock(&swap_lock);

	return false;
}

/*
 * Allocate a zeroed extend table of SWAPFILE_CLUSTER elements for @ci.
 *
 * The allocation is done before taking ci->lock; if a table is already
 * installed by the time the lock is held, the new one is freed. @si is not
 * used by the body.
 *
 * Return: 0 on success (including when a table already existed), -ENOMEM if
 * the allocation failed.
 */
static int swap_extend_table_alloc(struct swap_info_struct *si,
				   struct swap_cluster_info *ci, gfp_t gfp)
{
	void *table;

	table = kzalloc(sizeof(ci->extend_table[0]) * SWAPFILE_CLUSTER, gfp);
	if (!table)
		return -ENOMEM;

	spin_lock(&ci->lock);
	if (!ci->extend_table)
		ci->extend_table = table;
	else
		kfree(table);
	spin_unlock(&ci->lock);
	return 0;
}

/*
 * Allocate the extend table of the cluster that holds @entry, using @gfp.
 *
 * Return: 0 on success, and also 0 if get_swap_device() failed for @entry;
 * -ENOMEM if the table allocation failed.
 */
int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp)
{
	int ret;
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);

	si = get_swap_device(entry);
	if (!si)
		return 0;

	ci = __swap_offset_to_cluster(si, offset);
	ret = swap_extend_table_alloc(si, ci, gfp);

	put_swap_device(si);
	return ret;
}

/* Free ci->extend_table if it exists and every element of it is zero. */
static void swap_extend_table_try_free(struct swap_cluster_info *ci)
{
	unsigned long i;
	bool can_free = true;

	if (!ci->extend_table)
		return;

	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
		if (ci->extend_table[i])
			can_free = false;
	}

	if (can_free) {
		kfree(ci->extend_table);
		ci->extend_table = NULL;
	}
}

/*
 * Decrease the swap count of one slot, without freeing it.
 *
 * Must be called with ci->lock held. A count of SWP_TB_COUNT_MAX in the swap
 * table means the real count is kept in ci->extend_table[ci_off].
 */
static void __swap_cluster_put_entry(struct swap_cluster_info *ci,
				     unsigned int ci_off)
{
	int count;
	unsigned long swp_tb;

	lockdep_assert_held(&ci->lock);
	swp_tb = __swap_table_get(ci, ci_off);
	count = __swp_tb_get_count(swp_tb);

	VM_WARN_ON_ONCE(count <= 0);
	VM_WARN_ON_ONCE(count > SWP_TB_COUNT_MAX);

	if (count == SWP_TB_COUNT_MAX) {
		count = ci->extend_table[ci_off];
		/* Overflow starts with SWP_TB_COUNT_MAX */
		VM_WARN_ON_ONCE(count < SWP_TB_COUNT_MAX);
		count--;
		if (count == (SWP_TB_COUNT_MAX - 1)) {
			/* Back below the overflow mark: count moves into the table */
			ci->extend_table[ci_off] = 0;
			__swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count));
			swap_extend_table_try_free(ci);
		} else {
			ci->extend_table[ci_off] = count;
		}
	} else {
		__swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count));
	}
}

/**
 * swap_put_entries_cluster - Decrease the swap count of slots within one cluster
 * @si: The swap device.
 * @offset: start offset of slots.
 * @nr: number of slots.
 * @reclaim_cache: if true, also reclaim the swap cache if slots are freed.
 *
 * This helper decreases the swap count of a set of slots and tries to
 * batch free them. Also reclaims the swap cache if @reclaim_cache is true.
 *
 * Context: The specified slots must be pinned by existing swap count or swap
 * cache reference, so they won't be released until this helper returns.
 */
static void swap_put_entries_cluster(struct swap_info_struct *si,
				     pgoff_t offset, int nr,
				     bool reclaim_cache)
{
	struct swap_cluster_info *ci;
	unsigned int ci_off, ci_end;
	pgoff_t end = offset + nr;
	bool need_reclaim = false;
	unsigned int nr_reclaimed;
	unsigned long swp_tb;
	/* Start of the pending run of slots to batch free, -1 if none */
	int ci_batch = -1;

	ci = swap_cluster_lock(si, offset);
	ci_off = offset % SWAPFILE_CLUSTER;
	ci_end = ci_off + nr;
	do {
		swp_tb = __swap_table_get(ci, ci_off);
		if (swp_tb_get_count(swp_tb) == 1) {
			/* count == 1 and non-cached slots will be batch freed. */
			if (!swp_tb_is_folio(swp_tb)) {
				if (ci_batch == -1)
					ci_batch = ci_off;
				continue;
			}
			/* count will be 0 after put, slot can be reclaimed */
			need_reclaim = true;
		}
		/*
		 * A count != 1 or cached slot can't be freed. Put its swap
		 * count and then free the interrupted pending batch. Cached
		 * slots will be freed when folio is removed from swap cache
		 * (__swap_cache_del_folio).
		 */
		__swap_cluster_put_entry(ci, ci_off);
		if (ci_batch != -1) {
			__swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch);
			ci_batch = -1;
		}
	} while (++ci_off < ci_end);

	/* Free the batch that ran up to the end of the range, if any */
	if (ci_batch != -1)
		__swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch);
	swap_cluster_unlock(ci);

	if (!need_reclaim || !reclaim_cache)
		return;

	/* Reclaim pass, done after the cluster lock has been released */
	do {
		nr_reclaimed = __try_to_reclaim_swap(si, offset,
						     TTRS_UNMAPPED | TTRS_FULL);
		offset++;
		if (nr_reclaimed)
			offset = round_up(offset, abs(nr_reclaimed));
	} while (offset < end);
}

/*
 * Increase the swap count of one slot.
 *
 * Must be called with ci->lock held.
 *
 * Return: 0 on success, -EINVAL if the slot is bad, -ENOENT if the slot has
 * neither a count nor a folio, -ENOMEM if the count would reach
 * SWP_TB_COUNT_MAX but the cluster has no extend table.
 */
static int __swap_cluster_dup_entry(struct swap_cluster_info *ci,
				    unsigned int ci_off)
{
	int count;
	unsigned long swp_tb;

	lockdep_assert_held(&ci->lock);
	swp_tb = __swap_table_get(ci, ci_off);
	/* Bad or special slots can't be handled */
	if (WARN_ON_ONCE(swp_tb_is_bad(swp_tb)))
		return -EINVAL;
	count = __swp_tb_get_count(swp_tb);
	/* Must be either cached or have a count already */
	if (WARN_ON_ONCE(!count && !swp_tb_is_folio(swp_tb)))
		return -ENOENT;

	if (likely(count < (SWP_TB_COUNT_MAX - 1))) {
		__swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count + 1));
		VM_WARN_ON_ONCE(ci->extend_table && ci->extend_table[ci_off]);
	} else if (count == (SWP_TB_COUNT_MAX - 1)) {
		/* Crossing into overflow: the count moves to the extend table */
		if (ci->extend_table) {
			VM_WARN_ON_ONCE(ci->extend_table[ci_off]);
			ci->extend_table[ci_off] = SWP_TB_COUNT_MAX;
			__swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, SWP_TB_COUNT_MAX));
		} else {
			return -ENOMEM;
		}
	} else if (count == SWP_TB_COUNT_MAX) {
		VM_WARN_ON_ONCE(ci->extend_table[ci_off] >=
				type_max(typeof(ci->extend_table[0])));
		++ci->extend_table[ci_off];
	} else {
		/* Never happens unless counting went wrong */
		WARN_ON_ONCE(1);
	}

	return 0;
}

/**
 * swap_dup_entries_cluster: Increase the swap count of slots within one cluster.
 * @si: The swap device.
 * @offset: start offset of slots.
 * @nr: number of slots.
 *
 * If a slot needs an extend table that is missing, the cluster lock is
 * dropped, the table is allocated with GFP_ATOMIC, and the loop resumes at
 * the same slot. On failure, every slot already incremented by this call is
 * decremented again.
 *
 * Context: The specified slots must be pinned by existing swap count or swap
 * cache reference, so they won't be released until this helper returns.
 * Return: 0 on success. -ENOMEM if the swap count maxed out (SWP_TB_COUNT_MAX)
 * and failed to allocate an extended table, -EINVAL if any entry is bad entry,
 * -ENOENT if any entry has neither a swap count nor a cached folio.
 */
static int swap_dup_entries_cluster(struct swap_info_struct *si,
				    pgoff_t offset, int nr)
{
	int err;
	struct swap_cluster_info *ci;
	unsigned int ci_start, ci_off, ci_end;

	ci_start = offset % SWAPFILE_CLUSTER;
	ci_end = ci_start + nr;
	ci_off = ci_start;
	ci = swap_cluster_lock(si, offset);
restart:
	do {
		err = __swap_cluster_dup_entry(ci, ci_off);
		if (unlikely(err)) {
			if (err == -ENOMEM) {
				spin_unlock(&ci->lock);
				err = swap_extend_table_alloc(si, ci, GFP_ATOMIC);
				spin_lock(&ci->lock);
				/* ci_off is unchanged, so the same slot is retried */
				if (!err)
					goto restart;
			}
			goto failed;
		}
	} while (++ci_off < ci_end);
	swap_cluster_unlock(ci);
	return 0;
failed:
	/* Roll back the slots in [ci_start, ci_off) */
	while (ci_off-- > ci_start)
		__swap_cluster_put_entry(ci, ci_off);
	swap_extend_table_try_free(ci);
	swap_cluster_unlock(ci);
	return err;
}

/**
 * folio_alloc_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: 0 if the folio was added to the swap cache. -EAGAIN if the folio is
 * large and CONFIG_THP_SWAP is disabled, -EINVAL if the folio is larger than
 * a cluster, -ENOMEM if the folio is not in the swap cache afterwards (the
 * allocation failed, or the folio was removed again because the memcg swap
 * charge failed).
 */
int folio_alloc_swap(struct folio *folio)
{
	unsigned int order = folio_order(folio);
	unsigned int size = 1 << order;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

	if (order) {
		/*
		 * Reject large allocation when THP_SWAP is disabled,
		 * the caller should split the folio and try again.
		 */
		if (!IS_ENABLED(CONFIG_THP_SWAP))
			return -EAGAIN;

		/*
		 * Allocation size should never exceed cluster size
		 * (HPAGE_PMD_SIZE).
		 */
		if (size > SWAPFILE_CLUSTER) {
			VM_WARN_ON_ONCE(1);
			return -EINVAL;
		}
	}

again:
	local_lock(&percpu_swap_cluster.lock);
	if (!swap_alloc_fast(folio))
		swap_alloc_slow(folio);
	local_unlock(&percpu_swap_cluster.lock);

	/* Order 0 only: retry as long as a synchronous discard frees clusters */
	if (!order && unlikely(!folio_test_swapcache(folio))) {
		if (swap_sync_discard())
			goto again;
	}

	/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
	if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap)))
		swap_cache_del_folio(folio);

	if (unlikely(!folio_test_swapcache(folio)))
		return -ENOMEM;

	return 0;
}

/**
 * folio_dup_swap() - Increase swap count of swap entries of a folio.
 * @folio: folio with swap entries bounded.
 * @subpage: if not NULL, only increase the swap count of this subpage.
 *
 * Typically called when the folio is unmapped and have its swap entry to
 * take its place: Swap entries allocated to a folio has count == 0 and pinned
 * by swap cache. The swap cache pin doesn't increase the swap count. This
 * helper sets the initial count == 1 and increases the count as the folio is
 * unmapped and swap entries referencing the slots are generated to replace
 * the folio.
 *
 * Context: Caller must ensure the folio is locked and in the swap cache.
 * NOTE: The caller also has to ensure there is no raced call to
 * swap_put_entries_direct on its swap entry before this helper returns, or
 * the swap count may underflow.
 * Return: the result of swap_dup_entries_cluster(): 0 on success or a
 * negative error code.
 */
int folio_dup_swap(struct folio *folio, struct page *subpage)
{
	swp_entry_t entry = folio->swap;
	unsigned long nr_pages = folio_nr_pages(folio);

	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio);

	/* Restrict the range to the single entry backing @subpage */
	if (subpage) {
		entry.val += folio_page_idx(folio, subpage);
		nr_pages = 1;
	}

	return swap_dup_entries_cluster(swap_entry_to_info(entry),
					swp_offset(entry), nr_pages);
}

/**
 * folio_put_swap() - Decrease swap count of swap entries of a folio.
* @folio: folio with swap entries bounded, must be in swap cache and locked. * @subpage: if not NULL, only decrease the swap count of this subpage. * * This won't free the swap slots even if swap count drops to zero, they are * still pinned by the swap cache. User may call folio_free_swap to free them. * Context: Caller must ensure the folio is locked and in the swap cache. */ void folio_put_swap(struct folio *folio, struct page *subpage) { swp_entry_t entry = folio->swap; unsigned long nr_pages = folio_nr_pages(folio); struct swap_info_struct *si = __swap_entry_to_info(entry); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio); if (subpage) { entry.val += folio_page_idx(folio, subpage); nr_pages = 1; } swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false); } /* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in swap cache is locked, RCU * reader side is locked, etc., the swap entry may become invalid * because of swapoff. Then, we need to enclose all swap related * functions with get_swap_device() and put_swap_device(), unless the * swap functions call get/put_swap_device() by themselves. * * RCU reader side lock (including any spinlock) is sufficient to * prevent swapoff, because synchronize_rcu() is called in swapoff() * before freeing data structures. * * Check whether swap entry is valid in the swap device. If so, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until * put_swap_device() is called. Otherwise return NULL. * * Notice that swapoff or swapoff+swapon can still happen before the * percpu_ref_tryget_live() in get_swap_device() or after the * percpu_ref_put() in put_swap_device() if there isn't any other way * to prevent swapoff. The caller must be prepared for that. For * example, the following situation is possible. * * CPU1 CPU2 * do_swap_page() * ... 
swapoff+swapon * swap_cache_alloc_folio() * swap_cache_add_folio() * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before * changing partly because the specified swap entry may be for another * swap device which has been swapoff. And in do_swap_page(), after * the page is read from the swap device, the PTE is verified not * changed with the page table locked to check whether the swap device * has been swapoff or swapoff+swapon. */ struct swap_info_struct *get_swap_device(swp_entry_t entry) { struct swap_info_struct *si; unsigned long offset; if (!entry.val) goto out; si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (!get_swap_device_info(si)) goto out; offset = swp_offset(entry); if (offset >= si->max) goto put_out; return si; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; put_out: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); percpu_ref_put(&si->users); return NULL; } /* * Free a set of swap slots after their swap count dropped to zero, or will be * zero after putting the last ref (saves one __swap_cluster_put_entry call). 
*/ void __swap_cluster_free_entries(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int ci_start, unsigned int nr_pages) { unsigned long old_tb; unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages; unsigned long offset = cluster_offset(si, ci) + ci_start; VM_WARN_ON(ci->count < nr_pages); ci->count -= nr_pages; do { old_tb = __swap_table_get(ci, ci_off); /* Release the last ref, or after swap cache is dropped */ VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1); __swap_table_set(ci, ci_off, null_to_swp_tb()); } while (++ci_off < ci_end); mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages); swap_range_free(si, offset, nr_pages); swap_cluster_assert_empty(ci, ci_start, nr_pages, false); if (!ci->count) free_cluster(si, ci); else partial_free_cluster(si, ci); } int __swap_count(swp_entry_t entry) { struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); unsigned int ci_off = swp_cluster_offset(entry); return swp_tb_get_count(__swap_table_get(ci, ci_off)); } /** * swap_entry_swapped - Check if the swap entry is swapped. * @si: the swap device. * @entry: the swap entry. */ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; unsigned long swp_tb; ci = swap_cluster_lock(si, offset); swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); swap_cluster_unlock(ci); return swp_tb_get_count(swp_tb) > 0; } /* * How many references to @entry are currently swapped out? * This returns exact answer. 
*/ int swp_swapcount(swp_entry_t entry) { struct swap_info_struct *si; struct swap_cluster_info *ci; unsigned long swp_tb; int count; si = get_swap_device(entry); if (!si) return 0; ci = swap_cluster_lock(si, swp_offset(entry)); swp_tb = __swap_table_get(ci, swp_cluster_offset(entry)); count = swp_tb_get_count(swp_tb); if (count == SWP_TB_COUNT_MAX) count = ci->extend_table[swp_cluster_offset(entry)]; swap_cluster_unlock(ci); put_swap_device(si); return count < 0 ? 0 : count; } /* * folio_maybe_swapped - Test if a folio covers any swap slot with count > 0. * * Check if a folio is swapped. Holding the folio lock ensures the folio won't * go from not-swapped to swapped because the initial swap count increment can * only be done by folio_dup_swap, which also locks the folio. But a concurrent * decrease of swap count is possible through swap_put_entries_direct, so this * may return a false positive. * * Context: Caller must ensure the folio is locked and in the swap cache. */ static bool folio_maybe_swapped(struct folio *folio) { swp_entry_t entry = folio->swap; struct swap_cluster_info *ci; unsigned int ci_off, ci_end; bool ret = false; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); ci = __swap_entry_to_cluster(entry); ci_off = swp_cluster_offset(entry); ci_end = ci_off + folio_nr_pages(folio); /* * Extra locking not needed, folio lock ensures its swap entries * won't be released, the backing data won't be gone either. 
*/ rcu_read_lock(); do { if (__swp_tb_get_count(__swap_table_get(ci, ci_off))) { ret = true; break; } } while (++ci_off < ci_end); rcu_read_unlock(); return ret; } static bool folio_swapcache_freeable(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) return false; if (folio_test_writeback(folio)) return false; /* * Once hibernation has begun to create its image of memory, * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free * the swap from a folio which has already been recorded in the * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) return false; return true; } /** * folio_free_swap() - Free the swap space used for this folio. * @folio: The folio to remove. * * If swap is getting full, or if there are no more mappings of this folio, * then call folio_free_swap to free its swap space. * * Return: true if we were able to release the swap space. */ bool folio_free_swap(struct folio *folio) { if (!folio_swapcache_freeable(folio)) return false; if (folio_maybe_swapped(folio)) return false; swap_cache_del_folio(folio); folio_set_dirty(folio); return true; } /** * swap_put_entries_direct() - Release reference on range of swap entries and * reclaim their cache if no more references remain. * @entry: First entry of range. * @nr: Number of entries in range. * * For each swap entry in the contiguous range, release a reference. If any swap * entries become free, try to reclaim their underlying folios, if present. 
The
 * offset range is defined by [entry.offset, entry.offset + nr).
 *
 * Context: Caller must ensure there is no race condition on the reference
 * owner. e.g., locking the PTL of a PTE containing the entry being released.
 */
void swap_put_entries_direct(swp_entry_t entry, int nr)
{
	const unsigned long start_offset = swp_offset(entry);
	const unsigned long end_offset = start_offset + nr;
	unsigned long offset, cluster_end;
	struct swap_info_struct *si;

	si = get_swap_device(entry);
	if (WARN_ON_ONCE(!si))
		return;
	/* The range must not run past the end of the device */
	if (WARN_ON_ONCE(end_offset > si->max))
		goto out;

	/* Put entries and reclaim cache in each cluster */
	offset = start_offset;
	do {
		/* Clamp this step at the next cluster boundary */
		cluster_end = min(round_up(offset + 1, SWAPFILE_CLUSTER), end_offset);
		swap_put_entries_cluster(si, offset, cluster_end - offset, true);
		offset = cluster_end;
	} while (offset < end_offset);
out:
	put_swap_device(si);
}

#ifdef CONFIG_HIBERNATION
/*
 * Allocate a slot for hibernation.
 *
 * @type: swap type (index) of the device to allocate from.
 *
 * Return: the allocated entry, or a zeroed entry if @type has no device, the
 * device could not be pinned, it lacks SWP_WRITEOK, or no slot was found.
 */
swp_entry_t swap_alloc_hibernation_slot(int type)
{
	struct swap_info_struct *pcp_si, *si = swap_type_to_info(type);
	unsigned long pcp_offset, offset = SWAP_ENTRY_INVALID;
	struct swap_cluster_info *ci;
	swp_entry_t entry = {0};

	if (!si)
		goto fail;

	/* This is called for allocating swap entry, not cache */
	if (get_swap_device_info(si)) {
		if (si->flags & SWP_WRITEOK) {
			/*
			 * Try the local cluster first if it matches the device. If
			 * not, try grab a new cluster and override local cluster.
			 */
			local_lock(&percpu_swap_cluster.lock);
			pcp_si = this_cpu_read(percpu_swap_cluster.si[0]);
			pcp_offset = this_cpu_read(percpu_swap_cluster.offset[0]);
			if (pcp_si == si && pcp_offset) {
				ci = swap_cluster_lock(si, pcp_offset);
				/* A NULL folio requests a single slot */
				if (cluster_is_usable(ci, 0))
					offset = alloc_swap_scan_cluster(si, ci, NULL, pcp_offset);
				else
					swap_cluster_unlock(ci);
			}
			if (!offset)
				offset = cluster_alloc_swap_entry(si, NULL);
			local_unlock(&percpu_swap_cluster.lock);
			if (offset)
				entry = swp_entry(si->type, offset);
		}
		put_swap_device(si);
	}
fail:
	return entry;
}

/*
 * Free a slot allocated by swap_alloc_hibernation_slot.
 *
 * Warns and does nothing if the device of @entry cannot be pinned.
 */
void swap_free_hibernation_slot(swp_entry_t entry)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	pgoff_t offset = swp_offset(entry);

	si = get_swap_device(entry);
	if (WARN_ON(!si))
		return;

	ci = swap_cluster_lock(si, offset);
	/* Drop the swap count, then free the slot, both under the cluster lock */
	__swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER);
	__swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1);
	swap_cluster_unlock(ci);

	/* In theory readahead might add it to the swap cache by accident */
	__try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
	put_swap_device(si);
}

/*
 * Find the swap type that corresponds to given device (if any).
 *
 * @offset - number of the PAGE_SIZE-sized block of the device, starting
 * from 0, in which the swap header is expected to be located.
 *
 * This is needed for the suspend to disk (aka swsusp).
*/ int swap_type_of(dev_t device, sector_t offset) { int type; if (!device) return -1; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; if (device == sis->bdev->bd_dev) { struct swap_extent *se = first_se(sis); if (se->start_block == offset) { spin_unlock(&swap_lock); return type; } } } spin_unlock(&swap_lock); return -ENODEV; } int find_first_swap(dev_t *device) { int type; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; *device = sis->bdev->bd_dev; spin_unlock(&swap_lock); return type; } spin_unlock(&swap_lock); return -ENODEV; } /* * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev * corresponding to given index in swap_info (swap type). */ sector_t swapdev_block(int type, pgoff_t offset) { struct swap_info_struct *si = swap_type_to_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) return 0; se = offset_to_swap_extent(si, offset); return se->start_block + (offset - se->start_page); } /* * Return either the total number of swap pages of given type, or the number * of free pages of that type (depending on @free) * * This is needed for software suspend */ unsigned int count_swap_pages(int type, int free) { unsigned int n = 0; spin_lock(&swap_lock); if ((unsigned int)type < nr_swapfiles) { struct swap_info_struct *sis = swap_info[type]; spin_lock(&sis->lock); if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) n -= swap_usage_in_pages(sis); } spin_unlock(&sis->lock); } spin_unlock(&swap_lock); return n; } #endif /* CONFIG_HIBERNATION */ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) { return pte_same(pte_swp_clear_flags(pte), swp_pte); } /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to * 
force COW, vm_page_prot omits write permission from any private vma.
 */
/*
 * unuse_pte - replace the swap PTE for @entry at @addr with a mapping of
 * @folio.
 *
 * Returns 1 when the PTE was rewritten to map the folio, 0 when no page was
 * mapped (the folio no longer matches @entry, the PTE no longer holds
 * @entry, or a hwpoison/poisoned marker entry was installed instead), and
 * -ENOMEM when ksm_might_need_to_copy() returns NULL.
 */
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, swp_entry_t entry, struct folio *folio)
{
	struct page *page;
	struct folio *swapcache;
	spinlock_t *ptl;
	pte_t *pte, new_pte, old_pte;
	bool hwpoisoned = false;
	int ret = 1;

	/*
	 * If the folio is removed from swap cache by others, continue to
	 * unuse other PTEs. try_to_unuse may try again if we missed this one.
	 */
	if (!folio_matches_swap_entry(folio, entry))
		return 0;

	/* Remember the swap cache folio: @folio may become a KSM copy below. */
	swapcache = folio;
	folio = ksm_might_need_to_copy(folio, vma, addr);
	if (unlikely(!folio))
		return -ENOMEM;
	else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
		hwpoisoned = true;
		folio = swapcache;
	}

	page = folio_file_page(folio, swp_offset(entry));
	if (PageHWPoison(page))
		hwpoisoned = true;

	/* Re-check under the PTE lock that the slot still holds @entry. */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
						swp_entry_to_pte(entry)))) {
		ret = 0;
		goto out;
	}

	old_pte = ptep_get(pte);

	/* Poisoned or not-uptodate data: install a marker entry instead. */
	if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) {
		swp_entry_t swp_entry;

		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
		if (hwpoisoned) {
			swp_entry = make_hwpoison_entry(page);
		} else {
			swp_entry = make_poisoned_swp_entry();
		}
		new_pte = swp_entry_to_pte(swp_entry);
		ret = 0;
		goto setpte;
	}

	/*
	 * Some architectures may have to restore extra metadata to the page
	 * when reading from swap. This metadata may be indexed by swap entry
	 * so this must be called before folio_put_swap().
	 */
	arch_swap_restore(folio_swap(entry, folio), folio);

	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
	folio_get(folio);
	if (folio == swapcache) {
		rmap_t rmap_flags = RMAP_NONE;

		/*
		 * See do_swap_page(): writeback would be problematic.
		 * However, we do a folio_wait_writeback() just before this
		 * call and have the folio locked.
		 */
		VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
		if (pte_swp_exclusive(old_pte))
			rmap_flags |= RMAP_EXCLUSIVE;
		/*
		 * We currently only expect small !anon folios, which are either
		 * fully exclusive or fully shared. If we ever get large folios
		 * here, we have to be careful.
		 */
		if (!folio_test_anon(folio)) {
			VM_WARN_ON_ONCE(folio_test_large(folio));
			VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
			folio_add_new_anon_rmap(folio, vma, addr, rmap_flags);
		} else {
			folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
		}
	} else { /* ksm created a completely new copy */
		folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, vma);
	}
	/* Carry the soft-dirty and uffd-wp bits over from the swap PTE. */
	new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
	if (pte_swp_soft_dirty(old_pte))
		new_pte = pte_mksoft_dirty(new_pte);
	if (pte_swp_uffd_wp(old_pte))
		new_pte = pte_mkuffd_wp(new_pte);
setpte:
	set_pte_at(vma->vm_mm, addr, pte, new_pte);
	folio_put_swap(swapcache, folio_file_page(swapcache, swp_offset(entry)));
out:
	if (pte)
		pte_unmap_unlock(pte, ptl);
	/* A KSM copy is unlocked and put here; the caller owns @swapcache. */
	if (folio != swapcache) {
		folio_unlock(folio);
		folio_put(folio);
	}
	return ret;
}

/*
 * Walk the PTEs of [@addr, @end) below @pmd and call unuse_pte() for every
 * swap entry whose type is @type.
 *
 * Returns 0 on success, -ENOMEM when no folio could be obtained for an entry
 * whose swap table count is still positive, or a negative value returned by
 * unuse_pte().
 */
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned int type)
{
	pte_t *pte = NULL;

	do {
		struct folio *folio;
		unsigned long swp_tb;
		softleaf_t entry;
		int ret;
		pte_t ptent;

		/*
		 * pte is NULL on the first iteration and after the unmap
		 * below; in that case (re)map it for the current address,
		 * otherwise the post-increment steps to the next PTE.
		 */
		if (!pte++) {
			pte = pte_offset_map(pmd, addr);
			if (!pte)
				break;
		}

		ptent = ptep_get_lockless(pte);
		entry = softleaf_from_pte(ptent);

		if (!softleaf_is_swap(entry))
			continue;
		if (swp_type(entry) != type)
			continue;

		pte_unmap(pte);
		pte = NULL;

		folio = swap_cache_get_folio(entry);
		if (!folio) {
			struct vm_fault vmf = {
				.vma = vma,
				.address = addr,
				.real_address = addr,
				.pmd = pmd,
			};

			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
						&vmf);
		}
		if (!folio) {
			/* Skip entries whose count is already gone. */
			swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
						swp_cluster_offset(entry));
			if (swp_tb_get_count(swp_tb) <= 0)
				continue;
			return -ENOMEM;
		}

		folio_lock(folio);
		folio_wait_writeback(folio);
		ret = unuse_pte(vma, pmd, addr, entry, folio);
		if (ret < 0) {
			folio_unlock(folio);
			folio_put(folio);
			return ret;
		}

		folio_free_swap(folio);
		folio_unlock(folio);
		folio_put(folio);
	} while (addr += PAGE_SIZE, addr != end);

	if (pte)
		pte_unmap(pte);
	return 0;
}

/*
 * Call unuse_pte_range() for every pmd covering [@addr, @end) below @pud.
 * Stops at and returns the first non-zero result; returns 0 otherwise.
 */
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned int type)
{
	pmd_t *pmd;
	unsigned long next;
	int ret;

	pmd = pmd_offset(pud, addr);
	do {
		cond_resched();
		next = pmd_addr_end(addr, end);
		ret = unuse_pte_range(vma, pmd, addr, next, type);
		if (ret)
			return ret;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

/*
 * Call unuse_pmd_range() for every pud covering [@addr, @end) below @p4d,
 * skipping entries for which pud_none_or_clear_bad() is true.
 */
static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned int type)
{
	pud_t *pud;
	unsigned long next;
	int ret;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		ret = unuse_pmd_range(vma, pud, addr, next, type);
		if (ret)
			return ret;
	} while (pud++, addr = next, addr != end);
	return 0;
}

/*
 * Call unuse_pud_range() for every p4d covering [@addr, @end) below @pgd,
 * skipping entries for which p4d_none_or_clear_bad() is true.
 */
static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned int type)
{
	p4d_t *p4d;
	unsigned long next;
	int ret;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		ret = unuse_pud_range(vma, p4d, addr, next, type);
		if (ret)
			return ret;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

/*
 * Walk the page tables of @vma from vm_start to vm_end and unuse swap
 * entries of @type.  Returns 0 or the first non-zero result of the walk.
 */
static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
{
	pgd_t *pgd;
	unsigned long addr, end, next;
	int ret;

	addr = vma->vm_start;
	end = vma->vm_end;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		ret = unuse_p4d_range(vma, pgd, addr, next, type);
		if (ret)
			return ret;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/*
 * Unuse swap entries of @type in every VMA of @mm that has an anon_vma and
 * is not a hugetlb VMA, with the mmap lock held for reading.
 *
 * Returns 0, also when check_stable_address_space() is non-zero and the
 * walk is skipped, or the first non-zero result of unuse_vma().
 */
static int unuse_mm(struct mm_struct *mm, unsigned int type)
{
	struct vm_area_struct *vma;
	int ret = 0;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_read_lock(mm);
	if (check_stable_address_space(mm))
		goto unlock;
	for_each_vma(vmi, vma) {
		if (vma->anon_vma && !is_vm_hugetlb_page(vma)) {
			ret = unuse_vma(vma, type);
			if (ret)
				break;
		}

		cond_resched();
	}
unlock:
	mmap_read_unlock(mm);
	return ret;
}

/*
 * Scan swap table from current position to next entry still in use.
 * Return 0 if there are no inuse entries after prev till end of
 * the map.
 */
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
					unsigned int prev)
{
	unsigned int i;
	unsigned long swp_tb;

	/*
	 * No need for swap_lock here: we're just looking
	 * for whether an entry is in use, not modifying it; false
	 * hits are okay, and sys_swapoff() has already prevented new
	 * allocations from this area (while holding swap_lock).
	 */
	for (i = prev + 1; i < si->max; i++) {
		swp_tb = swap_table_get(__swap_offset_to_cluster(si, i),
					i % SWAPFILE_CLUSTER);
		/* An entry counts as in use when it is neither null nor bad. */
		if (!swp_tb_is_null(swp_tb) && !swp_tb_is_bad(swp_tb))
			break;
		if ((i % LATENCY_LIMIT) == 0)
			cond_resched();
	}

	if (i == si->max)
		i = 0;

	return i;
}

/*
 * try_to_unuse - drive the usage of swap area @type down to zero.
 *
 * Each pass calls shmem_unuse(), runs unuse_mm() on the mms linked on
 * init_mm.mmlist, and then scans the swap table for entries still in use,
 * calling folio_free_swap() on any folio still found in the swap cache.
 * Passes repeat while swap_usage_in_pages() is non-zero.
 *
 * Returns 0 once no pages are in use, -EINTR when a signal is pending while
 * pages remain in use, or the error returned by shmem_unuse() or unuse_mm().
 */
static int try_to_unuse(unsigned int type)
{
	struct mm_struct *prev_mm;
	struct mm_struct *mm;
	struct list_head *p;
	int retval = 0;
	struct swap_info_struct *si = swap_info[type];
	struct folio *folio;
	swp_entry_t entry;
	unsigned int i;

	if (!swap_usage_in_pages(si))
		goto success;

retry:
	retval = shmem_unuse(type);
	if (retval)
		return retval;

	/*
	 * prev_mm always holds a reference (starting with init_mm) that is
	 * dropped once the walk has moved on to the next mm.
	 */
	prev_mm = &init_mm;
	mmget(prev_mm);

	spin_lock(&mmlist_lock);
	p = &init_mm.mmlist;
	while (swap_usage_in_pages(si) &&
	       !signal_pending(current) &&
	       (p = p->next) != &init_mm.mmlist) {

		mm = list_entry(p, struct mm_struct, mmlist);
		if (!mmget_not_zero(mm))
			continue;
		spin_unlock(&mmlist_lock);
		mmput(prev_mm);
		prev_mm = mm;
		retval = unuse_mm(mm, type);
		if (retval) {
			mmput(prev_mm);
			return retval;
		}

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.
		 */
		cond_resched();
		spin_lock(&mmlist_lock);
	}
	spin_unlock(&mmlist_lock);

	mmput(prev_mm);

	i = 0;
	while (swap_usage_in_pages(si) &&
	       !signal_pending(current) &&
	       (i = find_next_to_unuse(si, i)) != 0) {

		entry = swp_entry(type, i);
		folio = swap_cache_get_folio(entry);
		if (!folio)
			continue;

		/*
		 * It is conceivable that a racing task removed this folio from
		 * swap cache just before we acquired the page lock. The folio
		 * might even be back in swap cache on another swap area. But
		 * that is okay, folio_free_swap() only removes stale folios.
		 */
		folio_lock(folio);
		folio_wait_writeback(folio);
		folio_free_swap(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	/*
	 * Let's check again to see if there are still swap entries in the map.
	 * If yes, we would need to retry the unuse logic again.
	 * Under global memory pressure, swap entries can be reinserted back
	 * into process space after the mmlist loop above passes over them.
	 *
	 * Limit the number of retries? No: when mmget_not_zero()
	 * above fails, that mm is likely to be freeing swap from
	 * exit_mmap(), which proceeds at its own independent pace;
	 * and even shmem_writeout() could have been preempted after
	 * folio_alloc_swap(), temporarily hiding that swap.  It's easy
	 * and robust (though cpu-intensive) just to keep retrying.
	 */
	if (swap_usage_in_pages(si)) {
		if (!signal_pending(current))
			goto retry;
		return -EINTR;
	}

success:
	/*
	 * Make sure that further cleanups after try_to_unuse() returns happen
	 * after swap_range_free() reduces si->inuse_pages to 0.
	 */
	smp_mb();
	return 0;
}

/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
*/ static void drain_mmlist(void) { struct list_head *p, *next; unsigned int type; for (type = 0; type < nr_swapfiles; type++) if (swap_usage_in_pages(swap_info[type])) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) list_del_init(p); spin_unlock(&mmlist_lock); } /* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis, struct file *swap_file) { while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { struct rb_node *rb = sis->swap_extent_root.rb_node; struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); rb_erase(rb, &sis->swap_extent_root); kfree(se); } if (sis->flags & SWP_ACTIVATED) { struct address_space *mapping = swap_file->f_mapping; sis->flags &= ~SWP_ACTIVATED; if (mapping->a_ops->swap_deactivate) mapping->a_ops->swap_deactivate(swap_file); } } /* * Add a block range (and the corresponding page range) into this swapdev's * extent tree. * * This function rather assumes that it is called in ascending page order. */ int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; struct swap_extent *se; struct swap_extent *new_se; /* * place the new node at the right most since the * function is called in ascending page order. */ while (*link) { parent = *link; link = &parent->rb_right; } if (parent) { se = rb_entry(parent, struct swap_extent, rb_node); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { /* Merge it */ se->nr_pages += nr_pages; return 0; } } /* No merge, insert a new extent. 
*/ new_se = kmalloc_obj(*se); if (new_se == NULL) return -ENOMEM; new_se->start_page = start_page; new_se->nr_pages = nr_pages; new_se->start_block = start_block; rb_link_node(&new_se->rb_node, parent, link); rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); return 1; } EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages * onto a contiguous range of disk blocks. A rbtree of swap extents is * built at swapon time and is then used at swap_writepage/swap_read_folio * time for locating where on disk a page belongs. * * If the swapfile is an S_ISBLK block device, a single extent is installed. * This is done so that the main operating code can treat S_ISBLK and S_ISREG * swap files identically. * * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK * swapfiles are handled *identically* after swapon time. * * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray * blocks are found which do not fall within the PAGE_SIZE alignment * requirements, they are simply tossed out - we will never use those blocks * for swapping. * * For all swap devices we set S_SWAPFILE across the life of the swapon. This * prevents users from writing to the swap device, which will corrupt memory. * * The amount of disk space which a single swap extent represents varies. * Typically it is in the 1-4 megabyte range. So we can have hundreds of * extents in the rbtree. - akpm. 
*/ static int setup_swap_extents(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; int ret; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; return ret; } if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); if (ret < 0) return ret; sis->flags |= SWP_ACTIVATED; if ((sis->flags & SWP_FS_OPS) && sio_pool_init() != 0) { destroy_swap_extents(sis, swap_file); return -ENOMEM; } return ret; } return generic_swapfile_activate(sis, swap_file, span); } static void _enable_swap_info(struct swap_info_struct *si) { atomic_long_add(si->pages, &nr_swap_pages); total_swap_pages += si->pages; assert_spin_locked(&swap_lock); plist_add(&si->list, &swap_active_head); /* Add back to available list */ add_to_avail_list(si, true); } /* * Called after the swap device is ready, resurrect its percpu ref, it's now * safe to reference it. Add it to the list to expose it to the allocator. */ static void enable_swap_info(struct swap_info_struct *si) { percpu_ref_resurrect(&si->users); spin_lock(&swap_lock); spin_lock(&si->lock); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); } static void reinsert_swap_info(struct swap_info_struct *si) { spin_lock(&swap_lock); spin_lock(&si->lock); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); } /* * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range * see the updated flags, so there will be no more allocations. 
*/
static void wait_for_allocation(struct swap_info_struct *si)
{
	unsigned long offset;
	unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
	struct swap_cluster_info *ci;

	BUG_ON(si->flags & SWP_WRITEOK);

	/* Take and immediately drop the lock of every cluster in turn. */
	for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
		ci = swap_cluster_lock(si, offset);
		swap_cluster_unlock(ci);
	}
}

/*
 * Free the cluster array @cluster_info that covers @maxpages pages,
 * including any swap table still attached to a cluster.  A NULL
 * @cluster_info is accepted and ignored.
 */
static void free_swap_cluster_info(struct swap_cluster_info *cluster_info,
				   unsigned long maxpages)
{
	struct swap_cluster_info *ci;
	int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);

	if (!cluster_info)
		return;
	for (i = 0; i < nr_clusters; i++) {
		ci = cluster_info + i;
		/* Cluster with bad marks count will have a remaining table */
		spin_lock(&ci->lock);
		if (rcu_dereference_protected(ci->table, true)) {
			swap_cluster_assert_empty(ci, 0, SWAPFILE_CLUSTER, true);
			swap_cluster_free_table(ci);
		}
		spin_unlock(&ci->lock);
	}
	kvfree(cluster_info);
}

/*
 * Called after swap device's reference count is dead, so
 * neither scan nor allocation will use it.
 */
static void flush_percpu_swap_cluster(struct swap_info_struct *si)
{
	int cpu, i;
	struct swap_info_struct **pcp_si;

	for_each_possible_cpu(cpu) {
		pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu);
		/*
		 * Invalidate the percpu swap cluster cache, si->users
		 * is dead, so no new user will point to it, just flush
		 * any existing user.
		 */
		for (i = 0; i < SWAP_NR_ORDERS; i++)
			cmpxchg(&pcp_si[i], si, NULL);
	}
}

/*
 * swapoff - stop swapping to the file or device named by @specialfile.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, the error from
 * opening @specialfile, -EINVAL when no writable active swap area uses that
 * file's mapping, -ENOMEM when security_vm_enough_memory_mm() refuses, or
 * the error from try_to_unuse() (in which case the area is re-inserted).
 */
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned long *zeromap;
	struct swap_cluster_info *cluster_info;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	unsigned int maxpages;
	int err, found = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(!current->mm);

	CLASS(filename, pathname)(specialfile);
	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
	if (IS_ERR(victim))
		return PTR_ERR(victim);

	/* Identify the swap area by the address_space of the opened file. */
	mapping = victim->f_mapping;
	spin_lock(&swap_lock);
	plist_for_each_entry(p, &swap_active_head, list) {
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping) {
				found = 1;
				break;
			}
		}
	}
	if (!found) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	if (!security_vm_enough_memory_mm(current->mm, p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	/* Unlink the area from both lists and take it out of the totals. */
	spin_lock(&p->lock);
	del_from_avail_list(p, true);
	plist_del(&p->list, &swap_active_head);
	atomic_long_sub(p->pages, &nr_swap_pages);
	total_swap_pages -= p->pages;
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);

	wait_for_allocation(p);

	set_current_oom_origin();
	err = try_to_unuse(p->type);
	clear_current_oom_origin();

	if (err) {
		/* re-insert swap space back into swap_list */
		reinsert_swap_info(p);
		goto out_dput;
	}

	/*
	 * Wait for swap operations protected by get/put_swap_device()
	 * to complete.  Because of synchronize_rcu() here, all swap
	 * operations protected by RCU reader side lock (including any
	 * spinlock) will be waited too.  This makes it easy to
	 * prevent folio_test_swapcache() and the following swap cache
	 * operations from racing with swapoff.
	 */
	percpu_ref_kill(&p->users);
	synchronize_rcu();
	wait_for_completion(&p->comp);

	flush_work(&p->discard_work);
	flush_work(&p->reclaim_work);
	flush_percpu_swap_cluster(p);

	destroy_swap_extents(p, p->swap_file);

	if (!(p->flags & SWP_SOLIDSTATE))
		atomic_dec(&nr_rotate_swap);

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	drain_mmlist();

	/* Detach the resources under the locks; they are freed after unlock. */
	swap_file = p->swap_file;
	p->swap_file = NULL;
	zeromap = p->zeromap;
	p->zeromap = NULL;
	maxpages = p->max;
	cluster_info = p->cluster_info;
	p->max = 0;
	p->cluster_info = NULL;
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
	arch_swap_invalidate_area(p->type);
	zswap_swapoff(p->type);
	mutex_unlock(&swapon_mutex);
	kfree(p->global_cluster);
	p->global_cluster = NULL;
	kvfree(zeromap);
	free_swap_cluster_info(cluster_info, maxpages);
	/* Destroy swap account information */
	swap_cgroup_swapoff(p->type);

	inode = mapping->host;

	inode_lock(inode);
	inode->i_flags &= ~S_SWAPFILE;
	inode_unlock(inode);
	filp_close(swap_file, NULL);

	/*
	 * Clear the SWP_USED flag after all resources are freed so that swapon
	 * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
	 * not hold p->lock after we cleared its SWP_WRITEOK.
	 */
	spin_lock(&swap_lock);
	p->flags = 0;
	spin_unlock(&swap_lock);

	err = 0;
	/* Signal pollers of /proc/swaps that the set of areas changed. */
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

out_dput:
	filp_close(victim, NULL);
	return err;
}

#ifdef CONFIG_PROC_FS
/*
 * Poll handler for the swaps proc file: always readable; additionally
 * reports EPOLLERR | EPOLLPRI once when proc_poll_event differs from the
 * value recorded in the seq_file.
 */
static __poll_t swaps_poll(struct file *file, poll_table *wait)
{
	struct seq_file *seq = file->private_data;

	poll_wait(file, &proc_poll_wait, wait);

	if (seq->poll_event != atomic_read(&proc_poll_event)) {
		seq->poll_event = atomic_read(&proc_poll_event);
		return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
	}

	return EPOLLIN | EPOLLRDNORM;
}

/* iterator */
/*
 * Takes swapon_mutex (released in swap_stop()).  Position 0 yields the
 * header token; position N yields the N-th swap_info that has a swap_file.
 */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
	struct swap_info_struct *si;
	int type;
	loff_t l = *pos;

	mutex_lock(&swapon_mutex);

	if (!l)
		return SEQ_START_TOKEN;

	for (type = 0; (si = swap_type_to_info(type)); type++) {
		if (!(si->swap_file))
			continue;
		if (!--l)
			return si;
	}

	return NULL;
}

/*
 * Advance to the next swap_info after @v that has a swap_file, or return
 * NULL when swap_type_to_info() runs out of entries.
 */
static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
	struct swap_info_struct *si = v;
	int type;

	if (v == SEQ_START_TOKEN)
		type = 0;
	else
		type = si->type + 1;

	++(*pos);
	for (; (si = swap_type_to_info(type)); type++) {
		if (!(si->swap_file))
			continue;
		return si;
	}

	return NULL;
}

/* Drops the swapon_mutex taken in swap_start(). */
static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}

/*
 * Emit either the header line (for SEQ_START_TOKEN) or one line describing
 * the swap area @v: path, type, size, used amount and priority.
 */
static int swap_show(struct seq_file *swap, void *v)
{
	struct swap_info_struct *si = v;
	struct file *file;
	int len;
	unsigned long bytes, inuse;

	if (si == SEQ_START_TOKEN) {
		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
		return 0;
	}

	bytes = K(si->pages);
	inuse = K(swap_usage_in_pages(si));

	file = si->swap_file;
	len = seq_file_path(swap, file, " \t\n\\");
	/* Pad the path to 40 columns; short numbers get an extra tab. */
	seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
			len < 40 ? 40 - len : 1, " ",
			S_ISBLK(file_inode(file)->i_mode) ?
				"partition" : "file\t",
			bytes, bytes < 10000000 ? "\t" : "",
			inuse, inuse < 10000000 ? "\t" : "",
			si->prio);
	return 0;
}

static const struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};

/* Open the seq_file and record the current proc_poll_event for polling. */
static int swaps_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int ret;

	ret = seq_open(file, &swaps_op);
	if (ret)
		return ret;

	seq = file->private_data;
	seq->poll_event = atomic_read(&proc_poll_event);
	return 0;
}

static const struct proc_ops swaps_proc_ops = {
	.proc_flags	= PROC_ENTRY_PERMANENT,
	.proc_open	= swaps_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
	.proc_poll	= swaps_poll,
};

/* Register the "swaps" proc entry. */
static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &swaps_proc_ops);
	return 0;
}
__initcall(procswaps_init);
#endif /* CONFIG_PROC_FS */

#ifdef MAX_SWAPFILES_CHECK
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
#endif

/*
 * Allocate a swap_info_struct, or reuse the first registered one that does
 * not have SWP_USED set.
 *
 * Returns the struct with flags set to SWP_USED, ERR_PTR(-ENOMEM) when an
 * allocation fails, or ERR_PTR(-EPERM) when MAX_SWAPFILES slots are already
 * in use.
 */
static struct swap_info_struct *alloc_swap_info(void)
{
	struct swap_info_struct *p;
	struct swap_info_struct *defer = NULL;
	unsigned int type;

	p = kvzalloc_obj(struct swap_info_struct);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (percpu_ref_init(&p->users, swap_users_ref_free,
			    PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
		kvfree(p);
		return ERR_PTR(-ENOMEM);
	}

	spin_lock(&swap_lock);
	/* Look for a registered slot that is currently unused. */
	for (type = 0; type < nr_swapfiles; type++) {
		if (!(swap_info[type]->flags & SWP_USED))
			break;
	}
	if (type >= MAX_SWAPFILES) {
		spin_unlock(&swap_lock);
		percpu_ref_exit(&p->users);
		kvfree(p);
		return ERR_PTR(-EPERM);
	}
	if (type >= nr_swapfiles) {
		p->type = type;
		/*
		 * Publish the swap_info_struct after initializing it.
		 * Note that kvzalloc() above zeroes all its fields.
		 */
		smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
		nr_swapfiles++;
	} else {
		/* Reuse the old slot; the fresh allocation is freed below. */
		defer = p;
		p = swap_info[type];
		/*
		 * Do not memset this entry: a racing procfs swap_next()
		 * would be relying on p->type to remain valid.
		 */
	}
	p->swap_extent_root = RB_ROOT;
	plist_node_init(&p->list, 0);
	plist_node_init(&p->avail_list, 0);
	p->flags = SWP_USED;
	spin_unlock(&swap_lock);
	if (defer) {
		percpu_ref_exit(&defer->users);
		kvfree(defer);
	}
	spin_lock_init(&p->lock);
	atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
	init_completion(&p->comp);

	return p;
}

/*
 * Record the block device backing @inode in @si.  Returns -EINVAL for a
 * zoned block device and 0 otherwise.
 */
static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
{
	if (S_ISBLK(inode->i_mode)) {
		si->bdev = I_BDEV(inode);
		/*
		 * Zoned block devices contain zones that have a sequential
		 * write only restriction.  Hence zoned block devices are not
		 * suitable for swapping.  Disallow them here.
		 */
		if (bdev_is_zoned(si->bdev))
			return -EINVAL;
		si->flags |= SWP_BLKDEV;
	} else if (S_ISREG(inode->i_mode)) {
		si->bdev = inode->i_sb->s_bdev;
	}

	return 0;
}


/*
 * Find out how many pages are allowed for a single swap device. There
 * are two limiting factors:
 * 1) the number of bits for the swap offset in the swp_entry_t type, and
 * 2) the number of bits in the swap pte, as defined by the different
 * architectures.
 *
 * In order to find the largest possible bit mask, a swap entry with
 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
 * decoded to a swp_entry_t again, and finally the swap offset is
 * extracted.
 *
 * This will mask all the bits from the initial ~0UL mask that can't
 * be encoded in either the swp_entry_t or the architecture definition
 * of a swap pte.
 */
unsigned long generic_max_swapfile_size(void)
{
	swp_entry_t entry = swp_entry(0, ~0UL);
	const pte_t pte = softleaf_to_pte(entry);

	/*
	 * Since the PTE can be an invalid softleaf entry (e.g. the none PTE),
	 * we need to do this manually.
	 */
	entry = __pte_to_swp_entry(pte);
	entry = swp_entry(__swp_type(entry), __swp_offset(entry));

	/* The largest encodable offset plus one is the page count limit. */
	return swp_offset(entry) + 1;
}

/* Can be overridden by an architecture for additional checks.
*/ __weak unsigned long arch_max_swapfile_size(void) { return generic_max_swapfile_size(); } static unsigned long read_swap_header(struct swap_info_struct *si, union swap_header *swap_header, struct inode *inode) { int i; unsigned long maxpages; unsigned long swapfilepages; unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { pr_err("Unable to find swap-space signature\n"); return 0; } /* swap partition endianness hack... */ if (swab32(swap_header->info.version) == 1) { swab32s(&swap_header->info.version); swab32s(&swap_header->info.last_page); swab32s(&swap_header->info.nr_badpages); if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; for (i = 0; i < swap_header->info.nr_badpages; i++) swab32s(&swap_header->info.badpages[i]); } /* Check the swap header's sub-version */ if (swap_header->info.version != 1) { pr_warn("Unable to handle swap header version %d\n", swap_header->info.version); return 0; } maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); return 0; } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", K(maxpages), K(last_page)); } if (maxpages > last_page) { maxpages = last_page + 1; /* p->max is an unsigned int: don't overflow it */ if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } if (!maxpages) return 0; swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { pr_warn("Swap area shorter than signature indicates\n"); return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) return 0; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; return maxpages; } static int setup_swap_clusters_info(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; int err = -ENOMEM; unsigned long i; 
cluster_info = kvzalloc_objs(*cluster_info, nr_clusters); if (!cluster_info) goto err; for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); if (!(si->flags & SWP_SOLIDSTATE)) { si->global_cluster = kmalloc_obj(*si->global_cluster); if (!si->global_cluster) goto err; for (i = 0; i < SWAP_NR_ORDERS; i++) si->global_cluster->next[i] = SWAP_ENTRY_INVALID; spin_lock_init(&si->global_cluster_lock); } /* * Mark unusable pages (header page, bad pages, and the EOF part of * the last cluster) as unavailable. The clusters aren't marked free * yet, so no list operations are involved yet. */ err = swap_cluster_setup_bad_slot(si, cluster_info, 0, false); if (err) goto err; for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (!page_nr || page_nr > swap_header->info.last_page) { pr_warn("Bad slot offset is out of border: %d (last_page: %d)\n", page_nr, swap_header->info.last_page); err = -EINVAL; goto err; } err = swap_cluster_setup_bad_slot(si, cluster_info, page_nr, false); if (err) goto err; } for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { err = swap_cluster_setup_bad_slot(si, cluster_info, i, true); if (err) goto err; } INIT_LIST_HEAD(&si->free_clusters); INIT_LIST_HEAD(&si->full_clusters); INIT_LIST_HEAD(&si->discard_clusters); for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); } for (i = 0; i < nr_clusters; i++) { struct swap_cluster_info *ci = &cluster_info[i]; if (ci->count) { ci->flags = CLUSTER_FLAG_NONFULL; list_add_tail(&ci->list, &si->nonfull_clusters[0]); } else { ci->flags = CLUSTER_FLAG_FREE; list_add_tail(&ci->list, &si->free_clusters); } } si->cluster_info = cluster_info; return 0; err: free_swap_cluster_info(cluster_info, maxpages); return err; } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *si; struct file *swap_file = NULL; struct 
address_space *mapping; struct dentry *dentry; int prio; int error; union swap_header *swap_header; int nr_extents; sector_t span; unsigned long maxpages; struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; if (swap_flags & ~SWAP_FLAGS_VALID) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * Allocate or reuse existing !SWP_USED swap_info. The returned * si will stay in a dying status, so nothing will access its content * until enable_swap_info resurrects its percpu ref and expose it. */ si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); INIT_WORK(&si->discard_work, swap_discard_work); INIT_WORK(&si->reclaim_work, swap_reclaim_work); CLASS(filename, name)(specialfile); swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; goto bad_swap; } mapping = swap_file->f_mapping; dentry = swap_file->f_path.dentry; inode = mapping->host; error = claim_swapfile(si, inode); if (unlikely(error)) goto bad_swap; inode_lock(inode); if (d_unlinked(dentry) || cant_mount(dentry)) { error = -ENOENT; goto bad_swap_unlock_inode; } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; } /* * The swap subsystem needs a major overhaul to support this. * It doesn't work yet so just disable it for now. */ if (mapping_min_folio_order(mapping) > 0) { error = -EINVAL; goto bad_swap_unlock_inode; } /* * Read the swap header. 
*/ if (!mapping->a_ops->read_folio) { error = -EINVAL; goto bad_swap_unlock_inode; } folio = read_mapping_folio(mapping, 0, swap_file); if (IS_ERR(folio)) { error = PTR_ERR(folio); goto bad_swap_unlock_inode; } swap_header = kmap_local_folio(folio, 0); maxpages = read_swap_header(si, swap_header, inode); if (unlikely(!maxpages)) { error = -EINVAL; goto bad_swap_unlock_inode; } si->max = maxpages; si->pages = maxpages - 1; nr_extents = setup_swap_extents(si, swap_file, &span); if (nr_extents < 0) { error = nr_extents; goto bad_swap_unlock_inode; } if (si->pages != si->max - 1) { pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max); error = -EINVAL; goto bad_swap_unlock_inode; } maxpages = si->max; /* Set up the swap cluster info */ error = setup_swap_clusters_info(si, swap_header, maxpages); if (error) goto bad_swap_unlock_inode; error = swap_cgroup_swapon(si->type, maxpages); if (error) goto bad_swap_unlock_inode; /* * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might * be above MAX_PAGE_ORDER incase of a large swap file. */ si->zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), GFP_KERNEL | __GFP_ZERO); if (!si->zeromap) { error = -ENOMEM; goto bad_swap_unlock_inode; } if (si->bdev && bdev_stable_writes(si->bdev)) si->flags |= SWP_STABLE_WRITES; if (si->bdev && bdev_synchronous(si->bdev)) si->flags |= SWP_SYNCHRONOUS_IO; if (si->bdev && bdev_nonrot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } if ((swap_flags & SWAP_FLAG_DISCARD) && si->bdev && bdev_max_discard_sectors(si->bdev)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in * order to sustain backward compatibility with older * swapon(8) releases. 
*/ si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | SWP_PAGE_DISCARD); /* * By flagging sys_swapon, a sysadmin can tell us to * either do single-time area discards only, or to just * perform discards for released swap page-clusters. * Now it's time to adjust the p->flags accordingly. */ if (swap_flags & SWAP_FLAG_DISCARD_ONCE) si->flags &= ~SWP_PAGE_DISCARD; else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) si->flags &= ~SWP_AREA_DISCARD; /* issue a swapon-time discard if it's still required */ if (si->flags & SWP_AREA_DISCARD) { int err = discard_swap(si); if (unlikely(err)) pr_err("swapon: discard_swap(%p): %d\n", si, err); } } error = zswap_swapon(si->type, maxpages); if (error) goto bad_swap_unlock_inode; /* * Flush any pending IO and dirty mappings before we start using this * swap device. */ inode->i_flags |= S_SWAPFILE; error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; goto free_swap_zswap; } mutex_lock(&swapon_mutex); prio = DEF_SWAP_PRIO; if (swap_flags & SWAP_FLAG_PREFER) prio = swap_flags & SWAP_FLAG_PRIO_MASK; /* * The plist prio is negated because plist ordering is * low-to-high, while swap ordering is high-to-low */ si->prio = prio; si->list.prio = -si->prio; si->avail_list.prio = -si->prio; si->swap_file = swap_file; /* Sets SWP_WRITEOK, resurrect the percpu ref, expose the swap device */ enable_swap_info(si); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", K(si->pages), name->name, si->prio, nr_extents, K((unsigned long long)span), (si->flags & SWP_SOLIDSTATE) ? "SS" : "", (si->flags & SWP_DISCARDABLE) ? "D" : "", (si->flags & SWP_AREA_DISCARD) ? "s" : "", (si->flags & SWP_PAGE_DISCARD) ? 
"c" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); error = 0; goto out; free_swap_zswap: zswap_swapoff(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: kfree(si->global_cluster); si->global_cluster = NULL; inode = NULL; destroy_swap_extents(si, swap_file); swap_cgroup_swapoff(si->type); free_swap_cluster_info(si->cluster_info, si->max); si->cluster_info = NULL; kvfree(si->zeromap); si->zeromap = NULL; /* * Clear the SWP_USED flag after all resources are freed so * alloc_swap_info can reuse this si safely. */ spin_lock(&swap_lock); si->flags = 0; spin_unlock(&swap_lock); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) filp_close(swap_file, NULL); out: if (!IS_ERR_OR_NULL(folio)) folio_release_kmap(folio, swap_header); if (inode) inode_unlock(inode); return error; } void si_swapinfo(struct sysinfo *val) { unsigned int type; unsigned long nr_to_be_unused = 0; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) nr_to_be_unused += swap_usage_in_pages(si); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); } /* * swap_dup_entry_direct() - Increase reference count of a swap entry by one. * @entry: first swap entry from which we want to increase the refcount. * * Returns 0 for success, or -ENOMEM if the extend table is required * but could not be atomically allocated. Returns -EINVAL if the swap * entry is invalid, which might occur if a page table entry has got * corrupted. * * Context: Caller must ensure there is no race condition on the reference * owner. e.g., locking the PTL of a PTE containing the entry being increased. * Also the swap entry must have a count >= 1. Otherwise folio_dup_swap should * be used. 
*/ int swap_dup_entry_direct(swp_entry_t entry) { struct swap_info_struct *si; si = swap_entry_to_info(entry); if (WARN_ON_ONCE(!si)) { pr_err("%s%08lx\n", Bad_file, entry.val); return -EINVAL; } /* * The caller must be increasing the swap count from a direct * reference of the swap slot (e.g. a swap entry in page table). * So the swap count must be >= 1. */ VM_WARN_ON_ONCE(!swap_entry_swapped(si, entry)); return swap_dup_entries_cluster(si, swp_offset(entry), 1); } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) static bool __has_usable_swap(void) { return !plist_head_empty(&swap_active_head); } void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si; if (!(gfp & __GFP_IO)) return; if (!__has_usable_swap()) return; if (!blk_cgroup_congested()) return; /* * We've already scheduled a throttle, avoid taking the global swap * lock. */ if (current->throttle_disk) return; spin_lock(&swap_avail_lock); plist_for_each_entry(si, &swap_avail_head, avail_list) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; } } spin_unlock(&swap_avail_lock); } #endif static int __init swapfile_init(void) { swapfile_maximum_size = arch_max_swapfile_size(); /* * Once a cluster is freed, it's swap table content is read * only, and all swap cache readers (swap_cache_*) verifies * the content before use. So it's safe to use RCU slab here. */ if (!SWP_TABLE_USE_PAGE) swap_table_cachep = kmem_cache_create("swap_table", sizeof(struct swap_table), 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); #ifdef CONFIG_MIGRATION if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) swap_migration_ad_supported = true; #endif /* CONFIG_MIGRATION */ return 0; } subsys_initcall(swapfile_init); |
| 19 4 12 5 5 5 2 17 20 1 1 1 17 14 9 5 5 4 4 1 5 5 34 33 35 8 27 1 1 30 31 20 2 2 17 2 17 1 2 2 4 14 17 5 10 2 5 1 6 11 12 12 2 2 20 2 1 1 1 4 12 4 11 1 1 7 1 1 1 1 3 11 10 5 5 1 3 4 2 4 9 3 3 4 12 12 11 5 6 10 5 5 11 10 1 10 8 8 8 35 25 25 27 23 32 32 3 12 15 4 5 5 3 1 1 2 18 18 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 14 1 1 1 2 1 2 1 1 1 1 1 1 1 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 
440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 
940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 | /* * af_llc.c - LLC User Interface SAPs * Description: * Functions in this module are implementation of socket based llc * communications for the Linux operating system. 
Support of llc class * one and class two is provided via SOCK_DGRAM and SOCK_STREAM * respectively. * * An llc2 connection is (mac + sap), only one llc2 sap connection * is allowed per mac. Though one sap may have multiple mac + sap * connections. * * Copyright (c) 2001 by Jay Schulist <jschlst@samba.org> * 2002-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <net/llc.h> #include <net/llc_sap.h> #include <net/llc_pdu.h> #include <net/llc_conn.h> #include <net/tcp_states.h> /* remember: uninitialized global data is zeroed because its in .bss */ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; static const struct proto_ops llc_ui_ops; static bool llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); #if 0 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) do {} while (0) #endif /* Maybe we'll add some more in the future. */ #define LLC_CMSG_PKTINFO 1 /** * llc_ui_next_link_no - return the next unused link number for a sap * @sap: Address of sap to get link number from. * * Return the next unused link number for a given sap. 
*/ static inline u16 llc_ui_next_link_no(int sap) { return llc_ui_sap_link_no_max[sap]++; } /** * llc_proto_type - return eth protocol for ARP header type * @arphrd: ARP header type. * * Given an ARP header type return the corresponding ethernet protocol. */ static inline __be16 llc_proto_type(u16 arphrd) { return htons(ETH_P_802_2); } /** * llc_ui_addr_null - determines if a address structure is null * @addr: Address to test if null. */ static inline u8 llc_ui_addr_null(struct sockaddr_llc *addr) { return !memcmp(addr, &llc_ui_addrnull, sizeof(*addr)); } /** * llc_ui_header_len - return length of llc header based on operation * @sk: Socket which contains a valid llc socket type. * @addr: Complete sockaddr_llc structure received from the user. * * Provide the length of the llc header depending on what kind of * operation the user would like to perform and the type of socket. * Returns the correct llc header length. */ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) { u8 rc = LLC_PDU_LEN_U; if (addr->sllc_test) rc = LLC_PDU_LEN_U; else if (addr->sllc_xid) /* We need to expand header to sizeof(struct llc_xid_info) * since llc_pdu_init_as_xid_cmd() sets 4,5,6 bytes of LLC header * as XID PDU. In llc_ui_sendmsg() we reserved header size and then * filled all other space with user data. If we won't reserve this * bytes, llc_pdu_init_as_xid_cmd() will overwrite user data */ rc = LLC_PDU_LEN_U_XID; else if (sk->sk_type == SOCK_STREAM) rc = LLC_PDU_LEN_I; return rc; } /** * llc_ui_send_data - send data via reliable llc2 connection * @sk: Connection the socket is using. * @skb: Data the user wishes to send. * @noblock: can we block waiting for data? * * Send data via reliable llc2 connection. * Returns 0 upon success, non-zero if action did not succeed. * * This function always consumes a reference to the skb. 
*/ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) { struct llc_sock* llc = llc_sk(sk); if (unlikely(llc_data_accept_state(llc->state) || llc->remote_busy_flag || llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); int rc; rc = llc_ui_wait_for_busy_core(sk, timeout); if (rc) { kfree_skb(skb); return rc; } } return llc_build_and_send_pkt(sk, skb); } static void llc_ui_sk_init(struct socket *sock, struct sock *sk) { sock_graft(sk, sock); sk->sk_type = sock->type; sock->ops = &llc_ui_ops; } static struct proto llc_proto = { .name = "LLC", .owner = THIS_MODULE, .obj_size = sizeof(struct llc_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, }; /** * llc_ui_create - alloc and init a new llc_ui socket * @net: network namespace (must be default network) * @sock: Socket to initialize and attach allocated sk to. * @protocol: Unused. * @kern: on behalf of kernel or userspace * * Allocate and initialize a new llc_ui socket, validate the user wants a * socket type we have available. * Returns 0 upon success, negative upon failure. */ static int llc_ui_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; int rc = -ESOCKTNOSUPPORT; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) { rc = -ENOMEM; sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern); if (sk) { rc = 0; llc_ui_sk_init(sock, sk); } } return rc; } /** * llc_ui_release - shutdown socket * @sock: Socket to release. * * Shutdown and deallocate an existing socket. 
 */
static int llc_ui_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct llc_sock *llc;

	/* No sock attached: nothing to tear down, still report success. */
	if (unlikely(sk == NULL))
		goto out;
	/* Extra reference; dropped by the sock_put() further down. */
	sock_hold(sk);
	lock_sock(sk);
	llc = llc_sk(sk);
	dprintk("%s: closing local(%02X) remote(%02X)\n", __func__,
		llc->laddr.lsap, llc->daddr.lsap);
	/* Only wait for the disconnect when llc_send_disc() returned 0. */
	if (!llc_send_disc(sk))
		llc_ui_wait_for_disc(sk, READ_ONCE(sk->sk_rcvtimeo));
	/* A socket without SOCK_ZAPPED is still attached to its SAP. */
	if (!sock_flag(sk, SOCK_ZAPPED)) {
		struct llc_sap *sap = llc->sap;

		/* Hold this for release_sock(), so that llc_backlog_rcv()
		 * could still use it.
		 */
		llc_sap_hold(sap);
		llc_sap_remove_socket(llc->sap, sk);
		release_sock(sk);
		llc_sap_put(sap);
	} else {
		release_sock(sk);
	}
	/* Drop the tracked device reference stored in llc->dev. */
	netdev_put(llc->dev, &llc->dev_tracker);
	sock_put(sk);
	sock_orphan(sk);
	sock->sk = NULL;
	llc_sk_free(sk);
out:
	return 0;
}

/**
 * llc_ui_autoport - provide a dynamically allocated SAP number
 *
 * Scan the SAP numbers from llc_ui_sap_last_autoport up to (but not
 * including) LLC_SAP_DYN_STOP in steps of two and pick the first one
 * for which llc_sap_find() finds no SAP. The scan restarts from
 * LLC_SAP_DYN_START for each of up to LLC_SAP_DYN_TRIES passes.
 * Returns: 0, upon failure, SAP number otherwise.
 */
static int llc_ui_autoport(void)
{
	struct llc_sap *sap;
	int i, tries = 0;

	while (tries < LLC_SAP_DYN_TRIES) {
		for (i = llc_ui_sap_last_autoport;
		     i < LLC_SAP_DYN_STOP; i += 2) {
			sap = llc_sap_find(i);
			if (!sap) {
				/* Free number: resume after it next time. */
				llc_ui_sap_last_autoport = i + 2;
				goto out;
			}
			/* In use: drop the reference llc_sap_find() returned. */
			llc_sap_put(sap);
		}
		llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
		tries++;
	}
	/* Every pass came up empty. */
	i = 0;
out:
	return i;
}

/**
 * llc_ui_autobind - automatically bind a socket to a sap
 * @sock: socket to bind
 * @addr: address to connect to
 *
 * Used by llc_ui_connect and llc_ui_sendmsg when the user hasn't
 * specifically used llc_ui_bind to bind to an specific address/sap
 *
 * Returns: 0 upon success, negative otherwise.
 */
static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr)
{
	struct sock *sk = sock->sk;
	struct llc_sock *llc = llc_sk(sk);
	struct net_device *dev = NULL;
	struct llc_sap *sap;
	int rc = -EINVAL;

	/* Only a socket that still has SOCK_ZAPPED set may be autobound. */
	if (!sock_flag(sk, SOCK_ZAPPED))
		goto out;
	/* An unset hardware type defaults to Ethernet; nothing else passes. */
	if (!addr->sllc_arphrd)
		addr->sllc_arphrd = ARPHRD_ETHER;
	if (addr->sllc_arphrd != ARPHRD_ETHER)
		goto out;
	rc = -ENODEV;
	if (sk->sk_bound_dev_if) {
		dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if);
		/* The bound device must match the requested hardware type. */
		if (dev && addr->sllc_arphrd != dev->type) {
			dev_put(dev);
			dev = NULL;
		}
	} else
		dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd);
	if (!dev)
		goto out;
	rc = -EUSERS;
	llc->laddr.lsap = llc_ui_autoport();
	if (!llc->laddr.lsap)
		goto out;
	rc = -EBUSY; /* some other network layer is using the sap */
	sap = llc_sap_open(llc->laddr.lsap, NULL);
	if (!sap)
		goto out;

	/* Note: We do not expect errors from this point. */
	llc->dev = dev;
	netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL);
	/* The reference now lives in llc->dev; keep "out" from dropping it. */
	dev = NULL;

	memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN);
	memcpy(&llc->addr, addr, sizeof(llc->addr));
	/* assign new connection to its SAP */
	llc_sap_add_socket(sap, sk);
	sock_reset_flag(sk, SOCK_ZAPPED);
	rc = 0;
out:
	/* dev is NULL here on success and when no device was found. */
	dev_put(dev);
	return rc;
}

/**
 * llc_ui_bind - bind a socket to a specific address.
 * @sock: Socket to bind an address to.
 * @uaddr: Address the user wants the socket bound to.
 * @addrlen: Length of the uaddr structure.
 *
 * Bind a socket to a specific address. For llc a user is able to bind to
 * a specific sap only or mac + sap.
 * If the user desires to bind to a specific mac + sap, it is possible to
 * have multiple sap connections via multiple macs.
 * Bind and autobind for that matter must enforce the correct sap usage
 * otherwise all hell will break loose.
 * Returns: 0 upon success, negative otherwise.
 */
static int llc_ui_bind(struct socket *sock, struct sockaddr_unsized *uaddr,
		       int addrlen)
{
	struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr;
	struct sock *sk = sock->sk;
	struct llc_sock *llc = llc_sk(sk);
	struct net_device *dev = NULL;
	struct llc_sap *sap;
	int rc = -EINVAL;

	lock_sock(sk);
	/* Reject sockets that are already bound and wrongly sized addresses. */
	if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr)))
		goto out;
	rc = -EAFNOSUPPORT;
	/* An unset hardware type defaults to Ethernet. */
	if (!addr->sllc_arphrd)
		addr->sllc_arphrd = ARPHRD_ETHER;
	if (unlikely(addr->sllc_family != AF_LLC || addr->sllc_arphrd != ARPHRD_ETHER))
		goto out;
	dprintk("%s: binding %02X\n", __func__, addr->sllc_sap);
	rc = -ENODEV;
	rcu_read_lock();
	if (sk->sk_bound_dev_if) {
		dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if);
		if (dev) {
			/* An all-zero MAC means "use the bound device's address". */
			if (is_zero_ether_addr(addr->sllc_mac))
				memcpy(addr->sllc_mac, dev->dev_addr,
				       IFHWADDRLEN);
			/* Type or MAC mismatch with the bound device: -EINVAL. */
			if (addr->sllc_arphrd != dev->type ||
			    !ether_addr_equal(addr->sllc_mac,
					      dev->dev_addr)) {
				rc = -EINVAL;
				dev = NULL;
			}
		}
	} else {
		dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd,
					   addr->sllc_mac);
	}
	/*
	 * Take a reference before leaving the RCU section. dev may be NULL
	 * at this point; the NULL check only follows the call.
	 */
	dev_hold(dev);
	rcu_read_unlock();
	if (!dev)
		goto out;

	/* No SAP requested: pick one dynamically. */
	if (!addr->sllc_sap) {
		rc = -EUSERS;
		addr->sllc_sap = llc_ui_autoport();
		if (!addr->sllc_sap)
			goto out;
	}
	sap = llc_sap_find(addr->sllc_sap);
	if (!sap) {
		sap = llc_sap_open(addr->sllc_sap, NULL);
		rc = -EBUSY; /* some other network layer is using the sap */
		if (!sap)
			goto out;
	} else {
		struct llc_addr laddr, daddr;
		struct sock *ask;

		memset(&laddr, 0, sizeof(laddr));
		memset(&daddr, 0, sizeof(daddr));
		/*
		 * FIXME: check if the address is multicast,
		 *	  only SOCK_DGRAM can do this.
		 */
		memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN);
		laddr.lsap = addr->sllc_sap;
		rc = -EADDRINUSE; /* mac + sap clash. */
		ask = llc_lookup_established(sap, &daddr, &laddr, &init_net);
		if (ask) {
			sock_put(ask);
			goto out_put;
		}
	}

	/* Note: We do not expect errors from this point. */
	llc->dev = dev;
	netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL);
	/* The reference now lives in llc->dev; keep "out" from dropping it. */
	dev = NULL;

	llc->laddr.lsap = addr->sllc_sap;
	memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN);
	memcpy(&llc->addr, addr, sizeof(llc->addr));
	/* assign new connection to its SAP */
	llc_sap_add_socket(sap, sk);
	sock_reset_flag(sk, SOCK_ZAPPED);
	rc = 0;
out_put:
	/* Reached on success and on the mac + sap clash. */
	llc_sap_put(sap);
out:
	dev_put(dev);
	release_sock(sk);
	return rc;
}

/**
 * llc_ui_shutdown - shutdown a connect llc2 socket.
 * @sock: Socket to shutdown.
 * @how: What part of the socket to shutdown.
 *
 * Shutdown a connected llc2 socket. Currently this function only supports
 * shutting down both sends and receives (2), we could probably make this
 * function such that a user can shutdown only half the connection but not
 * right now.
 * Returns: 0 upon success, -ENOTCONN when the socket is not in
 * TCP_ESTABLISHED, -EINVAL when @how is not 2, otherwise the error from
 * sending the disconnect or waiting for it.
 */
static int llc_ui_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	int rc = -ENOTCONN;

	lock_sock(sk);
	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
		goto out;
	rc = -EINVAL;
	if (how != 2)
		goto out;
	rc = llc_send_disc(sk);
	/* Only wait for the disconnect when sending it succeeded. */
	if (!rc)
		rc = llc_ui_wait_for_disc(sk, READ_ONCE(sk->sk_rcvtimeo));
	/* Wake up anyone sleeping in poll */
	sk->sk_state_change(sk);
out:
	release_sock(sk);
	return rc;
}

/**
 * llc_ui_connect - Connect to a remote llc2 mac + sap.
 * @sock: Socket which will be connected to the remote destination.
 * @uaddr: Remote and possibly the local address of the new connection.
 * @addrlen: Size of uaddr structure.
 * @flags: Operational flags specified by the user.
 *
 * Connect to a remote llc2 mac + sap. The caller must specify the
 * destination mac and address to connect to. If the user hasn't previously
 * called bind(2) with a smac the address of the first interface of the
 * specified arp type will be used.
 * This function will autobind if user did not previously call bind.
 * Returns: 0 upon success, negative otherwise.
 */
static int llc_ui_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
			  int addrlen, int flags)
{
	struct sock *sk = sock->sk;
	struct llc_sock *llc = llc_sk(sk);
	struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr;
	int rc = -EINVAL;

	lock_sock(sk);
	if (unlikely(addrlen != sizeof(*addr)))
		goto out;
	rc = -EAFNOSUPPORT;
	if (unlikely(addr->sllc_family != AF_LLC))
		goto out;
	/* Connections exist only for SOCK_STREAM; also -EAFNOSUPPORT. */
	if (unlikely(sk->sk_type != SOCK_STREAM))
		goto out;
	rc = -EALREADY;
	if (unlikely(sock->state == SS_CONNECTING))
		goto out;
	/* bind connection to sap if user hasn't done it. */
	if (sock_flag(sk, SOCK_ZAPPED)) {
		/* bind to sap with null dev, exclusive */
		rc = llc_ui_autobind(sock, addr);
		if (rc)
			goto out;
	}
	/* Record the peer and enter the connecting state before sending. */
	llc->daddr.lsap = addr->sllc_sap;
	memcpy(llc->daddr.mac, addr->sllc_mac, IFHWADDRLEN);
	sock->state = SS_CONNECTING;
	sk->sk_state   = TCP_SYN_SENT;
	llc->link   = llc_ui_next_link_no(llc->sap->laddr.lsap);
	rc = llc_establish_connection(sk, llc->dev->dev_addr,
				      addr->sllc_mac, addr->sllc_sap);
	if (rc) {
		dprintk("%s: llc_ui_send_conn failed :-(\n", __func__);
		/* Roll both state fields back on failure. */
		sock->state  = SS_UNCONNECTED;
		sk->sk_state = TCP_CLOSE;
		goto out;
	}

	if (sk->sk_state == TCP_SYN_SENT) {
		const long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

		/*
		 * A zero timeout, or a false return from the wait helper,
		 * leaves with rc still holding the 0 returned by
		 * llc_establish_connection() and sock->state still
		 * SS_CONNECTING.
		 * NOTE(review): confirm callers expect 0 here rather than
		 * an "in progress" error.
		 */
		if (!timeo || !llc_ui_wait_for_conn(sk, timeo))
			goto out;

		rc = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
	}

	/* The connection attempt ended with the socket closed. */
	if (sk->sk_state == TCP_CLOSE)
		goto sock_error;

	sock->state = SS_CONNECTED;
	rc = 0;
out:
	release_sock(sk);
	return rc;
sock_error:
	/* Use the pending socket error, or -ECONNABORTED when there is none. */
	rc = sock_error(sk) ? : -ECONNABORTED;
	sock->state = SS_UNCONNECTED;
	goto out;
}

/**
 * llc_ui_listen - allow a normal socket to accept incoming connections
 * @sock: Socket to allow incoming connections on.
 * @backlog: Number of connections to queue.
 *
 * Allow a normal socket to accept incoming connections.
 * Returns 0 upon success, negative otherwise.
*/ static int llc_ui_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int rc = -EINVAL; lock_sock(sk); if (unlikely(sock->state != SS_UNCONNECTED)) goto out; rc = -EOPNOTSUPP; if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EAGAIN; if (sock_flag(sk, SOCK_ZAPPED)) goto out; rc = 0; if (!(unsigned int)backlog) /* BSDism */ backlog = 1; sk->sk_max_ack_backlog = backlog; if (sk->sk_state != TCP_LISTEN) { sk->sk_ack_backlog = 0; sk->sk_state = TCP_LISTEN; } sk->sk_socket->flags |= __SO_ACCEPTCON; out: release_sock(sk); return rc; } static int llc_ui_wait_for_disc(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc = 0; add_wait_queue(sk_sleep(sk), &wait); while (1) { if (sk_wait_event(sk, &timeout, READ_ONCE(sk->sk_state) == TCP_CLOSE, &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; rc = 0; } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static bool llc_ui_wait_for_conn(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(sk_sleep(sk), &wait); while (1) { if (sk_wait_event(sk, &timeout, READ_ONCE(sk->sk_state) != TCP_SYN_SENT, &wait)) break; if (signal_pending(current) || !timeout) break; } remove_wait_queue(sk_sleep(sk), &wait); return timeout; } static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct llc_sock *llc = llc_sk(sk); int rc; add_wait_queue(sk_sleep(sk), &wait); while (1) { rc = 0; if (sk_wait_event(sk, &timeout, (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN) || (!llc_data_accept_state(llc->state) && !llc->remote_busy_flag && !llc->p_flag), &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int llc_wait_data(struct sock *sk, long timeo) { int rc; while (1) { /* * POSIX 1003.1g mandates this order. 
*/ rc = sock_error(sk); if (rc) break; rc = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) break; rc = -EAGAIN; if (!timeo) break; rc = sock_intr_errno(timeo); if (signal_pending(current)) break; rc = 0; if (sk_wait_data(sk, &timeo, NULL)) break; } return rc; } static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(skb->sk); if (llc->cmsg_flags & LLC_CMSG_PKTINFO) { struct llc_pktinfo info; memset(&info, 0, sizeof(info)); info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex; llc_pdu_decode_dsap(skb, &info.lpi_sap); llc_pdu_decode_da(skb, info.lpi_mac); put_cmsg(msg, SOL_LLC, LLC_OPT_PKTINFO, sizeof(info), &info); } } /** * llc_ui_accept - accept a new incoming connection. * @sock: Socket which connections arrive on. * @newsock: Socket to move incoming connection to. * @arg: User specified arguments * * Accept a new incoming connection. * Returns 0 upon success, negative otherwise. */ static int llc_ui_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *newsk; struct llc_sock *llc, *newllc; struct sk_buff *skb; int rc = -EOPNOTSUPP; dprintk("%s: accepting on %02X\n", __func__, llc_sk(sk)->laddr.lsap); lock_sock(sk); if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EINVAL; if (unlikely(sock->state != SS_UNCONNECTED || sk->sk_state != TCP_LISTEN)) goto out; /* wait for a connection to arrive. */ if (skb_queue_empty(&sk->sk_receive_queue)) { rc = llc_wait_data(sk, READ_ONCE(sk->sk_rcvtimeo)); if (rc) goto out; } dprintk("%s: got a new connection on %02X\n", __func__, llc_sk(sk)->laddr.lsap); skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) goto frees; rc = 0; newsk = skb->sk; /* attach connection to a new socket. 
*/ llc_ui_sk_init(newsock, newsk); sock_reset_flag(newsk, SOCK_ZAPPED); newsk->sk_state = TCP_ESTABLISHED; newsock->state = SS_CONNECTED; llc = llc_sk(sk); newllc = llc_sk(newsk); memcpy(&newllc->addr, &llc->addr, sizeof(newllc->addr)); newllc->link = llc_ui_next_link_no(newllc->laddr.lsap); /* put original socket back into a clean listen state. */ sk->sk_state = TCP_LISTEN; sk_acceptq_removed(sk); dprintk("%s: ok success on %02X, client on %02X\n", __func__, llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap); frees: kfree_skb(skb); out: release_sock(sk); return rc; } /** * llc_ui_recvmsg - copy received data to the socket user. * @sock: Socket to copy data from. * @msg: Various user space related information. * @len: Size of user buffer. * @flags: User specified flags. * * Copy received data to the socket user. * Returns non-negative upon success, negative otherwise. */ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { DECLARE_SOCKADDR(struct sockaddr_llc *, uaddr, msg->msg_name); const int nonblock = flags & MSG_DONTWAIT; struct sk_buff *skb = NULL; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); size_t copied = 0; u32 peek_seq = 0; u32 *seq, skb_len; unsigned long used; int target; /* Read at least this many bytes */ long timeo; lock_sock(sk); copied = -ENOTCONN; if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN)) goto out; timeo = sock_rcvtimeo(sk, nonblock); seq = &llc->copied_seq; if (flags & MSG_PEEK) { peek_seq = llc->copied_seq; seq = &peek_seq; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); copied = 0; do { u32 offset; /* * We need to check signals first, to get correct SIGURG * handling. FIXME: Need to check this doesn't impact 1003.1g * and move it down to the bottom of the loop */ if (signal_pending(current)) { if (copied) break; copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } /* Next get a buffer. 
*/ skb = skb_peek(&sk->sk_receive_queue); if (skb) { offset = *seq; goto found_ok_skb; } /* Well, if we have backlog, try to process it now yet. */ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || (flags & MSG_PEEK)) break; } else { if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) { if (!sock_flag(sk, SOCK_DONE)) { /* * This occurs when user tries to read * from never connected socket. */ copied = -ENOTCONN; break; } break; } if (!timeo) { copied = -EAGAIN; break; } } if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); lock_sock(sk); } else sk_wait_data(sk, &timeo, NULL); if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) { net_dbg_ratelimited("LLC(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); peek_seq = llc->copied_seq; } continue; found_ok_skb: skb_len = skb->len; /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) used = len; if (!(flags & MSG_TRUNC)) { int rc = skb_copy_datagram_msg(skb, offset, msg, used); if (rc) { /* Exception. Bailout! 
*/ if (!copied) copied = -EFAULT; break; } } *seq += used; copied += used; len -= used; /* For non stream protcols we get one packet per recvmsg call */ if (sk->sk_type != SOCK_STREAM) goto copy_uaddr; /* Partial read */ if (used + offset < skb_len) continue; if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } } while (len > 0); out: release_sock(sk); return copied; copy_uaddr: if (uaddr != NULL && skb != NULL) { memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); msg->msg_namelen = sizeof(*uaddr); } if (llc_sk(sk)->cmsg_flags) llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } goto out; } /** * llc_ui_sendmsg - Transmit data provided by the socket user. * @sock: Socket to transmit data from. * @msg: Various user related information. * @len: Length of data to transmit. * * Transmit data provided by the socket user. * Returns non-negative upon success, negative otherwise. */ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int flags = msg->msg_flags; int noblock = flags & MSG_DONTWAIT; int rc = -EINVAL, copied = 0, hdrlen, hh_len; struct sk_buff *skb = NULL; struct net_device *dev; size_t size = 0; dprintk("%s: sending from %02X to %02X\n", __func__, llc->laddr.lsap, llc->daddr.lsap); lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) goto out; } else { if (llc_ui_addr_null(&llc->addr)) goto out; addr = &llc->addr; } /* must bind connection to sap if user hasn't done it. */ if (sock_flag(sk, SOCK_ZAPPED)) { /* bind to sap with null dev, exclusive. 
*/ rc = llc_ui_autobind(sock, addr); if (rc) goto out; } dev = llc->dev; hh_len = LL_RESERVED_SPACE(dev); hdrlen = llc_ui_header_len(sk, addr); size = hdrlen + len; size = min_t(size_t, size, READ_ONCE(dev->mtu)); copied = size - hdrlen; rc = -EINVAL; if (copied < 0) goto out; release_sock(sk); skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc); lock_sock(sk); if (!skb) goto out; if (sock_flag(sk, SOCK_ZAPPED) || llc->dev != dev || hdrlen != llc_ui_header_len(sk, addr) || hh_len != LL_RESERVED_SPACE(dev) || size > READ_ONCE(dev->mtu)) goto out; skb->dev = dev; skb->protocol = llc_proto_type(addr->sllc_arphrd); skb_reserve(skb, hh_len + hdrlen); rc = memcpy_from_msg(skb_put(skb, copied), msg, copied); if (rc) goto out; if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } if (addr->sllc_test) { llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } if (addr->sllc_xid) { llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } rc = -ENOPROTOOPT; if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua)) goto out; rc = llc_ui_send_data(sk, skb, noblock); skb = NULL; out: kfree_skb(skb); if (rc) dprintk("%s: failed sending from %02X to %02X: %d\n", __func__, llc->laddr.lsap, llc->daddr.lsap, rc); release_sock(sk); return rc ? : copied; } /** * llc_ui_getname - return the address info of a socket * @sock: Socket to get address of. * @uaddr: Address structure to return information. * @peer: Does user want local or remote address information. * * Return the address information of a socket. 
*/
static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct llc_sock *llc = llc_sk(sk);
	struct sockaddr_llc name;
	int rc;

	/* Start from an all-zero address; fields not set below stay zero. */
	memset(&name, 0, sizeof(name));

	lock_sock(sk);

	/* A socket that is still SOCK_ZAPPED has no name to report. */
	if (sock_flag(sk, SOCK_ZAPPED)) {
		rc = -EBADF;
		goto unlock;
	}

	if (peer) {
		/* The remote address is only reported while established. */
		if (sk->sk_state != TCP_ESTABLISHED) {
			rc = -ENOTCONN;
			goto unlock;
		}
		if (llc->dev)
			name.sllc_arphrd = llc->dev->type;
		name.sllc_sap = llc->daddr.lsap;
		memcpy(&name.sllc_mac, &llc->daddr.mac, IFHWADDRLEN);
	} else {
		/* The local address needs a SAP to take the lsap from. */
		if (!llc->sap) {
			rc = -EINVAL;
			goto unlock;
		}
		name.sllc_sap = llc->sap->laddr.lsap;
		if (llc->dev) {
			name.sllc_arphrd = llc->dev->type;
			memcpy(&name.sllc_mac, llc->dev->dev_addr,
			       IFHWADDRLEN);
		}
	}

	name.sllc_family = AF_LLC;
	memcpy(uaddr, &name, sizeof(name));
	/* On success the return value is the size of the address written. */
	rc = sizeof(name);

unlock:
	release_sock(sk);
	return rc;
}

/**
 *	llc_ui_ioctl - io controls for PF_LLC
 *	@sock: Socket the ioctl was issued on (not examined).
 *	@cmd: Requested command (not examined).
 *	@arg: Optional argument for @cmd (not examined).
 *
 *	No ioctl is handled here: the function unconditionally returns
 *	-ENOIOCTLCMD.
 */
static int llc_ui_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	return -ENOIOCTLCMD;
}

/**
 *	llc_ui_setsockopt - set various connection specific parameters.
 *	@sock: Socket to set options on.
 *	@level: Socket level user is requesting operations on.
 *	@optname: Operation name.
 *	@optval: User provided operation data.
 *	@optlen: Length of optval.
 *
 *	Set various connection specific parameters.
*/
static int llc_ui_setsockopt(struct socket *sock, int level, int optname,
			     sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct llc_sock *llc = llc_sk(sk);
	unsigned int value;
	int rc = -EINVAL;

	lock_sock(sk);

	/* Only SOL_LLC options carrying exactly one int are accepted. */
	if (unlikely(level != SOL_LLC || optlen != sizeof(int)))
		goto unlock;

	rc = copy_safe_from_sockptr(&value, sizeof(value), optval, optlen);
	if (rc)
		goto unlock;

	/*
	 * Every case starts out as -EINVAL and only flips to success once
	 * the value passed its per-option upper bound.  Timer values are
	 * scaled by HZ before being stored.
	 */
	rc = -EINVAL;
	switch (optname) {
	case LLC_OPT_RETRY:
		if (value <= LLC_OPT_MAX_RETRY) {
			llc->n2 = value;
			rc = 0;
		}
		break;
	case LLC_OPT_SIZE:
		if (value <= LLC_OPT_MAX_SIZE) {
			llc->n1 = value;
			rc = 0;
		}
		break;
	case LLC_OPT_ACK_TMR_EXP:
		if (value <= LLC_OPT_MAX_ACK_TMR_EXP) {
			llc->ack_timer.expire = value * HZ;
			rc = 0;
		}
		break;
	case LLC_OPT_P_TMR_EXP:
		if (value <= LLC_OPT_MAX_P_TMR_EXP) {
			llc->pf_cycle_timer.expire = value * HZ;
			rc = 0;
		}
		break;
	case LLC_OPT_REJ_TMR_EXP:
		if (value <= LLC_OPT_MAX_REJ_TMR_EXP) {
			llc->rej_sent_timer.expire = value * HZ;
			rc = 0;
		}
		break;
	case LLC_OPT_BUSY_TMR_EXP:
		if (value <= LLC_OPT_MAX_BUSY_TMR_EXP) {
			llc->busy_state_timer.expire = value * HZ;
			rc = 0;
		}
		break;
	case LLC_OPT_TX_WIN:
		if (value <= LLC_OPT_MAX_WIN) {
			llc->k = value;
			rc = 0;
		}
		break;
	case LLC_OPT_RX_WIN:
		if (value <= LLC_OPT_MAX_WIN) {
			llc->rw = value;
			rc = 0;
		}
		break;
	case LLC_OPT_PKTINFO:
		/* Boolean option: any non-zero value sets the flag. */
		if (value)
			llc->cmsg_flags |= LLC_CMSG_PKTINFO;
		else
			llc->cmsg_flags &= ~LLC_CMSG_PKTINFO;
		rc = 0;
		break;
	default:
		rc = -ENOPROTOOPT;
		break;
	}

unlock:
	release_sock(sk);
	return rc;
}

/**
 *	llc_ui_getsockopt - get connection specific socket info
 *	@sock: Socket to get information from.
 *	@level: Socket level user is requesting operations on.
 *	@optname: Operation name.
 *	@optval: Variable to return operation data in.
 *	@optlen: Length of optval.
 *
 *	Get connection specific socket information.
*/ static int llc_ui_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int val = 0, len = 0, rc = -EINVAL; lock_sock(sk); if (unlikely(level != SOL_LLC)) goto out; rc = get_user(len, optlen); if (rc) goto out; rc = -EINVAL; if (len != sizeof(int)) goto out; switch (optname) { case LLC_OPT_RETRY: val = llc->n2; break; case LLC_OPT_SIZE: val = llc->n1; break; case LLC_OPT_ACK_TMR_EXP: val = llc->ack_timer.expire / HZ; break; case LLC_OPT_P_TMR_EXP: val = llc->pf_cycle_timer.expire / HZ; break; case LLC_OPT_REJ_TMR_EXP: val = llc->rej_sent_timer.expire / HZ; break; case LLC_OPT_BUSY_TMR_EXP: val = llc->busy_state_timer.expire / HZ; break; case LLC_OPT_TX_WIN: val = llc->k; break; case LLC_OPT_RX_WIN: val = llc->rw; break; case LLC_OPT_PKTINFO: val = (llc->cmsg_flags & LLC_CMSG_PKTINFO) != 0; break; default: rc = -ENOPROTOOPT; goto out; } rc = 0; if (put_user(len, optlen) || copy_to_user(optval, &val, len)) rc = -EFAULT; out: release_sock(sk); return rc; } static const struct net_proto_family llc_ui_family_ops = { .family = PF_LLC, .create = llc_ui_create, .owner = THIS_MODULE, }; static const struct proto_ops llc_ui_ops = { .family = PF_LLC, .owner = THIS_MODULE, .release = llc_ui_release, .bind = llc_ui_bind, .connect = llc_ui_connect, .socketpair = sock_no_socketpair, .accept = llc_ui_accept, .getname = llc_ui_getname, .poll = datagram_poll, .ioctl = llc_ui_ioctl, .listen = llc_ui_listen, .shutdown = llc_ui_shutdown, .setsockopt = llc_ui_setsockopt, .getsockopt = llc_ui_getsockopt, .sendmsg = llc_ui_sendmsg, .recvmsg = llc_ui_recvmsg, .mmap = sock_no_mmap, }; static const char llc_proc_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the proc_fs entries\n"; static const char llc_sysctl_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the sysctl entries\n"; static const char llc_sock_err_msg[] __initconst = KERN_CRIT "LLC: Unable to 
register the network family\n"; static int __init llc2_init(void) { int rc = proto_register(&llc_proto, 0); if (rc != 0) goto out; llc_build_offset_table(); llc_station_init(); llc_ui_sap_last_autoport = LLC_SAP_DYN_START; rc = llc_proc_init(); if (rc != 0) { printk(llc_proc_err_msg); goto out_station; } rc = llc_sysctl_init(); if (rc) { printk(llc_sysctl_err_msg); goto out_proc; } rc = sock_register(&llc_ui_family_ops); if (rc) { printk(llc_sock_err_msg); goto out_sysctl; } llc_add_pack(LLC_DEST_SAP, llc_sap_handler); llc_add_pack(LLC_DEST_CONN, llc_conn_handler); out: return rc; out_sysctl: llc_sysctl_exit(); out_proc: llc_proc_exit(); out_station: llc_station_exit(); proto_unregister(&llc_proto); goto out; } static void __exit llc2_exit(void) { llc_station_exit(); llc_remove_pack(LLC_DEST_SAP); llc_remove_pack(LLC_DEST_CONN); sock_unregister(PF_LLC); llc_proc_exit(); llc_sysctl_exit(); proto_unregister(&llc_proto); } module_init(llc2_init); module_exit(llc2_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003"); MODULE_DESCRIPTION("IEEE 802.2 PF_LLC support"); MODULE_ALIAS_NETPROTO(PF_LLC); |
| 17 17 12 12 6 6 3 3 6 10 5 4 1 3 2 1 1 3 2 1 14 6 16 14 14 7 1 1 1 3 2 12 14 13 1 7 4 3 7 1 1 2 1 1 1 1 1 29 30 1 2 27 1 6 14 20 7 26 31 1 30 5 1 2 1 2 2 2 8 1 2 7 1 7 7 7 1 7 7 7 7 7 1 1 1 7 1 2 4 1 3 3 2 1 2 2 2 5 2 1 3 3 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 
469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 
969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 
1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 
1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 
2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 
2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 
2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 | // SPDX-License-Identifier: GPL-2.0-only /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. */ #include <linux/vmw_vmci_defs.h> #include <linux/vmw_vmci_api.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/pagemap.h> #include <linux/pci.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/uio.h> #include <linux/wait.h> #include <linux/vmalloc.h> #include <linux/skbuff.h> #include "vmci_handle_array.h" #include "vmci_queue_pair.h" #include "vmci_datagram.h" #include "vmci_resource.h" #include "vmci_context.h" #include "vmci_driver.h" #include "vmci_event.h" #include "vmci_route.h" /* * In the following, we will distinguish between two kinds of VMX processes - * the ones with versions lower than VMCI_VERSION_NOVMVM that use specialized * VMCI page files in the VMX and supporting VM to VM communication and the * newer ones that use the guest memory directly. We will in the following * refer to the older VMX versions as old-style VMX'en, and the newer ones as * new-style VMX'en. 
*
 * The state transition diagram is as follows (the VMCIQPB_ prefix has been
 * removed for readability) - see below for more details on the transitions:
 *
 *            --------------  NEW  -------------
 *            |                                |
 *           \_/                              \_/
 *     CREATED_NO_MEM <-----------------> CREATED_MEM
 *            |    |                           |
 *            |    o-----------------------o   |
 *            |                            |   |
 *           \_/                          \_/ \_/
 *     ATTACHED_NO_MEM <----------------> ATTACHED_MEM
 *            |                            |   |
 *            |     o----------------------o   |
 *            |     |                          |
 *           \_/   \_/                        \_/
 *     SHUTDOWN_NO_MEM <----------------> SHUTDOWN_MEM
 *            |                                |
 *            |                                |
 *            -------------> gone <-------------
 *
 * In more detail. When a VMCI queue pair is first created, it will be in the
 * VMCIQPB_NEW state. It will then move into one of the following states:
 *
 * - VMCIQPB_CREATED_NO_MEM: this state indicates that either:
 *
 *     - the create was performed by a host endpoint, in which case there is
 *       no backing memory yet.
 *
 *     - the create was initiated by an old-style VMX, that uses
 *       vmci_qp_broker_set_page_store to specify the UVAs of the queue pair at
 *       a later point in time. This state can be distinguished from the one
 *       above by the context ID of the creator. A host side is not allowed to
 *       attach until the page store has been set.
 *
 * - VMCIQPB_CREATED_MEM: this state is the result when the queue pair
 *     is created by a VMX using the queue pair device backend that
 *     sets the UVAs of the queue pair immediately and stores the
 *     information for later attachers. At this point, it is ready for
 *     the host side to attach to it.
 *
 * Once the queue pair is in one of the created states (with the exception of
 * the case mentioned for older VMX'en above), it is possible to attach to the
 * queue pair. Again we have two new states possible:
 *
 * - VMCIQPB_ATTACHED_MEM: this state can be reached through the following
 *   paths:
 *
 *     - from VMCIQPB_CREATED_NO_MEM when a new-style VMX allocates a queue
 *       pair, and attaches to a queue pair previously created by the host side.
*
 *     - from VMCIQPB_CREATED_MEM when the host side attaches to a queue pair
 *       already created by a guest.
 *
 *     - from VMCIQPB_ATTACHED_NO_MEM, when an old-style VMX calls
 *       vmci_qp_broker_set_page_store (see below).
 *
 * - VMCIQPB_ATTACHED_NO_MEM: If the queue pair already was in the
 *     VMCIQPB_CREATED_NO_MEM due to a host side create, an old-style VMX will
 *     bring the queue pair into this state. Once vmci_qp_broker_set_page_store
 *     is called to register the user memory, the VMCIQPB_ATTACHED_MEM state
 *     will be entered.
 *
 * From the attached queue pair, the queue pair can enter the shutdown states
 * when either side of the queue pair detaches. If the guest side detaches
 * first, the queue pair will enter the VMCIQPB_SHUTDOWN_NO_MEM state, where
 * the content of the queue pair will no longer be available. If the host
 * side detaches first, the queue pair will either enter the
 * VMCIQPB_SHUTDOWN_MEM, if the guest memory is currently mapped, or
 * VMCIQPB_SHUTDOWN_NO_MEM, if the guest memory is not mapped
 * (e.g., the host detaches while a guest is stunned).
 *
 * New-style VMX'en will also unmap guest memory, if the guest is
 * quiesced, e.g., during a snapshot operation. In that case, the guest
 * memory will no longer be available, and the queue pair will transition from
 * *_MEM state to a *_NO_MEM state. The VMX may later map the memory once more,
 * in which case the queue pair will transition from the *_NO_MEM state at that
 * point back to the *_MEM state. Note that the *_NO_MEM state may have changed,
 * since the peer may have either attached or detached in the meantime. The
 * values are laid out such that ++ on a state will move from a *_NO_MEM to a
 * *_MEM state, and vice versa.
 */

/*
 * The Kernel specific component of the struct vmci_queue structure.
 *
 * Exactly one arm of the union is meaningful, selected by @host: the guest
 * arm holds per-page DMA and kernel virtual addresses (filled in by
 * qp_alloc_queue()), the host arm holds struct page pointers.
 */
struct vmci_queue_kern_if {
	struct mutex __mutex;	/* Protects the queue. */
	struct mutex *mutex;	/* Shared by producer and consumer queues. */
	size_t num_pages;	/* Number of pages incl. header. */
	bool host;		/* Host or guest? */
	union {
		struct {
			/* DMA address of each page; index 0 is the header. */
			dma_addr_t *pas;
			/* Kernel VA of each page; index 0 is the header. */
			void **vas;
		} g;		/* Used by the guest. */
		struct {
			/* Data pages, indexed by queue offset / PAGE_SIZE. */
			struct page **page;
			struct page **header_page;
		} h;		/* Used by the host. */
	} u;
};

/*
 * This structure is opaque to the clients.
 */
struct vmci_qp {
	struct vmci_handle handle;
	struct vmci_queue *produce_q;
	struct vmci_queue *consume_q;
	u64 produce_q_size;
	u64 consume_q_size;
	u32 peer;
	u32 flags;
	u32 priv_flags;
	bool guest_endpoint;
	unsigned int blocked;
	unsigned int generation;
	wait_queue_head_t event;
};

/*
 * Broker-side queue pair states.  The order is significant: every *_NO_MEM
 * value is immediately followed by its *_MEM counterpart, which is what the
 * "++ on a state" remark in the description above relies on.
 */
enum qp_broker_state {
	VMCIQPB_NEW,
	VMCIQPB_CREATED_NO_MEM,
	VMCIQPB_CREATED_MEM,
	VMCIQPB_ATTACHED_NO_MEM,
	VMCIQPB_ATTACHED_MEM,
	VMCIQPB_SHUTDOWN_NO_MEM,
	VMCIQPB_SHUTDOWN_MEM,
	VMCIQPB_GONE
};

/* True when the broker entry is in any of the three *_MEM states. */
#define QPBROKERSTATE_HAS_MEM(_qpb) (_qpb->state == VMCIQPB_CREATED_MEM || \
				     _qpb->state == VMCIQPB_ATTACHED_MEM || \
				     _qpb->state == VMCIQPB_SHUTDOWN_MEM)

/*
 * In the queue pair broker, we always use the guest point of view for
 * the produce and consume queue values and references, e.g., the
 * produce queue size stored is the guest's produce queue size. The
 * host endpoint will need to swap these around. The only exception is
 * the local queue pairs on the host, in which case the host endpoint
 * that creates the queue pair will have the right orientation, and
 * the attaching host endpoint will need to swap.
*/ struct qp_entry { struct list_head list_item; struct vmci_handle handle; u32 peer; u32 flags; u64 produce_size; u64 consume_size; u32 ref_count; }; struct qp_broker_entry { struct vmci_resource resource; struct qp_entry qp; u32 create_id; u32 attach_id; enum qp_broker_state state; bool require_trusted_attach; bool created_by_trusted; bool vmci_page_files; /* Created by VMX using VMCI page files */ struct vmci_queue *produce_q; struct vmci_queue *consume_q; struct vmci_queue_header saved_produce_q; struct vmci_queue_header saved_consume_q; vmci_event_release_cb wakeup_cb; void *client_data; void *local_mem; /* Kernel memory for local queue pair */ }; struct qp_guest_endpoint { struct vmci_resource resource; struct qp_entry qp; u64 num_ppns; void *produce_q; void *consume_q; struct ppn_set ppn_set; }; struct qp_list { struct list_head head; struct mutex mutex; /* Protect queue list. */ }; static struct qp_list qp_broker_list = { .head = LIST_HEAD_INIT(qp_broker_list.head), .mutex = __MUTEX_INITIALIZER(qp_broker_list.mutex), }; static struct qp_list qp_guest_endpoints = { .head = LIST_HEAD_INIT(qp_guest_endpoints.head), .mutex = __MUTEX_INITIALIZER(qp_guest_endpoints.mutex), }; #define INVALID_VMCI_GUEST_MEM_ID 0 #define QPE_NUM_PAGES(_QPE) ((u32) \ (DIV_ROUND_UP(_QPE.produce_size, PAGE_SIZE) + \ DIV_ROUND_UP(_QPE.consume_size, PAGE_SIZE) + 2)) #define QP_SIZES_ARE_VALID(_prod_qsize, _cons_qsize) \ ((_prod_qsize) + (_cons_qsize) >= max(_prod_qsize, _cons_qsize) && \ (_prod_qsize) + (_cons_qsize) <= VMCI_MAX_GUEST_QP_MEMORY) /* * Frees kernel VA space for a given queue and its queue header, and * frees physical data pages. */ static void qp_free_queue(void *q, u64 size) { struct vmci_queue *queue = q; if (queue) { u64 i; /* Given size does not include header, so add in a page here. 
*/ for (i = 0; i < DIV_ROUND_UP(size, PAGE_SIZE) + 1; i++) { dma_free_coherent(&vmci_pdev->dev, PAGE_SIZE, queue->kernel_if->u.g.vas[i], queue->kernel_if->u.g.pas[i]); } vfree(queue); } } /* * Allocates kernel queue pages of specified size with IOMMU mappings, * plus space for the queue structure/kernel interface and the queue * header. */ static void *qp_alloc_queue(u64 size, u32 flags) { u64 i; struct vmci_queue *queue; size_t pas_size; size_t vas_size; size_t queue_size = sizeof(*queue) + sizeof(*queue->kernel_if); u64 num_pages; if (size > SIZE_MAX - PAGE_SIZE) return NULL; num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; if (num_pages > (SIZE_MAX - queue_size) / (sizeof(*queue->kernel_if->u.g.pas) + sizeof(*queue->kernel_if->u.g.vas))) return NULL; pas_size = num_pages * sizeof(*queue->kernel_if->u.g.pas); vas_size = num_pages * sizeof(*queue->kernel_if->u.g.vas); queue_size += pas_size + vas_size; queue = vmalloc(queue_size); if (!queue) return NULL; queue->q_header = NULL; queue->saved_header = NULL; queue->kernel_if = (struct vmci_queue_kern_if *)(queue + 1); queue->kernel_if->mutex = NULL; queue->kernel_if->num_pages = num_pages; queue->kernel_if->u.g.pas = (dma_addr_t *)(queue->kernel_if + 1); queue->kernel_if->u.g.vas = (void **)((u8 *)queue->kernel_if->u.g.pas + pas_size); queue->kernel_if->host = false; for (i = 0; i < num_pages; i++) { queue->kernel_if->u.g.vas[i] = dma_alloc_coherent(&vmci_pdev->dev, PAGE_SIZE, &queue->kernel_if->u.g.pas[i], GFP_KERNEL); if (!queue->kernel_if->u.g.vas[i]) { /* Size excl. the header. */ qp_free_queue(queue, i * PAGE_SIZE); return NULL; } } /* Queue header is the first page. */ queue->q_header = queue->kernel_if->u.g.vas[0]; return queue; } /* * Copies from a given buffer or iovector to a VMCI Queue. Uses * kmap_local_page() to dynamically map required portions of the queue * by traversing the offset -> page translation structure for the queue. * Assumes that offset + size does not wrap around in the queue. 
*/ static int qp_memcpy_to_queue_iter(struct vmci_queue *queue, u64 queue_offset, struct iov_iter *from, size_t size) { struct vmci_queue_kern_if *kernel_if = queue->kernel_if; size_t bytes_copied = 0; while (bytes_copied < size) { const u64 page_index = (queue_offset + bytes_copied) / PAGE_SIZE; const size_t page_offset = (queue_offset + bytes_copied) & (PAGE_SIZE - 1); void *va; size_t to_copy; if (kernel_if->host) va = kmap_local_page(kernel_if->u.h.page[page_index]); else va = kernel_if->u.g.vas[page_index + 1]; /* Skip header. */ if (size - bytes_copied > PAGE_SIZE - page_offset) /* Enough payload to fill up from this page. */ to_copy = PAGE_SIZE - page_offset; else to_copy = size - bytes_copied; if (!copy_from_iter_full((u8 *)va + page_offset, to_copy, from)) { if (kernel_if->host) kunmap_local(va); return VMCI_ERROR_INVALID_ARGS; } bytes_copied += to_copy; if (kernel_if->host) kunmap_local(va); } return VMCI_SUCCESS; } /* * Copies to a given buffer or iovector from a VMCI Queue. Uses * kmap_local_page() to dynamically map required portions of the queue * by traversing the offset -> page translation structure for the queue. * Assumes that offset + size does not wrap around in the queue. */ static int qp_memcpy_from_queue_iter(struct iov_iter *to, const struct vmci_queue *queue, u64 queue_offset, size_t size) { struct vmci_queue_kern_if *kernel_if = queue->kernel_if; size_t bytes_copied = 0; while (bytes_copied < size) { const u64 page_index = (queue_offset + bytes_copied) / PAGE_SIZE; const size_t page_offset = (queue_offset + bytes_copied) & (PAGE_SIZE - 1); void *va; size_t to_copy; int err; if (kernel_if->host) va = kmap_local_page(kernel_if->u.h.page[page_index]); else va = kernel_if->u.g.vas[page_index + 1]; /* Skip header. */ if (size - bytes_copied > PAGE_SIZE - page_offset) /* Enough payload to fill up this page. 
*/ to_copy = PAGE_SIZE - page_offset; else to_copy = size - bytes_copied; err = copy_to_iter((u8 *)va + page_offset, to_copy, to); if (err != to_copy) { if (kernel_if->host) kunmap_local(va); return VMCI_ERROR_INVALID_ARGS; } bytes_copied += to_copy; if (kernel_if->host) kunmap_local(va); } return VMCI_SUCCESS; } /* * Allocates two list of PPNs --- one for the pages in the produce queue, * and the other for the pages in the consume queue. Intializes the list * of PPNs with the page frame numbers of the KVA for the two queues (and * the queue headers). */ static int qp_alloc_ppn_set(void *prod_q, u64 num_produce_pages, void *cons_q, u64 num_consume_pages, struct ppn_set *ppn_set) { u64 *produce_ppns; u64 *consume_ppns; struct vmci_queue *produce_q = prod_q; struct vmci_queue *consume_q = cons_q; u64 i; if (!produce_q || !num_produce_pages || !consume_q || !num_consume_pages || !ppn_set) return VMCI_ERROR_INVALID_ARGS; if (ppn_set->initialized) return VMCI_ERROR_ALREADY_EXISTS; produce_ppns = kmalloc_array(num_produce_pages, sizeof(*produce_ppns), GFP_KERNEL); if (!produce_ppns) return VMCI_ERROR_NO_MEM; consume_ppns = kmalloc_array(num_consume_pages, sizeof(*consume_ppns), GFP_KERNEL); if (!consume_ppns) { kfree(produce_ppns); return VMCI_ERROR_NO_MEM; } for (i = 0; i < num_produce_pages; i++) produce_ppns[i] = produce_q->kernel_if->u.g.pas[i] >> PAGE_SHIFT; for (i = 0; i < num_consume_pages; i++) consume_ppns[i] = consume_q->kernel_if->u.g.pas[i] >> PAGE_SHIFT; ppn_set->num_produce_pages = num_produce_pages; ppn_set->num_consume_pages = num_consume_pages; ppn_set->produce_ppns = produce_ppns; ppn_set->consume_ppns = consume_ppns; ppn_set->initialized = true; return VMCI_SUCCESS; } /* * Frees the two list of PPNs for a queue pair. */ static void qp_free_ppn_set(struct ppn_set *ppn_set) { if (ppn_set->initialized) { /* Do not call these functions on NULL inputs. 
*/ kfree(ppn_set->produce_ppns); kfree(ppn_set->consume_ppns); } memset(ppn_set, 0, sizeof(*ppn_set)); } /* * Populates the list of PPNs in the hypercall structure with the PPNS * of the produce queue and the consume queue. */ static int qp_populate_ppn_set(u8 *call_buf, const struct ppn_set *ppn_set) { if (vmci_use_ppn64()) { memcpy(call_buf, ppn_set->produce_ppns, ppn_set->num_produce_pages * sizeof(*ppn_set->produce_ppns)); memcpy(call_buf + ppn_set->num_produce_pages * sizeof(*ppn_set->produce_ppns), ppn_set->consume_ppns, ppn_set->num_consume_pages * sizeof(*ppn_set->consume_ppns)); } else { int i; u32 *ppns = (u32 *) call_buf; for (i = 0; i < ppn_set->num_produce_pages; i++) ppns[i] = (u32) ppn_set->produce_ppns[i]; ppns = &ppns[ppn_set->num_produce_pages]; for (i = 0; i < ppn_set->num_consume_pages; i++) ppns[i] = (u32) ppn_set->consume_ppns[i]; } return VMCI_SUCCESS; } /* * Allocates kernel VA space of specified size plus space for the queue * and kernel interface. This is different from the guest queue allocator, * because we do not allocate our own queue header/data pages here but * share those of the guest. 
*/ static struct vmci_queue *qp_host_alloc_queue(u64 size) { struct vmci_queue *queue; size_t queue_page_size; u64 num_pages; const size_t queue_size = sizeof(*queue) + sizeof(*(queue->kernel_if)); if (size > min_t(size_t, VMCI_MAX_GUEST_QP_MEMORY, SIZE_MAX - PAGE_SIZE)) return NULL; num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; if (num_pages > (SIZE_MAX - queue_size) / sizeof(*queue->kernel_if->u.h.page)) return NULL; queue_page_size = num_pages * sizeof(*queue->kernel_if->u.h.page); if (queue_size + queue_page_size > KMALLOC_MAX_SIZE) return NULL; queue = kzalloc(queue_size + queue_page_size, GFP_KERNEL); if (queue) { queue->q_header = NULL; queue->saved_header = NULL; queue->kernel_if = (struct vmci_queue_kern_if *)(queue + 1); queue->kernel_if->host = true; queue->kernel_if->mutex = NULL; queue->kernel_if->num_pages = num_pages; queue->kernel_if->u.h.header_page = (struct page **)((u8 *)queue + queue_size); queue->kernel_if->u.h.page = &queue->kernel_if->u.h.header_page[1]; } return queue; } /* * Frees kernel memory for a given queue (header plus translation * structure). */ static void qp_host_free_queue(struct vmci_queue *queue, u64 queue_size) { kfree(queue); } /* * Initialize the mutex for the pair of queues. This mutex is used to * protect the q_header and the buffer from changing out from under any * users of either queue. Of course, it's only any good if the mutexes * are actually acquired. Queue structure must lie on non-paged memory * or we cannot guarantee access to the mutex. */ static void qp_init_queue_mutex(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { /* * Only the host queue has shared state - the guest queues do not * need to synchronize access using a queue mutex. */ if (produce_q->kernel_if->host) { produce_q->kernel_if->mutex = &produce_q->kernel_if->__mutex; consume_q->kernel_if->mutex = &produce_q->kernel_if->__mutex; mutex_init(produce_q->kernel_if->mutex); } } /* * Cleans up the mutex for the pair of queues. 
*/ static void qp_cleanup_queue_mutex(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { if (produce_q->kernel_if->host) { produce_q->kernel_if->mutex = NULL; consume_q->kernel_if->mutex = NULL; } } /* * Acquire the mutex for the queue. Note that the produce_q and * the consume_q share a mutex. So, only one of the two need to * be passed in to this routine. Either will work just fine. */ static void qp_acquire_queue_mutex(struct vmci_queue *queue) { if (queue->kernel_if->host) mutex_lock(queue->kernel_if->mutex); } /* * Release the mutex for the queue. Note that the produce_q and * the consume_q share a mutex. So, only one of the two need to * be passed in to this routine. Either will work just fine. */ static void qp_release_queue_mutex(struct vmci_queue *queue) { if (queue->kernel_if->host) mutex_unlock(queue->kernel_if->mutex); } /* * Helper function to release pages in the PageStoreAttachInfo * previously obtained using get_user_pages. */ static void qp_release_pages(struct page **pages, u64 num_pages, bool dirty) { int i; for (i = 0; i < num_pages; i++) { if (dirty) set_page_dirty_lock(pages[i]); put_page(pages[i]); pages[i] = NULL; } } /* * Lock the user pages referenced by the {produce,consume}Buffer * struct into memory and populate the {produce,consume}Pages * arrays in the attach structure with them. 
*/ static int qp_host_get_user_memory(u64 produce_uva, u64 consume_uva, struct vmci_queue *produce_q, struct vmci_queue *consume_q) { int retval; int err = VMCI_SUCCESS; retval = get_user_pages_fast((uintptr_t) produce_uva, produce_q->kernel_if->num_pages, FOLL_WRITE, produce_q->kernel_if->u.h.header_page); if (retval < (int)produce_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(produce) failed (retval=%d)", retval); if (retval > 0) qp_release_pages(produce_q->kernel_if->u.h.header_page, retval, false); err = VMCI_ERROR_NO_MEM; goto out; } retval = get_user_pages_fast((uintptr_t) consume_uva, consume_q->kernel_if->num_pages, FOLL_WRITE, consume_q->kernel_if->u.h.header_page); if (retval < (int)consume_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(consume) failed (retval=%d)", retval); if (retval > 0) qp_release_pages(consume_q->kernel_if->u.h.header_page, retval, false); qp_release_pages(produce_q->kernel_if->u.h.header_page, produce_q->kernel_if->num_pages, false); err = VMCI_ERROR_NO_MEM; } out: return err; } /* * Registers the specification of the user pages used for backing a queue * pair. Enough information to map in pages is stored in the OS specific * part of the struct vmci_queue structure. */ static int qp_host_register_user_memory(struct vmci_qp_page_store *page_store, struct vmci_queue *produce_q, struct vmci_queue *consume_q) { u64 produce_uva; u64 consume_uva; /* * The new style and the old style mapping only differs in * that we either get a single or two UVAs, so we split the * single UVA range at the appropriate spot. */ produce_uva = page_store->pages; consume_uva = page_store->pages + produce_q->kernel_if->num_pages * PAGE_SIZE; return qp_host_get_user_memory(produce_uva, consume_uva, produce_q, consume_q); } /* * Releases and removes the references to user pages stored in the attach * struct. Pages are released from the page cache and may become * swappable again. 
*/ static void qp_host_unregister_user_memory(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { qp_release_pages(produce_q->kernel_if->u.h.header_page, produce_q->kernel_if->num_pages, true); memset(produce_q->kernel_if->u.h.header_page, 0, sizeof(*produce_q->kernel_if->u.h.header_page) * produce_q->kernel_if->num_pages); qp_release_pages(consume_q->kernel_if->u.h.header_page, consume_q->kernel_if->num_pages, true); memset(consume_q->kernel_if->u.h.header_page, 0, sizeof(*consume_q->kernel_if->u.h.header_page) * consume_q->kernel_if->num_pages); } /* * Once qp_host_register_user_memory has been performed on a * queue, the queue pair headers can be mapped into the * kernel. Once mapped, they must be unmapped with * qp_host_unmap_queues prior to calling * qp_host_unregister_user_memory. * Pages are pinned. */ static int qp_host_map_queues(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { int result; if (!produce_q->q_header || !consume_q->q_header) { struct page *headers[2]; if (produce_q->q_header != consume_q->q_header) return VMCI_ERROR_QUEUEPAIR_MISMATCH; if (produce_q->kernel_if->u.h.header_page == NULL || *produce_q->kernel_if->u.h.header_page == NULL) return VMCI_ERROR_UNAVAILABLE; headers[0] = *produce_q->kernel_if->u.h.header_page; headers[1] = *consume_q->kernel_if->u.h.header_page; produce_q->q_header = vmap(headers, 2, VM_MAP, PAGE_KERNEL); if (produce_q->q_header != NULL) { consume_q->q_header = (struct vmci_queue_header *)((u8 *) produce_q->q_header + PAGE_SIZE); result = VMCI_SUCCESS; } else { pr_warn("vmap failed\n"); result = VMCI_ERROR_NO_MEM; } } else { result = VMCI_SUCCESS; } return result; } /* * Unmaps previously mapped queue pair headers from the kernel. * Pages are unpinned. 
*/ static int qp_host_unmap_queues(u32 gid, struct vmci_queue *produce_q, struct vmci_queue *consume_q) { if (produce_q->q_header) { if (produce_q->q_header < consume_q->q_header) vunmap(produce_q->q_header); else vunmap(consume_q->q_header); produce_q->q_header = NULL; consume_q->q_header = NULL; } return VMCI_SUCCESS; } /* * Finds the entry in the list corresponding to a given handle. Assumes * that the list is locked. */ static struct qp_entry *qp_list_find(struct qp_list *qp_list, struct vmci_handle handle) { struct qp_entry *entry; if (vmci_handle_is_invalid(handle)) return NULL; list_for_each_entry(entry, &qp_list->head, list_item) { if (vmci_handle_is_equal(entry->handle, handle)) return entry; } return NULL; } /* * Finds the entry in the list corresponding to a given handle. */ static struct qp_guest_endpoint * qp_guest_handle_to_entry(struct vmci_handle handle) { struct qp_guest_endpoint *entry; struct qp_entry *qp = qp_list_find(&qp_guest_endpoints, handle); entry = qp ? container_of( qp, struct qp_guest_endpoint, qp) : NULL; return entry; } /* * Finds the entry in the list corresponding to a given handle. */ static struct qp_broker_entry * qp_broker_handle_to_entry(struct vmci_handle handle) { struct qp_broker_entry *entry; struct qp_entry *qp = qp_list_find(&qp_broker_list, handle); entry = qp ? container_of( qp, struct qp_broker_entry, qp) : NULL; return entry; } /* * Dispatches a queue pair event message directly into the local event * queue. */ static int qp_notify_peer_local(bool attach, struct vmci_handle handle) { u32 context_id = vmci_get_context_id(); struct vmci_event_qp ev; memset(&ev, 0, sizeof(ev)); ev.msg.hdr.dst = vmci_make_handle(context_id, VMCI_EVENT_HANDLER); ev.msg.hdr.src = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_CONTEXT_RESOURCE_ID); ev.msg.hdr.payload_size = sizeof(ev) - sizeof(ev.msg.hdr); ev.msg.event_data.event = attach ? 
VMCI_EVENT_QP_PEER_ATTACH : VMCI_EVENT_QP_PEER_DETACH; ev.payload.peer_id = context_id; ev.payload.handle = handle; return vmci_event_dispatch(&ev.msg.hdr); } /* * Allocates and initializes a qp_guest_endpoint structure. * Allocates a queue_pair rid (and handle) iff the given entry has * an invalid handle. 0 through VMCI_RESERVED_RESOURCE_ID_MAX * are reserved handles. Assumes that the QP list mutex is held * by the caller. */ static struct qp_guest_endpoint * qp_guest_endpoint_create(struct vmci_handle handle, u32 peer, u32 flags, u64 produce_size, u64 consume_size, void *produce_q, void *consume_q) { int result; struct qp_guest_endpoint *entry; /* One page each for the queue headers. */ const u64 num_ppns = DIV_ROUND_UP(produce_size, PAGE_SIZE) + DIV_ROUND_UP(consume_size, PAGE_SIZE) + 2; if (vmci_handle_is_invalid(handle)) { u32 context_id = vmci_get_context_id(); handle = vmci_make_handle(context_id, VMCI_INVALID_ID); } entry = kzalloc_obj(*entry); if (entry) { entry->qp.peer = peer; entry->qp.flags = flags; entry->qp.produce_size = produce_size; entry->qp.consume_size = consume_size; entry->qp.ref_count = 0; entry->num_ppns = num_ppns; entry->produce_q = produce_q; entry->consume_q = consume_q; INIT_LIST_HEAD(&entry->qp.list_item); /* Add resource obj */ result = vmci_resource_add(&entry->resource, VMCI_RESOURCE_TYPE_QPAIR_GUEST, handle); entry->qp.handle = vmci_resource_handle(&entry->resource); if ((result != VMCI_SUCCESS) || qp_list_find(&qp_guest_endpoints, entry->qp.handle)) { pr_warn("Failed to add new resource (handle=0x%x:0x%x), error: %d", handle.context, handle.resource, result); kfree(entry); entry = NULL; } } return entry; } /* * Frees a qp_guest_endpoint structure. 
*/ static void qp_guest_endpoint_destroy(struct qp_guest_endpoint *entry) { qp_free_ppn_set(&entry->ppn_set); qp_cleanup_queue_mutex(entry->produce_q, entry->consume_q); qp_free_queue(entry->produce_q, entry->qp.produce_size); qp_free_queue(entry->consume_q, entry->qp.consume_size); /* Unlink from resource hash table and free callback */ vmci_resource_remove(&entry->resource); kfree(entry); } /* * Helper to make a queue_pairAlloc hypercall when the driver is * supporting a guest device. */ static int qp_alloc_hypercall(const struct qp_guest_endpoint *entry) { struct vmci_qp_alloc_msg *alloc_msg; size_t msg_size; size_t ppn_size; int result; if (!entry || entry->num_ppns <= 2) return VMCI_ERROR_INVALID_ARGS; ppn_size = vmci_use_ppn64() ? sizeof(u64) : sizeof(u32); msg_size = sizeof(*alloc_msg) + (size_t) entry->num_ppns * ppn_size; alloc_msg = kmalloc(msg_size, GFP_KERNEL); if (!alloc_msg) return VMCI_ERROR_NO_MEM; alloc_msg->hdr.dst = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_QUEUEPAIR_ALLOC); alloc_msg->hdr.src = VMCI_ANON_SRC_HANDLE; alloc_msg->hdr.payload_size = msg_size - VMCI_DG_HEADERSIZE; alloc_msg->handle = entry->qp.handle; alloc_msg->peer = entry->qp.peer; alloc_msg->flags = entry->qp.flags; alloc_msg->produce_size = entry->qp.produce_size; alloc_msg->consume_size = entry->qp.consume_size; alloc_msg->num_ppns = entry->num_ppns; result = qp_populate_ppn_set((u8 *)alloc_msg + sizeof(*alloc_msg), &entry->ppn_set); if (result == VMCI_SUCCESS) result = vmci_send_datagram(&alloc_msg->hdr); kfree(alloc_msg); return result; } /* * Helper to make a queue_pairDetach hypercall when the driver is * supporting a guest device. 
*/ static int qp_detatch_hypercall(struct vmci_handle handle) { struct vmci_qp_detach_msg detach_msg; detach_msg.hdr.dst = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_QUEUEPAIR_DETACH); detach_msg.hdr.src = VMCI_ANON_SRC_HANDLE; detach_msg.hdr.payload_size = sizeof(handle); detach_msg.handle = handle; return vmci_send_datagram(&detach_msg.hdr); } /* * Adds the given entry to the list. Assumes that the list is locked. */ static void qp_list_add_entry(struct qp_list *qp_list, struct qp_entry *entry) { if (entry) list_add(&entry->list_item, &qp_list->head); } /* * Removes the given entry from the list. Assumes that the list is locked. */ static void qp_list_remove_entry(struct qp_list *qp_list, struct qp_entry *entry) { if (entry) list_del(&entry->list_item); } /* * Helper for VMCI queue_pair detach interface. Frees the physical * pages for the queue pair. */ static int qp_detatch_guest_work(struct vmci_handle handle) { int result; struct qp_guest_endpoint *entry; u32 ref_count = ~0; /* To avoid compiler warning below */ mutex_lock(&qp_guest_endpoints.mutex); entry = qp_guest_handle_to_entry(handle); if (!entry) { mutex_unlock(&qp_guest_endpoints.mutex); return VMCI_ERROR_NOT_FOUND; } if (entry->qp.flags & VMCI_QPFLAG_LOCAL) { result = VMCI_SUCCESS; if (entry->qp.ref_count > 1) { result = qp_notify_peer_local(false, handle); /* * We can fail to notify a local queuepair * because we can't allocate. We still want * to release the entry if that happens, so * don't bail out yet. */ } } else { result = qp_detatch_hypercall(handle); if (result < VMCI_SUCCESS) { /* * We failed to notify a non-local queuepair. * That other queuepair might still be * accessing the shared memory, so don't * release the entry yet. It will get cleaned * up by VMCIqueue_pair_Exit() if necessary * (assuming we are going away, otherwise why * did this fail?). 
*/ mutex_unlock(&qp_guest_endpoints.mutex); return result; } } /* * If we get here then we either failed to notify a local queuepair, or * we succeeded in all cases. Release the entry if required. */ entry->qp.ref_count--; if (entry->qp.ref_count == 0) qp_list_remove_entry(&qp_guest_endpoints, &entry->qp); /* If we didn't remove the entry, this could change once we unlock. */ if (entry) ref_count = entry->qp.ref_count; mutex_unlock(&qp_guest_endpoints.mutex); if (ref_count == 0) qp_guest_endpoint_destroy(entry); return result; } /* * This functions handles the actual allocation of a VMCI queue * pair guest endpoint. Allocates physical pages for the queue * pair. It makes OS dependent calls through generic wrappers. */ static int qp_alloc_guest_work(struct vmci_handle *handle, struct vmci_queue **produce_q, u64 produce_size, struct vmci_queue **consume_q, u64 consume_size, u32 peer, u32 flags, u32 priv_flags) { const u64 num_produce_pages = DIV_ROUND_UP(produce_size, PAGE_SIZE) + 1; const u64 num_consume_pages = DIV_ROUND_UP(consume_size, PAGE_SIZE) + 1; void *my_produce_q = NULL; void *my_consume_q = NULL; int result; struct qp_guest_endpoint *queue_pair_entry = NULL; if (priv_flags != VMCI_NO_PRIVILEGE_FLAGS) return VMCI_ERROR_NO_ACCESS; mutex_lock(&qp_guest_endpoints.mutex); queue_pair_entry = qp_guest_handle_to_entry(*handle); if (queue_pair_entry) { if (queue_pair_entry->qp.flags & VMCI_QPFLAG_LOCAL) { /* Local attach case. */ if (queue_pair_entry->qp.ref_count > 1) { pr_devel("Error attempting to attach more than once\n"); result = VMCI_ERROR_UNAVAILABLE; goto error_keep_entry; } if (queue_pair_entry->qp.produce_size != consume_size || queue_pair_entry->qp.consume_size != produce_size || queue_pair_entry->qp.flags != (flags & ~VMCI_QPFLAG_ATTACH_ONLY)) { pr_devel("Error mismatched queue pair in local attach\n"); result = VMCI_ERROR_QUEUEPAIR_MISMATCH; goto error_keep_entry; } /* * Do a local attach. 
We swap the consume and * produce queues for the attacher and deliver * an attach event. */ result = qp_notify_peer_local(true, *handle); if (result < VMCI_SUCCESS) goto error_keep_entry; my_produce_q = queue_pair_entry->consume_q; my_consume_q = queue_pair_entry->produce_q; goto out; } result = VMCI_ERROR_ALREADY_EXISTS; goto error_keep_entry; } my_produce_q = qp_alloc_queue(produce_size, flags); if (!my_produce_q) { pr_warn("Error allocating pages for produce queue\n"); result = VMCI_ERROR_NO_MEM; goto error; } my_consume_q = qp_alloc_queue(consume_size, flags); if (!my_consume_q) { pr_warn("Error allocating pages for consume queue\n"); result = VMCI_ERROR_NO_MEM; goto error; } queue_pair_entry = qp_guest_endpoint_create(*handle, peer, flags, produce_size, consume_size, my_produce_q, my_consume_q); if (!queue_pair_entry) { pr_warn("Error allocating memory in %s\n", __func__); result = VMCI_ERROR_NO_MEM; goto error; } result = qp_alloc_ppn_set(my_produce_q, num_produce_pages, my_consume_q, num_consume_pages, &queue_pair_entry->ppn_set); if (result < VMCI_SUCCESS) { pr_warn("qp_alloc_ppn_set failed\n"); goto error; } /* * It's only necessary to notify the host if this queue pair will be * attached to from another context. */ if (queue_pair_entry->qp.flags & VMCI_QPFLAG_LOCAL) { /* Local create case. */ u32 context_id = vmci_get_context_id(); /* * Enforce similar checks on local queue pairs as we * do for regular ones. The handle's context must * match the creator or attacher context id (here they * are both the current context id) and the * attach-only flag cannot exist during create. We * also ensure specified peer is this context or an * invalid one. 
*/ if (queue_pair_entry->qp.handle.context != context_id || (queue_pair_entry->qp.peer != VMCI_INVALID_ID && queue_pair_entry->qp.peer != context_id)) { result = VMCI_ERROR_NO_ACCESS; goto error; } if (queue_pair_entry->qp.flags & VMCI_QPFLAG_ATTACH_ONLY) { result = VMCI_ERROR_NOT_FOUND; goto error; } } else { result = qp_alloc_hypercall(queue_pair_entry); if (result < VMCI_SUCCESS) { pr_devel("qp_alloc_hypercall result = %d\n", result); goto error; } } qp_init_queue_mutex((struct vmci_queue *)my_produce_q, (struct vmci_queue *)my_consume_q); qp_list_add_entry(&qp_guest_endpoints, &queue_pair_entry->qp); out: queue_pair_entry->qp.ref_count++; *handle = queue_pair_entry->qp.handle; *produce_q = (struct vmci_queue *)my_produce_q; *consume_q = (struct vmci_queue *)my_consume_q; /* * We should initialize the queue pair header pages on a local * queue pair create. For non-local queue pairs, the * hypervisor initializes the header pages in the create step. */ if ((queue_pair_entry->qp.flags & VMCI_QPFLAG_LOCAL) && queue_pair_entry->qp.ref_count == 1) { vmci_q_header_init((*produce_q)->q_header, *handle); vmci_q_header_init((*consume_q)->q_header, *handle); } mutex_unlock(&qp_guest_endpoints.mutex); return VMCI_SUCCESS; error: mutex_unlock(&qp_guest_endpoints.mutex); if (queue_pair_entry) { /* The queues will be freed inside the destroy routine. */ qp_guest_endpoint_destroy(queue_pair_entry); } else { qp_free_queue(my_produce_q, produce_size); qp_free_queue(my_consume_q, consume_size); } return result; error_keep_entry: /* This path should only be used when an existing entry was found. */ mutex_unlock(&qp_guest_endpoints.mutex); return result; } /* * The first endpoint issuing a queue pair allocation will create the state * of the queue pair in the queue pair broker. * * If the creator is a guest, it will associate a VMX virtual address range * with the queue pair as specified by the page_store. 
For compatibility with * older VMX'en, that would use a separate step to set the VMX virtual * address range, the virtual address range can be registered later using * vmci_qp_broker_set_page_store. In that case, a page_store of NULL should be * used. * * If the creator is the host, a page_store of NULL should be used as well, * since the host is not able to supply a page store for the queue pair. * * For older VMX and host callers, the queue pair will be created in the * VMCIQPB_CREATED_NO_MEM state, and for current VMX callers, it will be * created in VMCOQPB_CREATED_MEM state. */ static int qp_broker_create(struct vmci_handle handle, u32 peer, u32 flags, u32 priv_flags, u64 produce_size, u64 consume_size, struct vmci_qp_page_store *page_store, struct vmci_ctx *context, vmci_event_release_cb wakeup_cb, void *client_data, struct qp_broker_entry **ent) { struct qp_broker_entry *entry = NULL; const u32 context_id = vmci_ctx_get_id(context); bool is_local = flags & VMCI_QPFLAG_LOCAL; int result; u64 guest_produce_size; u64 guest_consume_size; /* Do not create if the caller asked not to. */ if (flags & VMCI_QPFLAG_ATTACH_ONLY) return VMCI_ERROR_NOT_FOUND; /* * Creator's context ID should match handle's context ID or the creator * must allow the context in handle's context ID as the "peer". */ if (handle.context != context_id && handle.context != peer) return VMCI_ERROR_NO_ACCESS; if (VMCI_CONTEXT_IS_VM(context_id) && VMCI_CONTEXT_IS_VM(peer)) return VMCI_ERROR_DST_UNREACHABLE; /* * Creator's context ID for local queue pairs should match the * peer, if a peer is specified. 
*/ if (is_local && peer != VMCI_INVALID_ID && context_id != peer) return VMCI_ERROR_NO_ACCESS; entry = kzalloc_obj(*entry, GFP_ATOMIC); if (!entry) return VMCI_ERROR_NO_MEM; if (vmci_ctx_get_id(context) == VMCI_HOST_CONTEXT_ID && !is_local) { /* * The queue pair broker entry stores values from the guest * point of view, so a creating host side endpoint should swap * produce and consume values -- unless it is a local queue * pair, in which case no swapping is necessary, since the local * attacher will swap queues. */ guest_produce_size = consume_size; guest_consume_size = produce_size; } else { guest_produce_size = produce_size; guest_consume_size = consume_size; } entry->qp.handle = handle; entry->qp.peer = peer; entry->qp.flags = flags; entry->qp.produce_size = guest_produce_size; entry->qp.consume_size = guest_consume_size; entry->qp.ref_count = 1; entry->create_id = context_id; entry->attach_id = VMCI_INVALID_ID; entry->state = VMCIQPB_NEW; entry->require_trusted_attach = !!(context->priv_flags & VMCI_PRIVILEGE_FLAG_RESTRICTED); entry->created_by_trusted = !!(priv_flags & VMCI_PRIVILEGE_FLAG_TRUSTED); entry->vmci_page_files = false; entry->wakeup_cb = wakeup_cb; entry->client_data = client_data; entry->produce_q = qp_host_alloc_queue(guest_produce_size); if (entry->produce_q == NULL) { result = VMCI_ERROR_NO_MEM; goto error; } entry->consume_q = qp_host_alloc_queue(guest_consume_size); if (entry->consume_q == NULL) { result = VMCI_ERROR_NO_MEM; goto error; } qp_init_queue_mutex(entry->produce_q, entry->consume_q); INIT_LIST_HEAD(&entry->qp.list_item); if (is_local) { u8 *tmp; entry->local_mem = kcalloc(QPE_NUM_PAGES(entry->qp), PAGE_SIZE, GFP_KERNEL); if (entry->local_mem == NULL) { result = VMCI_ERROR_NO_MEM; goto error; } entry->state = VMCIQPB_CREATED_MEM; entry->produce_q->q_header = entry->local_mem; tmp = (u8 *)entry->local_mem + PAGE_SIZE * (DIV_ROUND_UP(entry->qp.produce_size, PAGE_SIZE) + 1); entry->consume_q->q_header = (struct vmci_queue_header *)tmp; 
} else if (page_store) { /* * The VMX already initialized the queue pair headers, so no * need for the kernel side to do that. */ result = qp_host_register_user_memory(page_store, entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) goto error; entry->state = VMCIQPB_CREATED_MEM; } else { /* * A create without a page_store may be either a host * side create (in which case we are waiting for the * guest side to supply the memory) or an old style * queue pair create (in which case we will expect a * set page store call as the next step). */ entry->state = VMCIQPB_CREATED_NO_MEM; } qp_list_add_entry(&qp_broker_list, &entry->qp); if (ent != NULL) *ent = entry; /* Add to resource obj */ result = vmci_resource_add(&entry->resource, VMCI_RESOURCE_TYPE_QPAIR_HOST, handle); if (result != VMCI_SUCCESS) { pr_warn("Failed to add new resource (handle=0x%x:0x%x), error: %d", handle.context, handle.resource, result); goto error; } entry->qp.handle = vmci_resource_handle(&entry->resource); if (is_local) { vmci_q_header_init(entry->produce_q->q_header, entry->qp.handle); vmci_q_header_init(entry->consume_q->q_header, entry->qp.handle); } vmci_ctx_qp_create(context, entry->qp.handle); return VMCI_SUCCESS; error: if (entry != NULL) { qp_host_free_queue(entry->produce_q, guest_produce_size); qp_host_free_queue(entry->consume_q, guest_consume_size); kfree(entry); } return result; } /* * Enqueues an event datagram to notify the peer VM attached to * the given queue pair handle about attach/detach event by the * given VM. Returns Payload size of datagram enqueued on * success, error code otherwise. 
*/
static int qp_notify_peer(bool attach,
			  struct vmci_handle handle,
			  u32 my_id,
			  u32 peer_id)
{
	struct vmci_event_qp event;
	int result;

	if (vmci_handle_is_invalid(handle))
		return VMCI_ERROR_INVALID_ARGS;
	if (my_id == VMCI_INVALID_ID)
		return VMCI_ERROR_INVALID_ARGS;
	if (peer_id == VMCI_INVALID_ID)
		return VMCI_ERROR_INVALID_ARGS;

	/*
	 * In vmci_ctx_enqueue_datagram() we enforce the upper limit on
	 * number of pending events from the hypervisor to a given VM
	 * otherwise a rogue VM could do an arbitrary number of attach
	 * and detach operations causing memory pressure in the host
	 * kernel.
	 */

	memset(&event, 0, sizeof(event));
	event.msg.hdr.dst = vmci_make_handle(peer_id, VMCI_EVENT_HANDLER);
	event.msg.hdr.src = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID,
					     VMCI_CONTEXT_RESOURCE_ID);
	event.msg.hdr.payload_size = sizeof(event) - sizeof(event.msg.hdr);
	if (attach)
		event.msg.event_data.event = VMCI_EVENT_QP_PEER_ATTACH;
	else
		event.msg.event_data.event = VMCI_EVENT_QP_PEER_DETACH;
	event.payload.handle = handle;
	event.payload.peer_id = my_id;

	result = vmci_datagram_dispatch(VMCI_HYPERVISOR_CONTEXT_ID,
					&event.msg.hdr, false);
	if (result < VMCI_SUCCESS)
		pr_warn("Failed to enqueue queue_pair %s event datagram for context (ID=0x%x)\n",
			attach ? "ATTACH" : "DETACH", peer_id);

	return result;
}

/*
 * The second endpoint issuing a queue pair allocation will attach to
 * the queue pair registered with the queue pair broker.
 *
 * If the attacher is a guest, it will associate a VMX virtual address
 * range with the queue pair as specified by the page_store. At this
 * point, the already attach host endpoint may start using the queue
 * pair, and an attach event is sent to it. For compatibility with
 * older VMX'en, that used a separate step to set the VMX virtual
 * address range, the virtual address range can be registered later
 * using vmci_qp_broker_set_page_store. In that case, a page_store of
 * NULL should be used, and the attach event will be generated once
 * the actual page store has been set.
 *
 * If the attacher is the host, a page_store of NULL should be used as
 * well, since the page store information is already set by the guest.
*
 * For new VMX and host callers, the queue pair will be moved to the
 * VMCIQPB_ATTACHED_MEM state, and for older VMX callers, it will be
 * moved to the VMCIQPB_ATTACHED_NO_MEM state.
 *
 * The "peer" argument is not consulted here; access control uses the
 * peer recorded in the entry (entry->qp.peer) instead.
 *
 * Returns VMCI_SUCCESS once the attacher has been recorded in the entry
 * (and stores the entry in *ent when ent is non-NULL), or the VMCI error
 * code of the first check that fails.  A failure to notify the creator
 * of the attach is only logged; it does not fail the attach.
 */
static int qp_broker_attach(struct qp_broker_entry *entry,
			    u32 peer,
			    u32 flags,
			    u32 priv_flags,
			    u64 produce_size,
			    u64 consume_size,
			    struct vmci_qp_page_store *page_store,
			    struct vmci_ctx *context,
			    vmci_event_release_cb wakeup_cb,
			    void *client_data,
			    struct qp_broker_entry **ent)
{
	const u32 context_id = vmci_ctx_get_id(context);
	bool is_local = flags & VMCI_QPFLAG_LOCAL;
	int result;

	/* Only an entry that is still in one of the CREATED states can be attached to. */
	if (entry->state != VMCIQPB_CREATED_NO_MEM &&
	    entry->state != VMCIQPB_CREATED_MEM)
		return VMCI_ERROR_UNAVAILABLE;

	if (is_local) {
		/* A local attach must come from the creator of a local queue pair. */
		if (!(entry->qp.flags & VMCI_QPFLAG_LOCAL) ||
		    context_id != entry->create_id) {
			return VMCI_ERROR_INVALID_ARGS;
		}
	} else if (context_id == entry->create_id ||
		   context_id == entry->attach_id) {
		/* A non-local attacher must not already be an endpoint. */
		return VMCI_ERROR_ALREADY_EXISTS;
	}

	/* Both ends being VMs is rejected. */
	if (VMCI_CONTEXT_IS_VM(context_id) &&
	    VMCI_CONTEXT_IS_VM(entry->create_id))
		return VMCI_ERROR_DST_UNREACHABLE;

	/*
	 * If we are attaching from a restricted context then the queuepair
	 * must have been created by a trusted endpoint.
	 */
	if ((context->priv_flags & VMCI_PRIVILEGE_FLAG_RESTRICTED) &&
	    !entry->created_by_trusted)
		return VMCI_ERROR_NO_ACCESS;

	/*
	 * If we are attaching to a queuepair that was created by a restricted
	 * context then we must be trusted.
	 */
	if (entry->require_trusted_attach &&
	    (!(priv_flags & VMCI_PRIVILEGE_FLAG_TRUSTED)))
		return VMCI_ERROR_NO_ACCESS;

	/*
	 * If the creator specifies VMCI_INVALID_ID in "peer" field, access
	 * control check is not performed.
	 */
	if (entry->qp.peer != VMCI_INVALID_ID && entry->qp.peer != context_id)
		return VMCI_ERROR_NO_ACCESS;

	if (entry->create_id == VMCI_HOST_CONTEXT_ID) {
		/*
		 * Do not attach if the caller doesn't support Host Queue Pairs
		 * and a host created this queue pair.
		 */
		if (!vmci_ctx_supports_host_qp(context))
			return VMCI_ERROR_INVALID_RESOURCE;

	} else if (context_id == VMCI_HOST_CONTEXT_ID) {
		struct vmci_ctx *create_context;
		bool supports_host_qp;

		/*
		 * Do not attach a host to a user created queue pair if that
		 * user doesn't support host queue pair end points.
		 *
		 * NOTE(review): the vmci_ctx_get() result is used without a
		 * NULL check - confirm that vmci_ctx_supports_host_qp() and
		 * vmci_ctx_put() tolerate NULL, or that the creator context
		 * cannot go away while the entry is in a CREATED state.
		 */
		create_context = vmci_ctx_get(entry->create_id);
		supports_host_qp = vmci_ctx_supports_host_qp(create_context);
		vmci_ctx_put(create_context);

		if (!supports_host_qp)
			return VMCI_ERROR_INVALID_RESOURCE;
	}

	/* Flags must agree once the asymmetric bits are masked out on each side. */
	if ((entry->qp.flags & ~VMCI_QP_ASYMM) != (flags & ~VMCI_QP_ASYMM_PEER))
		return VMCI_ERROR_QUEUEPAIR_MISMATCH;

	if (context_id != VMCI_HOST_CONTEXT_ID) {
		/*
		 * The queue pair broker entry stores values from the guest
		 * point of view, so an attaching guest should match the values
		 * stored in the entry.
		 */
		if (entry->qp.produce_size != produce_size ||
		    entry->qp.consume_size != consume_size) {
			return VMCI_ERROR_QUEUEPAIR_MISMATCH;
		}
	} else if (entry->qp.produce_size != consume_size ||
		   entry->qp.consume_size != produce_size) {
		/* For a host attacher the sizes are compared crosswise. */
		return VMCI_ERROR_QUEUEPAIR_MISMATCH;
	}

	if (context_id != VMCI_HOST_CONTEXT_ID) {
		/*
		 * If a guest attached to a queue pair, it will supply
		 * the backing memory.  If this is a pre NOVMVM vmx,
		 * the backing memory will be supplied by calling
		 * vmci_qp_broker_set_page_store() following the
		 * return of the vmci_qp_broker_alloc() call. If it is
		 * a vmx of version NOVMVM or later, the page store
		 * must be supplied as part of the
		 * vmci_qp_broker_alloc call.  Under all circumstances
		 * must the initially created queue pair not have any
		 * memory associated with it already.
		 */
		if (entry->state != VMCIQPB_CREATED_NO_MEM)
			return VMCI_ERROR_INVALID_ARGS;

		if (page_store != NULL) {
			/*
			 * Patch up host state to point to guest
			 * supplied memory. The VMX already
			 * initialized the queue pair headers, so no
			 * need for the kernel side to do that.
			 */
			result = qp_host_register_user_memory(page_store,
							      entry->produce_q,
							      entry->consume_q);
			if (result < VMCI_SUCCESS)
				return result;

			entry->state = VMCIQPB_ATTACHED_MEM;
		} else {
			entry->state = VMCIQPB_ATTACHED_NO_MEM;
		}
	} else if (entry->state == VMCIQPB_CREATED_NO_MEM) {
		/*
		 * The host side is attempting to attach to a queue
		 * pair that doesn't have any memory associated with
		 * it. This must be a pre NOVMVM vmx that hasn't set
		 * the page store information yet, or a quiesced VM.
		 */
		return VMCI_ERROR_UNAVAILABLE;
	} else {
		/* The host side has successfully attached to a queue pair. */
		entry->state = VMCIQPB_ATTACHED_MEM;
	}

	/* The creator is only told about the attach once memory is in place. */
	if (entry->state == VMCIQPB_ATTACHED_MEM) {
		result =
		    qp_notify_peer(true, entry->qp.handle, context_id,
				   entry->create_id);
		if (result < VMCI_SUCCESS)
			pr_warn("Failed to notify peer (ID=0x%x) of attach to queue pair (handle=0x%x:0x%x)\n",
				entry->create_id, entry->qp.handle.context,
				entry->qp.handle.resource);
	}

	/* Record the attacher and take its reference on the entry. */
	entry->attach_id = context_id;
	entry->qp.ref_count++;
	if (wakeup_cb) {
		entry->wakeup_cb = wakeup_cb;
		entry->client_data = client_data;
	}

	/*
	 * When attaching to local queue pairs, the context already has
	 * an entry tracking the queue pair, so don't add another one.
	 */
	if (!is_local)
		vmci_ctx_qp_create(context, entry->qp.handle);

	if (ent != NULL)
		*ent = entry;

	return VMCI_SUCCESS;
}

/*
 * queue_pair_Alloc for use when setting up queue pair endpoints
 * on the host.
*/
static int qp_broker_alloc(struct vmci_handle handle,
			   u32 peer,
			   u32 flags,
			   u32 priv_flags,
			   u64 produce_size,
			   u64 consume_size,
			   struct vmci_qp_page_store *page_store,
			   struct vmci_ctx *context,
			   vmci_event_release_cb wakeup_cb,
			   void *client_data,
			   struct qp_broker_entry **ent,
			   bool *swap)
{
	const u32 context_id = vmci_ctx_get_id(context);
	const bool is_local = flags & VMCI_QPFLAG_LOCAL;
	struct qp_broker_entry *existing = NULL;
	bool creating;
	int rc;

	/*
	 * Argument validation.  Local queue pairs are rejected outright
	 * here, so non-vmkernel hosts can never create them through the
	 * broker.
	 */
	if (vmci_handle_is_invalid(handle))
		return VMCI_ERROR_INVALID_ARGS;
	if ((flags & ~VMCI_QP_ALL_FLAGS) || is_local)
		return VMCI_ERROR_INVALID_ARGS;
	if (!produce_size && !consume_size)
		return VMCI_ERROR_INVALID_ARGS;
	if (!context || context_id == VMCI_INVALID_ID)
		return VMCI_ERROR_INVALID_ARGS;
	if (handle.context == VMCI_INVALID_ID)
		return VMCI_ERROR_INVALID_ARGS;

	/* A page store is optional, but if given it must be well formed. */
	if (page_store && !VMCI_QP_PAGESTORE_IS_WELLFORMED(page_store))
		return VMCI_ERROR_INVALID_ARGS;

	mutex_lock(&qp_broker_list.mutex);

	/* The same context may not be registered twice on one queue pair. */
	if (!is_local && vmci_ctx_qp_exists(context, handle)) {
		pr_devel("Context (ID=0x%x) already attached to queue pair (handle=0x%x:0x%x)\n",
			 context_id, handle.context, handle.resource);
		mutex_unlock(&qp_broker_list.mutex);
		return VMCI_ERROR_ALREADY_EXISTS;
	}

	/* Only a handle with a concrete resource ID can name an existing entry. */
	if (handle.resource != VMCI_INVALID_ID)
		existing = qp_broker_handle_to_entry(handle);

	/* No entry yet: create one.  Otherwise attach as the second endpoint. */
	creating = !existing;
	if (creating)
		rc = qp_broker_create(handle, peer, flags, priv_flags,
				      produce_size, consume_size, page_store,
				      context, wakeup_cb, client_data, ent);
	else
		rc = qp_broker_attach(existing, peer, flags, priv_flags,
				      produce_size, consume_size, page_store,
				      context, wakeup_cb, client_data, ent);

	mutex_unlock(&qp_broker_list.mutex);

	/*
	 * Tell the caller whether it must swap produce and consume queues:
	 * that is the case for the host context, except when it has just
	 * created a local queue pair.
	 */
	if (swap)
		*swap = context_id == VMCI_HOST_CONTEXT_ID &&
			(!creating || !is_local);

	return rc;
}

/*
 * This function implements the kernel API for allocating a queue
 * pair.
*/
static int qp_alloc_host_work(struct vmci_handle *handle,
			      struct vmci_queue **produce_q,
			      u64 produce_size,
			      struct vmci_queue **consume_q,
			      u64 consume_size,
			      u32 peer,
			      u32 flags,
			      u32 priv_flags,
			      vmci_event_release_cb wakeup_cb,
			      void *client_data)
{
	struct qp_broker_entry *qpb_entry = NULL;
	struct vmci_handle request_handle;
	struct vmci_ctx *host_ctx;
	bool swap;
	int rc;

	/*
	 * An invalid incoming handle is replaced by a host-context handle
	 * without a resource ID.
	 */
	request_handle = vmci_handle_is_invalid(*handle) ?
		vmci_make_handle(VMCI_HOST_CONTEXT_ID, VMCI_INVALID_ID) :
		*handle;

	host_ctx = vmci_ctx_get(VMCI_HOST_CONTEXT_ID);

	/* The host never supplies a page store, hence the NULL argument. */
	rc = qp_broker_alloc(request_handle, peer, flags, priv_flags,
			     produce_size, consume_size, NULL, host_ctx,
			     wakeup_cb, client_data, &qpb_entry, &swap);
	if (rc != VMCI_SUCCESS) {
		*handle = VMCI_INVALID_HANDLE;
		pr_devel("queue pair broker failed to alloc (result=%d)\n",
			 rc);
	} else {
		/*
		 * If this is a local queue pair, the attacher
		 * will swap around produce and consume
		 * queues.
		 */
		*produce_q = swap ? qpb_entry->consume_q : qpb_entry->produce_q;
		*consume_q = swap ? qpb_entry->produce_q : qpb_entry->consume_q;

		/* Report back the handle the broker entry is registered under. */
		*handle = vmci_resource_handle(&qpb_entry->resource);
	}

	vmci_ctx_put(host_ctx);
	return rc;
}

/*
 * Allocates a VMCI queue_pair. Only checks validity of input
 * arguments. The real work is done in the host or guest
 * specific function.
*/ int vmci_qp_alloc(struct vmci_handle *handle, struct vmci_queue **produce_q, u64 produce_size, struct vmci_queue **consume_q, u64 consume_size, u32 peer, u32 flags, u32 priv_flags, bool guest_endpoint, vmci_event_release_cb wakeup_cb, void *client_data) { if (!handle || !produce_q || !consume_q || (!produce_size && !consume_size) || (flags & ~VMCI_QP_ALL_FLAGS)) return VMCI_ERROR_INVALID_ARGS; if (guest_endpoint) { return qp_alloc_guest_work(handle, produce_q, produce_size, consume_q, consume_size, peer, flags, priv_flags); } else { return qp_alloc_host_work(handle, produce_q, produce_size, consume_q, consume_size, peer, flags, priv_flags, wakeup_cb, client_data); } } /* * This function implements the host kernel API for detaching from * a queue pair. */ static int qp_detatch_host_work(struct vmci_handle handle) { int result; struct vmci_ctx *context; context = vmci_ctx_get(VMCI_HOST_CONTEXT_ID); result = vmci_qp_broker_detach(handle, context); vmci_ctx_put(context); return result; } /* * Detaches from a VMCI queue_pair. Only checks validity of input argument. * Real work is done in the host or guest specific function. */ static int qp_detatch(struct vmci_handle handle, bool guest_endpoint) { if (vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; if (guest_endpoint) return qp_detatch_guest_work(handle); else return qp_detatch_host_work(handle); } /* * Returns the entry from the head of the list. Assumes that the list is * locked. 
*/ static struct qp_entry *qp_list_get_head(struct qp_list *qp_list) { if (!list_empty(&qp_list->head)) { struct qp_entry *entry = list_first_entry(&qp_list->head, struct qp_entry, list_item); return entry; } return NULL; } void vmci_qp_broker_exit(void) { struct qp_entry *entry; struct qp_broker_entry *be; mutex_lock(&qp_broker_list.mutex); while ((entry = qp_list_get_head(&qp_broker_list))) { be = (struct qp_broker_entry *)entry; qp_list_remove_entry(&qp_broker_list, entry); kfree(be); } mutex_unlock(&qp_broker_list.mutex); } /* * Requests that a queue pair be allocated with the VMCI queue * pair broker. Allocates a queue pair entry if one does not * exist. Attaches to one if it exists, and retrieves the page * files backing that queue_pair. Assumes that the queue pair * broker lock is held. */ int vmci_qp_broker_alloc(struct vmci_handle handle, u32 peer, u32 flags, u32 priv_flags, u64 produce_size, u64 consume_size, struct vmci_qp_page_store *page_store, struct vmci_ctx *context) { if (!QP_SIZES_ARE_VALID(produce_size, consume_size)) return VMCI_ERROR_NO_RESOURCES; return qp_broker_alloc(handle, peer, flags, priv_flags, produce_size, consume_size, page_store, context, NULL, NULL, NULL, NULL); } /* * VMX'en with versions lower than VMCI_VERSION_NOVMVM use a separate * step to add the UVAs of the VMX mapping of the queue pair. This function * provides backwards compatibility with such VMX'en, and takes care of * registering the page store for a queue pair previously allocated by the * VMX during create or attach. This function will move the queue pair state * to either from VMCIQBP_CREATED_NO_MEM to VMCIQBP_CREATED_MEM or * VMCIQBP_ATTACHED_NO_MEM to VMCIQBP_ATTACHED_MEM. If moving to the * attached state with memory, the queue pair is ready to be used by the * host peer, and an attached event will be generated. * * Assumes that the queue pair broker lock is held. 
*
 * This function is only used by the hosted platform, since there is no
 * issue with backwards compatibility for vmkernel.
 *
 * Returns VMCI_SUCCESS when the page store was registered and mapped,
 * otherwise a VMCI error code (invalid arguments, queue pair not found,
 * caller not allowed to set the page store, wrong state, or the error
 * reported while getting or mapping the user memory).
 */
int vmci_qp_broker_set_page_store(struct vmci_handle handle,
				  u64 produce_uva,
				  u64 consume_uva,
				  struct vmci_ctx *context)
{
	struct qp_broker_entry *entry;
	int result;
	const u32 context_id = vmci_ctx_get_id(context);

	if (vmci_handle_is_invalid(handle) || !context ||
	    context_id == VMCI_INVALID_ID)
		return VMCI_ERROR_INVALID_ARGS;

	/*
	 * We only support guest to host queue pairs, so the VMX must
	 * supply UVAs for the mapped page files.
	 */

	if (produce_uva == 0 || consume_uva == 0)
		return VMCI_ERROR_INVALID_ARGS;

	/* From here on every exit goes through "out" to drop the broker lock. */
	mutex_lock(&qp_broker_list.mutex);

	if (!vmci_ctx_qp_exists(context, handle)) {
		pr_warn("Context (ID=0x%x) not attached to queue pair (handle=0x%x:0x%x)\n",
			context_id, handle.context, handle.resource);
		result = VMCI_ERROR_NOT_FOUND;
		goto out;
	}

	entry = qp_broker_handle_to_entry(handle);
	if (!entry) {
		result = VMCI_ERROR_NOT_FOUND;
		goto out;
	}

	/*
	 * If I'm the owner then I can set the page store.
	 *
	 * Or, if a host created the queue_pair and I'm the attached peer
	 * then I can set the page store.
	 */
	if (entry->create_id != context_id &&
	    (entry->create_id != VMCI_HOST_CONTEXT_ID ||
	     entry->attach_id != context_id)) {
		result = VMCI_ERROR_QUEUEPAIR_NOTOWNER;
		goto out;
	}

	/* A page store can only be set while the queue pair has no memory yet. */
	if (entry->state != VMCIQPB_CREATED_NO_MEM &&
	    entry->state != VMCIQPB_ATTACHED_NO_MEM) {
		result = VMCI_ERROR_UNAVAILABLE;
		goto out;
	}

	result = qp_host_get_user_memory(produce_uva, consume_uva,
					 entry->produce_q, entry->consume_q);
	if (result < VMCI_SUCCESS)
		goto out;

	result = qp_host_map_queues(entry->produce_q, entry->consume_q);
	if (result < VMCI_SUCCESS) {
		/* Undo the user memory registration done just above. */
		qp_host_unregister_user_memory(entry->produce_q,
					       entry->consume_q);
		goto out;
	}

	/* Promote the state from its *_NO_MEM variant to the *_MEM variant. */
	if (entry->state == VMCIQPB_CREATED_NO_MEM)
		entry->state = VMCIQPB_CREATED_MEM;
	else
		entry->state = VMCIQPB_ATTACHED_MEM;

	entry->vmci_page_files = true;

	if (entry->state == VMCIQPB_ATTACHED_MEM) {
		/* A failed notification is logged but does not fail the call. */
		result =
		    qp_notify_peer(true, handle, context_id, entry->create_id);
		if (result < VMCI_SUCCESS) {
			pr_warn("Failed to notify peer (ID=0x%x) of attach to queue pair (handle=0x%x:0x%x)\n",
				entry->create_id, entry->qp.handle.context,
				entry->qp.handle.resource);
		}
	}

	result = VMCI_SUCCESS;
 out:
	mutex_unlock(&qp_broker_list.mutex);
	return result;
}

/*
 * Resets saved queue headers for the given QP broker
 * entry. Should be used when guest memory becomes available
 * again, or the guest detaches.
 *
 * Only the saved_header pointers of both queues are cleared.
 */
static void qp_reset_saved_headers(struct qp_broker_entry *entry)
{
	entry->produce_q->saved_header = NULL;
	entry->consume_q->saved_header = NULL;
}

/*
 * The main entry point for detaching from a queue pair registered with the
 * queue pair broker. If more than one endpoint is attached to the queue
 * pair, the first endpoint will mainly decrement a reference count and
 * generate a notification to its peer. The last endpoint will clean up
 * the queue pair state registered with the broker.
 *
 * When a guest endpoint detaches, it will unmap and unregister the guest
 * memory backing the queue pair. If the host is still attached, it will
 * no longer be able to access the queue pair content.
*
 * If the queue pair is already in a state where there is no memory
 * registered for the queue pair (any *_NO_MEM state), it will transition to
 * the VMCIQPB_SHUTDOWN_NO_MEM state. This will also happen, if a guest
 * endpoint is the first of two endpoints to detach. If the host endpoint is
 * the first out of two to detach, the queue pair will move to the
 * VMCIQPB_SHUTDOWN_MEM state.
 *
 * Returns VMCI_SUCCESS on success, VMCI_ERROR_INVALID_ARGS for a bad
 * handle or context, VMCI_ERROR_NOT_FOUND if the context or the broker
 * does not know the queue pair, and VMCI_ERROR_QUEUEPAIR_NOTATTACHED if
 * the context is neither the creator nor the attacher.
 */
int vmci_qp_broker_detach(struct vmci_handle handle, struct vmci_ctx *context)
{
	struct qp_broker_entry *entry;
	const u32 context_id = vmci_ctx_get_id(context);
	u32 peer_id;
	bool is_local = false;
	int result;

	if (vmci_handle_is_invalid(handle) || !context ||
	    context_id == VMCI_INVALID_ID) {
		return VMCI_ERROR_INVALID_ARGS;
	}

	/* From here on every exit goes through "out" to drop the broker lock. */
	mutex_lock(&qp_broker_list.mutex);

	if (!vmci_ctx_qp_exists(context, handle)) {
		pr_devel("Context (ID=0x%x) not attached to queue pair (handle=0x%x:0x%x)\n",
			 context_id, handle.context, handle.resource);
		result = VMCI_ERROR_NOT_FOUND;
		goto out;
	}

	entry = qp_broker_handle_to_entry(handle);
	if (!entry) {
		pr_devel("Context (ID=0x%x) reports being attached to queue pair(handle=0x%x:0x%x) that isn't present in broker\n",
			 context_id, handle.context, handle.resource);
		result = VMCI_ERROR_NOT_FOUND;
		goto out;
	}

	if (context_id != entry->create_id && context_id != entry->attach_id) {
		result = VMCI_ERROR_QUEUEPAIR_NOTATTACHED;
		goto out;
	}

	/*
	 * Remember who the other endpoint is, then clear the slot that
	 * belonged to the detaching context and drop its reference.
	 */
	if (context_id == entry->create_id) {
		peer_id = entry->attach_id;
		entry->create_id = VMCI_INVALID_ID;
	} else {
		peer_id = entry->create_id;
		entry->attach_id = VMCI_INVALID_ID;
	}
	entry->qp.ref_count--;

	is_local = entry->qp.flags & VMCI_QPFLAG_LOCAL;

	if (context_id != VMCI_HOST_CONTEXT_ID) {
		bool headers_mapped;

		/*
		 * Pre NOVMVM vmx'en may detach from a queue pair
		 * before setting the page store, and in that case
		 * there is no user memory to detach from. Also, more
		 * recent VMX'en may detach from a queue pair in the
		 * quiesced state.
		 */

		qp_acquire_queue_mutex(entry->produce_q);
		/* Sampled before the unmap below, which may change q_header. */
		headers_mapped = entry->produce_q->q_header ||
		    entry->consume_q->q_header;
		if (QPBROKERSTATE_HAS_MEM(entry)) {
			result =
			    qp_host_unmap_queues(INVALID_VMCI_GUEST_MEM_ID,
						 entry->produce_q,
						 entry->consume_q);
			/* An unmap failure is logged only; teardown continues. */
			if (result < VMCI_SUCCESS)
				pr_warn("Failed to unmap queue headers for queue pair (handle=0x%x:0x%x,result=%d)\n",
					handle.context, handle.resource,
					result);

			qp_host_unregister_user_memory(entry->produce_q,
						       entry->consume_q);

		}

		if (!headers_mapped)
			qp_reset_saved_headers(entry);

		qp_release_queue_mutex(entry->produce_q);

		/* The callback is invoked after the queue mutex is released. */
		if (!headers_mapped && entry->wakeup_cb)
			entry->wakeup_cb(entry->client_data);
	} else {
		/* The host is leaving: forget its wakeup callback and data. */
		if (entry->wakeup_cb) {
			entry->wakeup_cb = NULL;
			entry->client_data = NULL;
		}
	}

	if (entry->qp.ref_count == 0) {
		/* Last endpoint gone: tear the broker entry down completely. */
		qp_list_remove_entry(&qp_broker_list, &entry->qp);

		if (is_local)
			kfree(entry->local_mem);

		qp_cleanup_queue_mutex(entry->produce_q, entry->consume_q);
		qp_host_free_queue(entry->produce_q, entry->qp.produce_size);
		qp_host_free_queue(entry->consume_q, entry->qp.consume_size);
		/* Unlink from resource hash table and free callback */
		vmci_resource_remove(&entry->resource);

		kfree(entry);

		vmci_ctx_qp_destroy(context, handle);
	} else {
		/* One endpoint remains: tell it, and move to a SHUTDOWN state. */
		qp_notify_peer(false, handle, context_id, peer_id);
		if (context_id == VMCI_HOST_CONTEXT_ID &&
		    QPBROKERSTATE_HAS_MEM(entry)) {
			entry->state = VMCIQPB_SHUTDOWN_MEM;
		} else {
			entry->state = VMCIQPB_SHUTDOWN_NO_MEM;
		}

		if (!is_local)
			vmci_ctx_qp_destroy(context, handle);

	}
	result = VMCI_SUCCESS;
 out:
	mutex_unlock(&qp_broker_list.mutex);
	return result;
}

/*
 * Establishes the necessary mappings for a queue pair given a
 * reference to the queue pair guest memory. This is usually
 * called when a guest is unquiesced and the VMX is allowed to
 * map guest memory once again.
*/
int vmci_qp_broker_map(struct vmci_handle handle,
		       struct vmci_ctx *context,
		       u64 guest_mem)
{
	const u32 context_id = vmci_ctx_get_id(context);
	struct vmci_qp_page_store page_store;
	struct qp_broker_entry *qpb;
	int rc = VMCI_ERROR_NOT_FOUND;

	if (vmci_handle_is_invalid(handle) || !context ||
	    context_id == VMCI_INVALID_ID)
		return VMCI_ERROR_INVALID_ARGS;

	mutex_lock(&qp_broker_list.mutex);

	/* Both lookups below fail with the preset VMCI_ERROR_NOT_FOUND. */
	if (!vmci_ctx_qp_exists(context, handle)) {
		pr_devel("Context (ID=0x%x) not attached to queue pair (handle=0x%x:0x%x)\n",
			 context_id, handle.context, handle.resource);
		goto unlock;
	}

	qpb = qp_broker_handle_to_entry(handle);
	if (!qpb) {
		pr_devel("Context (ID=0x%x) reports being attached to queue pair (handle=0x%x:0x%x) that isn't present in broker\n",
			 context_id, handle.context, handle.resource);
		goto unlock;
	}

	/* Only the creator or the attacher may map the queue pair. */
	if (context_id != qpb->create_id && context_id != qpb->attach_id) {
		rc = VMCI_ERROR_QUEUEPAIR_NOTATTACHED;
		goto unlock;
	}

	/*
	 * Nothing to do for the host context, or when memory is already
	 * registered for the queue pair: that counts as success.
	 */
	rc = VMCI_SUCCESS;
	if (context_id == VMCI_HOST_CONTEXT_ID || QPBROKERSTATE_HAS_MEM(qpb))
		goto unlock;

	page_store.pages = guest_mem;
	page_store.len = QPE_NUM_PAGES(qpb->qp);

	/* Drop the saved headers and register the new guest memory. */
	qp_acquire_queue_mutex(qpb->produce_q);
	qp_reset_saved_headers(qpb);
	rc = qp_host_register_user_memory(&page_store,
					  qpb->produce_q,
					  qpb->consume_q);
	qp_release_queue_mutex(qpb->produce_q);

	if (rc == VMCI_SUCCESS) {
		/* Move state from *_NO_MEM to *_MEM */
		qpb->state++;

		if (qpb->wakeup_cb)
			qpb->wakeup_cb(qpb->client_data);
	}

unlock:
	mutex_unlock(&qp_broker_list.mutex);
	return rc;
}

/*
 * Saves a snapshot of the queue headers for the given QP broker
 * entry. Should be used when guest memory is unmapped.
 * Results:
 * VMCI_SUCCESS on success, appropriate error code if guest memory
 * can't be accessed.
*/
static int qp_save_headers(struct qp_broker_entry *entry)
{
	struct vmci_queue *prod = entry->produce_q;
	struct vmci_queue *cons = entry->consume_q;

	/*
	 * If the headers have already been saved, we don't need to do
	 * it again, and we don't want to map in the headers
	 * unnecessarily.
	 */
	if (prod->saved_header != NULL && cons->saved_header != NULL)
		return VMCI_SUCCESS;

	/* Map the headers first if either of them is currently unmapped. */
	if (prod->q_header == NULL || cons->q_header == NULL) {
		int map_rc = qp_host_map_queues(prod, cons);

		if (map_rc < VMCI_SUCCESS)
			return map_rc;
	}

	/* Copy both live headers into the snapshot storage of the entry... */
	memcpy(&entry->saved_produce_q, prod->q_header,
	       sizeof(entry->saved_produce_q));
	memcpy(&entry->saved_consume_q, cons->q_header,
	       sizeof(entry->saved_consume_q));

	/* ...and point the queues at their snapshots. */
	prod->saved_header = &entry->saved_produce_q;
	cons->saved_header = &entry->saved_consume_q;

	return VMCI_SUCCESS;
}

/*
 * Removes all references to the guest memory of a given queue pair, and
 * will move the queue pair from state *_MEM to *_NO_MEM. It is usually
 * called when a VM is being quiesced where access to guest memory should
 * be avoided.
*/
int vmci_qp_broker_unmap(struct vmci_handle handle,
			 struct vmci_ctx *context,
			 u32 gid)
{
	const u32 context_id = vmci_ctx_get_id(context);
	struct qp_broker_entry *qpb;
	int rc = VMCI_ERROR_NOT_FOUND;
	int save_rc;

	if (vmci_handle_is_invalid(handle) || !context ||
	    context_id == VMCI_INVALID_ID)
		return VMCI_ERROR_INVALID_ARGS;

	mutex_lock(&qp_broker_list.mutex);

	/* Both lookups below fail with the preset VMCI_ERROR_NOT_FOUND. */
	if (!vmci_ctx_qp_exists(context, handle)) {
		pr_devel("Context (ID=0x%x) not attached to queue pair (handle=0x%x:0x%x)\n",
			 context_id, handle.context, handle.resource);
		goto unlock;
	}

	qpb = qp_broker_handle_to_entry(handle);
	if (!qpb) {
		pr_devel("Context (ID=0x%x) reports being attached to queue pair (handle=0x%x:0x%x) that isn't present in broker\n",
			 context_id, handle.context, handle.resource);
		goto unlock;
	}

	/* Only the creator or the attacher may unmap the queue pair. */
	if (context_id != qpb->create_id && context_id != qpb->attach_id) {
		rc = VMCI_ERROR_QUEUEPAIR_NOTATTACHED;
		goto unlock;
	}

	/*
	 * Past this point the call succeeds.  There is nothing to undo for
	 * the host context or when no memory is registered.
	 */
	rc = VMCI_SUCCESS;
	if (context_id == VMCI_HOST_CONTEXT_ID || !QPBROKERSTATE_HAS_MEM(qpb))
		goto unlock;

	qp_acquire_queue_mutex(qpb->produce_q);

	/* A failed header snapshot is logged only; the unmap still proceeds. */
	save_rc = qp_save_headers(qpb);
	if (save_rc < VMCI_SUCCESS)
		pr_warn("Failed to save queue headers for queue pair (handle=0x%x:0x%x,result=%d)\n",
			handle.context, handle.resource, save_rc);

	qp_host_unmap_queues(gid, qpb->produce_q, qpb->consume_q);

	/*
	 * On hosted, when we unmap queue pairs, the VMX will also
	 * unmap the guest memory, so we invalidate the previously
	 * registered memory. If the queue pair is mapped again at a
	 * later point in time, we will need to reregister the user
	 * memory with a possibly new user VA.
	 */
	qp_host_unregister_user_memory(qpb->produce_q, qpb->consume_q);

	/*
	 * Move state from *_MEM to *_NO_MEM.
	 */
	qpb->state--;

	qp_release_queue_mutex(qpb->produce_q);

unlock:
	mutex_unlock(&qp_broker_list.mutex);
	return rc;
}

/*
 * Destroys all guest queue pair endpoints. If active guest queue
 * pairs still exist, hypercalls to attempt detach from these
 * queue pairs will be made.
Any failure to detach is silently * ignored. */ void vmci_qp_guest_endpoints_exit(void) { struct qp_entry *entry; struct qp_guest_endpoint *ep; mutex_lock(&qp_guest_endpoints.mutex); while ((entry = qp_list_get_head(&qp_guest_endpoints))) { ep = (struct qp_guest_endpoint *)entry; /* Don't make a hypercall for local queue_pairs. */ if (!(entry->flags & VMCI_QPFLAG_LOCAL)) qp_detatch_hypercall(entry->handle); /* We cannot fail the exit, so let's reset ref_count. */ entry->ref_count = 0; qp_list_remove_entry(&qp_guest_endpoints, entry); qp_guest_endpoint_destroy(ep); } mutex_unlock(&qp_guest_endpoints.mutex); } /* * Helper routine that will lock the queue pair before subsequent * operations. * Note: Non-blocking on the host side is currently only implemented in ESX. * Since non-blocking isn't yet implemented on the host personality we * have no reason to acquire a spin lock. So to avoid the use of an * unnecessary lock only acquire the mutex if we can block. */ static void qp_lock(const struct vmci_qp *qpair) { qp_acquire_queue_mutex(qpair->produce_q); } /* * Helper routine that unlocks the queue pair after calling * qp_lock. */ static void qp_unlock(const struct vmci_qp *qpair) { qp_release_queue_mutex(qpair->produce_q); } /* * The queue headers may not be mapped at all times. If a queue is * currently not mapped, it will be attempted to do so. */ static int qp_map_queue_headers(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { int result; if (NULL == produce_q->q_header || NULL == consume_q->q_header) { result = qp_host_map_queues(produce_q, consume_q); if (result < VMCI_SUCCESS) return (produce_q->saved_header && consume_q->saved_header) ? VMCI_ERROR_QUEUEPAIR_NOT_READY : VMCI_ERROR_QUEUEPAIR_NOTATTACHED; } return VMCI_SUCCESS; } /* * Helper routine that will retrieve the produce and consume * headers of a given queue pair. 
If the guest memory of the * queue pair is currently not available, the saved queue headers * will be returned, if these are available. */ static int qp_get_queue_headers(const struct vmci_qp *qpair, struct vmci_queue_header **produce_q_header, struct vmci_queue_header **consume_q_header) { int result; result = qp_map_queue_headers(qpair->produce_q, qpair->consume_q); if (result == VMCI_SUCCESS) { *produce_q_header = qpair->produce_q->q_header; *consume_q_header = qpair->consume_q->q_header; } else if (qpair->produce_q->saved_header && qpair->consume_q->saved_header) { *produce_q_header = qpair->produce_q->saved_header; *consume_q_header = qpair->consume_q->saved_header; result = VMCI_SUCCESS; } return result; } /* * Callback from VMCI queue pair broker indicating that a queue * pair that was previously not ready, now either is ready or * gone forever. */ static int qp_wakeup_cb(void *client_data) { struct vmci_qp *qpair = (struct vmci_qp *)client_data; qp_lock(qpair); while (qpair->blocked > 0) { qpair->blocked--; qpair->generation++; wake_up(&qpair->event); } qp_unlock(qpair); return VMCI_SUCCESS; } /* * Makes the calling thread wait for the queue pair to become * ready for host side access. Returns true when thread is * woken up after queue pair state change, false otherwise. */ static bool qp_wait_for_ready_queue(struct vmci_qp *qpair) { unsigned int generation; qpair->blocked++; generation = qpair->generation; qp_unlock(qpair); wait_event(qpair->event, generation != qpair->generation); qp_lock(qpair); return true; } /* * Enqueues a given buffer to the produce queue using the provided * function. As many bytes as possible (space available in the queue) * are enqueued. Assumes the queue->mutex has been acquired. 
Returns
 * VMCI_ERROR_QUEUEPAIR_NOSPACE if no space was available to enqueue
 * data, VMCI_ERROR_INVALID_SIZE, if any queue pointer is outside the
 * queue (as defined by the queue size), VMCI_ERROR_INVALID_ARGS, if
 * an error occurred when accessing the buffer,
 * VMCI_ERROR_QUEUEPAIR_NOTATTACHED, if the queue pair pages aren't
 * available.  Otherwise, the number of bytes written to the queue is
 * returned.  Updates the tail pointer of the produce queue.
 */
static ssize_t qp_enqueue_locked(struct vmci_queue *produce_q,
				 struct vmci_queue *consume_q,
				 const u64 produce_q_size,
				 struct iov_iter *from)
{
	s64 free_space;
	u64 tail;
	size_t buf_size = iov_iter_count(from);
	size_t written;
	ssize_t result;

	/* The queue headers must be mapped before they can be inspected. */
	result = qp_map_queue_headers(produce_q, consume_q);
	if (unlikely(result != VMCI_SUCCESS))
		return result;

	free_space = vmci_q_header_free_space(produce_q->q_header,
					      consume_q->q_header,
					      produce_q_size);
	if (free_space == 0)
		return VMCI_ERROR_QUEUEPAIR_NOSPACE;

	/* A negative value is an error code; pass it through unchanged. */
	if (free_space < VMCI_SUCCESS)
		return (ssize_t) free_space;

	/* Write at most what the caller supplied and what the queue can take. */
	written = (size_t) (free_space > buf_size ? buf_size : free_space);
	tail = vmci_q_header_producer_tail(produce_q->q_header);
	if (likely(tail + written < produce_q_size)) {
		result = qp_memcpy_to_queue_iter(produce_q, tail, from, written);
	} else {
		/* Tail pointer wraps around. */

		/* Bytes that fit between the tail and the end of the queue. */
		const size_t tmp = (size_t) (produce_q_size - tail);

		result = qp_memcpy_to_queue_iter(produce_q, tail, from, tmp);
		if (result >= VMCI_SUCCESS)
			result = qp_memcpy_to_queue_iter(produce_q, 0, from,
						 written - tmp);
	}

	if (result < VMCI_SUCCESS)
		return result;

	/*
	 * This virt_wmb() ensures that data written to the queue
	 * is observable before the new producer_tail is.
	 */
	virt_wmb();

	vmci_q_header_add_producer_tail(produce_q->q_header, written,
					produce_q_size);
	return written;
}

/*
 * Dequeues data (if available) from the given consume queue. Writes data
 * to the user provided buffer using the provided function.
 * Assumes the queue->mutex has been acquired.
* Results:
 * VMCI_ERROR_QUEUEPAIR_NODATA if no data was available to dequeue.
 * VMCI_ERROR_INVALID_SIZE, if any queue pointer is outside the queue
 * (as defined by the queue size).
 * VMCI_ERROR_INVALID_ARGS, if an error occurred when accessing the buffer.
 * Otherwise the number of bytes dequeued is returned.
 * Side effects:
 * Updates the head pointer of the consume queue (only when
 * update_consumer is true).
 */
static ssize_t qp_dequeue_locked(struct vmci_queue *produce_q,
				 struct vmci_queue *consume_q,
				 const u64 consume_q_size,
				 struct iov_iter *to,
				 bool update_consumer)
{
	size_t buf_size = iov_iter_count(to);
	s64 buf_ready;
	u64 head;
	size_t read;
	ssize_t result;

	/* The queue headers must be mapped before they can be inspected. */
	result = qp_map_queue_headers(produce_q, consume_q);
	if (unlikely(result != VMCI_SUCCESS))
		return result;

	buf_ready = vmci_q_header_buf_ready(consume_q->q_header,
					    produce_q->q_header,
					    consume_q_size);
	if (buf_ready == 0)
		return VMCI_ERROR_QUEUEPAIR_NODATA;

	/* A negative value is an error code; pass it through unchanged. */
	if (buf_ready < VMCI_SUCCESS)
		return (ssize_t) buf_ready;

	/*
	 * This virt_rmb() ensures that data from the queue will be read
	 * after we have determined how much is ready to be consumed.
	 */
	virt_rmb();

	/* Read at most what the caller has room for and what is ready. */
	read = (size_t) (buf_ready > buf_size ? buf_size : buf_ready);
	/* The consumer head is read from the produce queue's header. */
	head = vmci_q_header_consumer_head(produce_q->q_header);
	if (likely(head + read < consume_q_size)) {
		result = qp_memcpy_from_queue_iter(to, consume_q, head, read);
	} else {
		/* Head pointer wraps around. */

		/* Bytes available between the head and the end of the queue. */
		const size_t tmp = (size_t) (consume_q_size - head);

		result = qp_memcpy_from_queue_iter(to, consume_q, head, tmp);
		if (result >= VMCI_SUCCESS)
			result = qp_memcpy_from_queue_iter(to, consume_q, 0,
						   read - tmp);

	}

	if (result < VMCI_SUCCESS)
		return result;

	/* Without update_consumer the head pointer is left untouched. */
	if (update_consumer)
		vmci_q_header_add_consumer_head(produce_q->q_header,
						read, consume_q_size);

	return read;
}

/*
 * vmci_qpair_alloc() - Allocates a queue pair.
 * @qpair:      Pointer for the new vmci_qp struct.
 * @handle:     Handle to track the resource.
 * @produce_qsize:      Desired size of the producer queue.
 * @consume_qsize:      Desired size of the consumer queue.
* @peer: ContextID of the peer. * @flags: VMCI flags. * @priv_flags: VMCI priviledge flags. * * This is the client interface for allocating the memory for a * vmci_qp structure and then attaching to the underlying * queue. If an error occurs allocating the memory for the * vmci_qp structure no attempt is made to attach. If an * error occurs attaching, then the structure is freed. */ int vmci_qpair_alloc(struct vmci_qp **qpair, struct vmci_handle *handle, u64 produce_qsize, u64 consume_qsize, u32 peer, u32 flags, u32 priv_flags) { struct vmci_qp *my_qpair; int retval; struct vmci_handle src = VMCI_INVALID_HANDLE; struct vmci_handle dst = vmci_make_handle(peer, VMCI_INVALID_ID); enum vmci_route route; vmci_event_release_cb wakeup_cb; void *client_data; /* * Restrict the size of a queuepair. The device already * enforces a limit on the total amount of memory that can be * allocated to queuepairs for a guest. However, we try to * allocate this memory before we make the queuepair * allocation hypercall. On Linux, we allocate each page * separately, which means rather than fail, the guest will * thrash while it tries to allocate, and will become * increasingly unresponsive to the point where it appears to * be hung. So we place a limit on the size of an individual * queuepair here, and leave the device to enforce the * restriction on total queuepair memory. (Note that this * doesn't prevent all cases; a user with only this much * physical memory could still get into trouble.) The error * used by the device is NO_RESOURCES, so use that here too. */ if (!QP_SIZES_ARE_VALID(produce_qsize, consume_qsize)) return VMCI_ERROR_NO_RESOURCES; retval = vmci_route(&src, &dst, false, &route); if (retval < VMCI_SUCCESS) route = vmci_guest_code_active() ? 
VMCI_ROUTE_AS_GUEST : VMCI_ROUTE_AS_HOST; if (flags & (VMCI_QPFLAG_NONBLOCK | VMCI_QPFLAG_PINNED)) { pr_devel("NONBLOCK OR PINNED set"); return VMCI_ERROR_INVALID_ARGS; } my_qpair = kzalloc_obj(*my_qpair); if (!my_qpair) return VMCI_ERROR_NO_MEM; my_qpair->produce_q_size = produce_qsize; my_qpair->consume_q_size = consume_qsize; my_qpair->peer = peer; my_qpair->flags = flags; my_qpair->priv_flags = priv_flags; wakeup_cb = NULL; client_data = NULL; if (VMCI_ROUTE_AS_HOST == route) { my_qpair->guest_endpoint = false; if (!(flags & VMCI_QPFLAG_LOCAL)) { my_qpair->blocked = 0; my_qpair->generation = 0; init_waitqueue_head(&my_qpair->event); wakeup_cb = qp_wakeup_cb; client_data = (void *)my_qpair; } } else { my_qpair->guest_endpoint = true; } retval = vmci_qp_alloc(handle, &my_qpair->produce_q, my_qpair->produce_q_size, &my_qpair->consume_q, my_qpair->consume_q_size, my_qpair->peer, my_qpair->flags, my_qpair->priv_flags, my_qpair->guest_endpoint, wakeup_cb, client_data); if (retval < VMCI_SUCCESS) { kfree(my_qpair); return retval; } *qpair = my_qpair; my_qpair->handle = *handle; return retval; } EXPORT_SYMBOL_GPL(vmci_qpair_alloc); /* * vmci_qpair_detach() - Detatches the client from a queue pair. * @qpair: Reference of a pointer to the qpair struct. * * This is the client interface for detaching from a VMCIQPair. * Note that this routine will free the memory allocated for the * vmci_qp structure too. */ int vmci_qpair_detach(struct vmci_qp **qpair) { int result; struct vmci_qp *old_qpair; if (!qpair || !(*qpair)) return VMCI_ERROR_INVALID_ARGS; old_qpair = *qpair; result = qp_detatch(old_qpair->handle, old_qpair->guest_endpoint); /* * The guest can fail to detach for a number of reasons, and * if it does so, it will cleanup the entry (if there is one). * The host can fail too, but it won't cleanup the entry * immediately, it will do that later when the context is * freed. 
Either way, we need to release the qpair struct * here; there isn't much the caller can do, and we don't want * to leak. */ memset(old_qpair, 0, sizeof(*old_qpair)); old_qpair->handle = VMCI_INVALID_HANDLE; old_qpair->peer = VMCI_INVALID_ID; kfree(old_qpair); *qpair = NULL; return result; } EXPORT_SYMBOL_GPL(vmci_qpair_detach); /* * vmci_qpair_get_produce_indexes() - Retrieves the indexes of the producer. * @qpair: Pointer to the queue pair struct. * @producer_tail: Reference used for storing producer tail index. * @consumer_head: Reference used for storing the consumer head index. * * This is the client interface for getting the current indexes of the * QPair from the point of the view of the caller as the producer. */ int vmci_qpair_get_produce_indexes(const struct vmci_qp *qpair, u64 *producer_tail, u64 *consumer_head) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; int result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) vmci_q_header_get_pointers(produce_q_header, consume_q_header, producer_tail, consumer_head); qp_unlock(qpair); if (result == VMCI_SUCCESS && ((producer_tail && *producer_tail >= qpair->produce_q_size) || (consumer_head && *consumer_head >= qpair->produce_q_size))) return VMCI_ERROR_INVALID_SIZE; return result; } EXPORT_SYMBOL_GPL(vmci_qpair_get_produce_indexes); /* * vmci_qpair_get_consume_indexes() - Retrieves the indexes of the consumer. * @qpair: Pointer to the queue pair struct. * @consumer_tail: Reference used for storing consumer tail index. * @producer_head: Reference used for storing the producer head index. * * This is the client interface for getting the current indexes of the * QPair from the point of the view of the caller as the consumer. 
*/ int vmci_qpair_get_consume_indexes(const struct vmci_qp *qpair, u64 *consumer_tail, u64 *producer_head) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; int result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) vmci_q_header_get_pointers(consume_q_header, produce_q_header, consumer_tail, producer_head); qp_unlock(qpair); if (result == VMCI_SUCCESS && ((consumer_tail && *consumer_tail >= qpair->consume_q_size) || (producer_head && *producer_head >= qpair->consume_q_size))) return VMCI_ERROR_INVALID_SIZE; return result; } EXPORT_SYMBOL_GPL(vmci_qpair_get_consume_indexes); /* * vmci_qpair_produce_free_space() - Retrieves free space in producer queue. * @qpair: Pointer to the queue pair struct. * * This is the client interface for getting the amount of free * space in the QPair from the point of the view of the caller as * the producer which is the common case. Returns < 0 if err, else * available bytes into which data can be enqueued if > 0. */ s64 vmci_qpair_produce_free_space(const struct vmci_qp *qpair) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; s64 result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) result = vmci_q_header_free_space(produce_q_header, consume_q_header, qpair->produce_q_size); else result = 0; qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_produce_free_space); /* * vmci_qpair_consume_free_space() - Retrieves free space in consumer queue. * @qpair: Pointer to the queue pair struct. * * This is the client interface for getting the amount of free * space in the QPair from the point of the view of the caller as * the consumer which is not the common case. 
Returns < 0 if err, else * available bytes into which data can be enqueued if > 0. */ s64 vmci_qpair_consume_free_space(const struct vmci_qp *qpair) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; s64 result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) result = vmci_q_header_free_space(consume_q_header, produce_q_header, qpair->consume_q_size); else result = 0; qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_consume_free_space); /* * vmci_qpair_produce_buf_ready() - Gets bytes ready to read from * producer queue. * @qpair: Pointer to the queue pair struct. * * This is the client interface for getting the amount of * enqueued data in the QPair from the point of the view of the * caller as the producer which is not the common case. Returns < 0 if err, * else available bytes that may be read. */ s64 vmci_qpair_produce_buf_ready(const struct vmci_qp *qpair) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; s64 result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) result = vmci_q_header_buf_ready(produce_q_header, consume_q_header, qpair->produce_q_size); else result = 0; qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_produce_buf_ready); /* * vmci_qpair_consume_buf_ready() - Gets bytes ready to read from * consumer queue. * @qpair: Pointer to the queue pair struct. * * This is the client interface for getting the amount of * enqueued data in the QPair from the point of the view of the * caller as the consumer which is the normal case. Returns < 0 if err, * else available bytes that may be read. 
*/ s64 vmci_qpair_consume_buf_ready(const struct vmci_qp *qpair) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; s64 result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) result = vmci_q_header_buf_ready(consume_q_header, produce_q_header, qpair->consume_q_size); else result = 0; qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_consume_buf_ready); /* * vmci_qpair_enquev() - Throw data on the queue using iov. * @qpair: Pointer to the queue pair struct. * @iov: Pointer to buffer containing data * @iov_size: Length of buffer. * @buf_type: Buffer type (Unused). * * This is the client interface for enqueueing data into the queue. * This function uses IO vectors to handle the work. Returns number * of bytes enqueued or < 0 on error. */ ssize_t vmci_qpair_enquev(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, int buf_type) { ssize_t result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); do { result = qp_enqueue_locked(qpair->produce_q, qpair->consume_q, qpair->produce_q_size, &msg->msg_iter); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_enquev); /* * vmci_qpair_dequev() - Get data from the queue using iov. * @qpair: Pointer to the queue pair struct. * @iov: Pointer to buffer for the data * @iov_size: Length of buffer. * @buf_type: Buffer type (Unused). * * This is the client interface for dequeueing data from the queue. * This function uses IO vectors to handle the work. Returns number * of bytes dequeued or < 0 on error. 
*/ ssize_t vmci_qpair_dequev(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, int buf_type) { ssize_t result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); do { result = qp_dequeue_locked(qpair->produce_q, qpair->consume_q, qpair->consume_q_size, &msg->msg_iter, true); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_dequev); /* * vmci_qpair_peekv() - Peek at the data in the queue using iov. * @qpair: Pointer to the queue pair struct. * @iov: Pointer to buffer for the data * @iov_size: Length of buffer. * @buf_type: Buffer type (Unused on Linux). * * This is the client interface for peeking into a queue. (I.e., * copy data from the queue without updating the head pointer.) * This function uses IO vectors to handle the work. Returns number * of bytes peeked or < 0 on error. */ ssize_t vmci_qpair_peekv(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, int buf_type) { ssize_t result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); do { result = qp_dequeue_locked(qpair->produce_q, qpair->consume_q, qpair->consume_q_size, &msg->msg_iter, false); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_peekv); |
| 169 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * x86-optimized SHA-512 block function * * Copyright 2025 Google LLC */ #include <asm/fpu/api.h> #include <linux/static_call.h> DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic); #define DEFINE_X86_SHA512_FN(c_fn, asm_fn) \ asmlinkage void asm_fn(struct sha512_block_state *state, \ const u8 *data, size_t nblocks); \ static void c_fn(struct sha512_block_state *state, const u8 *data, \ size_t nblocks) \ { \ if (likely(irq_fpu_usable())) { \ kernel_fpu_begin(); \ asm_fn(state, data, nblocks); \ kernel_fpu_end(); \ } else { \ sha512_blocks_generic(state, data, nblocks); \ } \ } DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3); DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx); DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx); static void sha512_blocks(struct sha512_block_state *state, const u8 *data, size_t nblocks) { static_call(sha512_blocks_x86)(state, data, nblocks); } #define sha512_mod_init_arch sha512_mod_init_arch static void sha512_mod_init_arch(void) { if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) static_call_update(sha512_blocks_x86, sha512_blocks_avx2); else static_call_update(sha512_blocks_x86, sha512_blocks_avx); } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { static_call_update(sha512_blocks_x86, sha512_blocks_ssse3); } } |
| 29 29 29 29 29 29 29 29 28 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 28 29 29 29 29 29 29 20 5 10 8 14 3 5 5 1 1 2 3 5 1 1 1 3 3 1 43 43 14 43 15 15 15 15 14 8 8 8 1 4 5 1 9 5 1 1 1 1 3 1 1 1 2 1 1 3 3 1 1 40 26 3 1 1 1 1 1 11 11 2 2 3 1 2 5 5 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 
460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 
960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 
1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 
1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 | // SPDX-License-Identifier: GPL-2.0+ /* * HID driver for UC-Logic devices not fully compliant with HID standard * - tablet initialization and parameter retrieval * * Copyright (c) 2018 Nikolai Kondrashov */ /* * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. 
*/ #include "hid-uclogic-params.h" #include "hid-uclogic-rdesc.h" #include "usbhid/usbhid.h" #include "hid-ids.h" #include <linux/ctype.h> #include <linux/string.h> #include <linux/unaligned.h> #include <linux/string_choices.h> /** * uclogic_params_pen_inrange_to_str() - Convert a pen in-range reporting type * to a string. * @inrange: The in-range reporting type to convert. * * Return: * * The string representing the type, or * * %NULL if the type is unknown. */ static const char *uclogic_params_pen_inrange_to_str( enum uclogic_params_pen_inrange inrange) { switch (inrange) { case UCLOGIC_PARAMS_PEN_INRANGE_NORMAL: return "normal"; case UCLOGIC_PARAMS_PEN_INRANGE_INVERTED: return "inverted"; case UCLOGIC_PARAMS_PEN_INRANGE_NONE: return "none"; default: return NULL; } } /** * uclogic_params_pen_hid_dbg() - Dump tablet interface pen parameters * @hdev: The HID device the pen parameters describe. * @pen: The pen parameters to dump. * * Dump tablet interface pen parameters with hid_dbg(). The dump is indented * with a tab. */ static void uclogic_params_pen_hid_dbg(const struct hid_device *hdev, const struct uclogic_params_pen *pen) { size_t i; hid_dbg(hdev, "\t.usage_invalid = %s\n", str_true_false(pen->usage_invalid)); hid_dbg(hdev, "\t.desc_ptr = %p\n", pen->desc_ptr); hid_dbg(hdev, "\t.desc_size = %u\n", pen->desc_size); hid_dbg(hdev, "\t.id = %u\n", pen->id); hid_dbg(hdev, "\t.subreport_list = {\n"); for (i = 0; i < ARRAY_SIZE(pen->subreport_list); i++) { hid_dbg(hdev, "\t\t{0x%02hhx, %hhu}%s\n", pen->subreport_list[i].value, pen->subreport_list[i].id, i < (ARRAY_SIZE(pen->subreport_list) - 1) ? 
"," : ""); } hid_dbg(hdev, "\t}\n"); hid_dbg(hdev, "\t.inrange = %s\n", uclogic_params_pen_inrange_to_str(pen->inrange)); hid_dbg(hdev, "\t.fragmented_hires = %s\n", str_true_false(pen->fragmented_hires)); hid_dbg(hdev, "\t.tilt_y_flipped = %s\n", str_true_false(pen->tilt_y_flipped)); } /** * uclogic_params_frame_hid_dbg() - Dump tablet interface frame parameters * @hdev: The HID device the pen parameters describe. * @frame: The frame parameters to dump. * * Dump tablet interface frame parameters with hid_dbg(). The dump is * indented with two tabs. */ static void uclogic_params_frame_hid_dbg( const struct hid_device *hdev, const struct uclogic_params_frame *frame) { hid_dbg(hdev, "\t\t.desc_ptr = %p\n", frame->desc_ptr); hid_dbg(hdev, "\t\t.desc_size = %u\n", frame->desc_size); hid_dbg(hdev, "\t\t.id = %u\n", frame->id); hid_dbg(hdev, "\t\t.suffix = %s\n", frame->suffix); hid_dbg(hdev, "\t\t.re_lsb = %u\n", frame->re_lsb); hid_dbg(hdev, "\t\t.dev_id_byte = %u\n", frame->dev_id_byte); hid_dbg(hdev, "\t\t.touch_byte = %u\n", frame->touch_byte); hid_dbg(hdev, "\t\t.touch_max = %hhd\n", frame->touch_max); hid_dbg(hdev, "\t\t.touch_flip_at = %hhd\n", frame->touch_flip_at); hid_dbg(hdev, "\t\t.bitmap_dial_byte = %u\n", frame->bitmap_dial_byte); hid_dbg(hdev, "\t\t.bitmap_second_dial_destination_byte = %u\n", frame->bitmap_second_dial_destination_byte); } /** * uclogic_params_hid_dbg() - Dump tablet interface parameters * @hdev: The HID device the parameters describe. * @params: The parameters to dump. * * Dump tablet interface parameters with hid_dbg(). 
*/
void uclogic_params_hid_dbg(const struct hid_device *hdev,
				const struct uclogic_params *params)
{
	size_t i;

	/* Top-level fields */
	hid_dbg(hdev, ".invalid = %s\n", str_true_false(params->invalid));
	hid_dbg(hdev, ".desc_ptr = %p\n", params->desc_ptr);
	hid_dbg(hdev, ".desc_size = %u\n", params->desc_size);

	/*
	 * Pen parameters. The argument is the address of the embedded pen
	 * structure ("&params->pen"); the address-of operator and the first
	 * letters of the name had been mangled into a pilcrow sign.
	 */
	hid_dbg(hdev, ".pen = {\n");
	uclogic_params_pen_hid_dbg(hdev, &params->pen);
	hid_dbg(hdev, "\t}\n");

	/* Frame parameters, one brace-enclosed entry per frame_list slot */
	hid_dbg(hdev, ".frame_list = {\n");
	for (i = 0; i < ARRAY_SIZE(params->frame_list); i++) {
		hid_dbg(hdev, "\t{\n");
		uclogic_params_frame_hid_dbg(hdev, &params->frame_list[i]);
		/* Every entry but the last is followed by a comma. */
		hid_dbg(hdev, "\t}%s\n",
			i < (ARRAY_SIZE(params->frame_list) - 1) ? "," : "");
	}
	hid_dbg(hdev, "}\n");
}

/**
 * uclogic_params_get_str_desc - retrieve a string descriptor from a HID
 * device interface, putting it into a kmalloc-allocated buffer as is, without
 * character encoding conversion.
 *
 * @pbuf:	Location for the kmalloc-allocated buffer pointer containing
 *		the retrieved descriptor. Not modified in case of error.
 *		Can be NULL to have retrieved descriptor discarded.
 * @hdev:	The HID device of the tablet interface to retrieve the string
 *		descriptor from. Cannot be NULL.
 * @idx:	Index of the string descriptor to request from the device.
 * @len:	Length of the buffer to allocate and the data to retrieve.
 *
 * Returns:
 *	number of bytes retrieved (<= len),
 *	-EPIPE, if the descriptor was not found, or
 *	another negative errno code in case of other error.
*/ static int uclogic_params_get_str_desc(__u8 **pbuf, struct hid_device *hdev, __u8 idx, size_t len) { int rc; struct usb_device *udev; __u8 *buf = NULL; /* Check arguments */ if (hdev == NULL) { rc = -EINVAL; goto cleanup; } udev = hid_to_usb_dev(hdev); buf = kmalloc(len, GFP_KERNEL); if (buf == NULL) { rc = -ENOMEM; goto cleanup; } rc = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), USB_REQ_GET_DESCRIPTOR, USB_DIR_IN, (USB_DT_STRING << 8) + idx, 0x0409, buf, len, USB_CTRL_GET_TIMEOUT); if (rc == -EPIPE) { hid_dbg(hdev, "string descriptor #%hhu not found\n", idx); goto cleanup; } else if (rc < 0) { hid_err(hdev, "failed retrieving string descriptor #%u: %d\n", idx, rc); goto cleanup; } if (pbuf != NULL) { *pbuf = buf; buf = NULL; } cleanup: kfree(buf); return rc; } /** * uclogic_params_pen_cleanup - free resources used by struct * uclogic_params_pen (tablet interface's pen input parameters). * Can be called repeatedly. * * @pen: Pen input parameters to cleanup. Cannot be NULL. */ static void uclogic_params_pen_cleanup(struct uclogic_params_pen *pen) { kfree(pen->desc_ptr); memset(pen, 0, sizeof(*pen)); } /** * uclogic_params_pen_init_v1() - initialize tablet interface pen * input and retrieve its parameters from the device, using v1 protocol. * * @pen: Pointer to the pen parameters to initialize (to be * cleaned up with uclogic_params_pen_cleanup()). Not modified in * case of error, or if parameters are not found. Cannot be NULL. * @pfound: Location for a flag which is set to true if the parameters * were found, and to false if not (e.g. device was * incompatible). Not modified in case of error. Cannot be NULL. * @hdev: The HID device of the tablet interface to initialize and get * parameters from. Cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. 
*/
static int uclogic_params_pen_init_v1(struct uclogic_params_pen *pen,
				      bool *pfound,
				      struct hid_device *hdev)
{
	int rc;
	bool found = false;
	/* Buffer for (part of) the string descriptor */
	__u8 *buf = NULL;
	/* Minimum descriptor length required, maximum seen so far is 18 */
	const int len = 12;
	s32 resolution;
	/* Pen report descriptor template parameters */
	s32 desc_params[UCLOGIC_RDESC_PH_ID_NUM];
	__u8 *desc_ptr = NULL;

	/* Check arguments */
	if (pen == NULL || pfound == NULL || hdev == NULL) {
		rc = -EINVAL;
		goto cleanup;
	}

	/*
	 * Read string descriptor containing pen input parameters.
	 * The specific string descriptor and data were discovered by sniffing
	 * the Windows driver traffic.
	 * NOTE: This enables fully-functional tablet mode.
	 */
	rc = uclogic_params_get_str_desc(&buf, hdev, 100, len);
	if (rc == -EPIPE) {
		/* Not an error: report "not found" through *pfound */
		hid_dbg(hdev,
			"string descriptor with pen parameters not found, assuming not compatible\n");
		goto finish;
	} else if (rc < 0) {
		hid_err(hdev, "failed retrieving pen parameters: %d\n", rc);
		goto cleanup;
	} else if (rc != len) {
		/* Short read: also treated as "not found", not as an error */
		hid_dbg(hdev,
			"string descriptor with pen parameters has invalid length (got %d, expected %d), assuming not compatible\n",
			rc, len);
		goto finish;
	}

	/*
	 * Fill report descriptor parameters from the string descriptor:
	 * 16-bit little-endian values at byte offsets 2 (X), 4 (Y),
	 * 8 (pressure) and 10 (resolution).
	 */
	desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_LM] =
		get_unaligned_le16(buf + 2);
	desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_LM] =
		get_unaligned_le16(buf + 4);
	desc_params[UCLOGIC_RDESC_PEN_PH_ID_PRESSURE_LM] =
		get_unaligned_le16(buf + 8);
	resolution = get_unaligned_le16(buf + 10);
	if (resolution == 0) {
		/* Avoid dividing by zero; leave both *_PM values at zero */
		desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_PM] = 0;
		desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_PM] = 0;
	} else {
		desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_PM] =
			desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_LM] * 1000 /
			resolution;
		desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_PM] =
			desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_LM] * 1000 /
			resolution;
	}
	/* The raw descriptor is no longer needed past this point */
	kfree(buf);
	buf = NULL;

	/*
	 * Generate pen report descriptor
	 */
	desc_ptr = uclogic_rdesc_template_apply(
				uclogic_rdesc_v1_pen_template_arr,
				uclogic_rdesc_v1_pen_template_size,
				desc_params, ARRAY_SIZE(desc_params));
	if (desc_ptr == NULL) {
		rc = -ENOMEM;
		goto cleanup;
	}

	/*
	 * Fill-in the parameters
	 */
	memset(pen, 0, sizeof(*pen));
	/* Ownership of the descriptor moves to @pen; drop the local alias */
	pen->desc_ptr = desc_ptr;
	desc_ptr = NULL;
	pen->desc_size = uclogic_rdesc_v1_pen_template_size;
	pen->id = UCLOGIC_RDESC_V1_PEN_ID;
	pen->inrange = UCLOGIC_PARAMS_PEN_INRANGE_INVERTED;
	found = true;
finish:
	/* Reached both on success and on "not found"; neither is an error */
	*pfound = found;
	rc = 0;
cleanup:
	/* Both pointers are NULL unless still owned here */
	kfree(desc_ptr);
	kfree(buf);
	return rc;
}

/**
 * uclogic_params_get_le24() - get a 24-bit little-endian number from a
 * buffer.
 *
 * @p:	The pointer to the number buffer. Three bytes are read from it.
 *
 * Returns:
 *	The retrieved number
 */
static s32 uclogic_params_get_le24(const void *p)
{
	const __u8 *b = p;
	/* b[0] is the least significant byte, b[2] the most significant */
	return b[0] | (b[1] << 8UL) | (b[2] << 16UL);
}

/**
 * uclogic_params_pen_init_v2() - initialize tablet interface pen
 * input and retrieve its parameters from the device, using v2 protocol.
 *
 * @pen:		Pointer to the pen parameters to initialize (to be
 *			cleaned up with uclogic_params_pen_cleanup()). Not
 *			modified in case of error, or if parameters are not
 *			found. Cannot be NULL.
 * @pfound:		Location for a flag which is set to true if the
 *			parameters were found, and to false if not (e.g.
 *			device was incompatible). Not modified in case of
 *			error. Cannot be NULL.
 * @pparams_ptr:	Location for a kmalloc'ed pointer to the retrieved raw
 *			parameters, which could be used to identify the tablet
 *			to some extent. Should be freed with kfree after use.
 *			NULL, if not needed. Not modified in case of error.
 *			Only set if *pfound is set to true.
 * @pparams_len:	Location for the length of the retrieved raw
 *			parameters. NULL, if not needed. Not modified in case
 *			of error. Only set if *pfound is set to true.
 * @hdev:		The HID device of the tablet interface to initialize
 *			and get parameters from. Cannot be NULL.
 *
 * Returns:
 *	Zero, if successful. A negative errno code on error.
*/
static int uclogic_params_pen_init_v2(struct uclogic_params_pen *pen,
					bool *pfound,
					__u8 **pparams_ptr,
					size_t *pparams_len,
					struct hid_device *hdev)
{
	int rc;
	bool found = false;
	/* Buffer for (part of) the parameter string descriptor */
	__u8 *buf = NULL;
	/* Parameter string descriptor required length */
	const int params_len_min = 18;
	/* Parameter string descriptor accepted length */
	const int params_len_max = 32;
	/* Parameter string descriptor received length */
	int params_len;
	size_t i;
	s32 resolution;
	/* Pen report descriptor template parameters */
	s32 desc_params[UCLOGIC_RDESC_PH_ID_NUM];
	__u8 *desc_ptr = NULL;

	/* Check arguments */
	if (pen == NULL || pfound == NULL || hdev == NULL) {
		rc = -EINVAL;
		goto cleanup;
	}

	/*
	 * Read string descriptor containing pen input parameters.
	 * The specific string descriptor and data were discovered by sniffing
	 * the Windows driver traffic.
	 * NOTE: This enables fully-functional tablet mode.
	 */
	rc = uclogic_params_get_str_desc(&buf, hdev, 200, params_len_max);
	if (rc == -EPIPE) {
		/* Not an error: report "not found" through *pfound */
		hid_dbg(hdev,
			"string descriptor with pen parameters not found, assuming not compatible\n");
		goto finish;
	} else if (rc < 0) {
		hid_err(hdev, "failed retrieving pen parameters: %d\n", rc);
		goto cleanup;
	} else if (rc < params_len_min) {
		/* Too short: also treated as "not found", not as an error */
		hid_dbg(hdev,
			"string descriptor with pen parameters is too short (got %d, expected at least %d), assuming not compatible\n",
			rc, params_len_min);
		goto finish;
	}

	params_len = rc;

	/*
	 * Check it's not just a catch-all UTF-16LE-encoded ASCII
	 * string (such as the model name) some tablets put into all
	 * unknown string descriptors.
	 *
	 * The scan starts at offset 2 and walks two bytes at a time while
	 * each pair is a printable ASCII byte followed by zero; reaching
	 * the end means the payload was nothing but such pairs.
	 *
	 * NOTE(review): when params_len is odd, the last step reads
	 * buf[i + 1] one byte past the received data (still inside the
	 * params_len_max bytes requested above) -- confirm this is benign.
	 */
	for (i = 2;
	     i < params_len &&
		(buf[i] >= 0x20 && buf[i] < 0x7f && buf[i + 1] == 0);
	     i += 2);
	if (i >= params_len) {
		hid_dbg(hdev,
			"string descriptor with pen parameters seems to contain only text, assuming not compatible\n");
		goto finish;
	}

	/*
	 * Fill report descriptor parameters from the string descriptor:
	 * 24-bit little-endian X and Y at byte offsets 2 and 5, 16-bit
	 * little-endian pressure and resolution at offsets 8 and 10.
	 */
	desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_LM] =
		uclogic_params_get_le24(buf + 2);
	desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_LM] =
		uclogic_params_get_le24(buf + 5);
	desc_params[UCLOGIC_RDESC_PEN_PH_ID_PRESSURE_LM] =
		get_unaligned_le16(buf + 8);
	resolution = get_unaligned_le16(buf + 10);
	if (resolution == 0) {
		/* Avoid dividing by zero; leave both *_PM values at zero */
		desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_PM] = 0;
		desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_PM] = 0;
	} else {
		desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_PM] =
			desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_LM] * 1000 /
			resolution;
		desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_PM] =
			desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_LM] * 1000 /
			resolution;
	}

	/*
	 * Generate pen report descriptor
	 */
	desc_ptr = uclogic_rdesc_template_apply(
				uclogic_rdesc_v2_pen_template_arr,
				uclogic_rdesc_v2_pen_template_size,
				desc_params, ARRAY_SIZE(desc_params));
	if (desc_ptr == NULL) {
		rc = -ENOMEM;
		goto cleanup;
	}

	/*
	 * Fill-in the parameters
	 */
	memset(pen, 0, sizeof(*pen));
	/* Ownership of the descriptor moves to @pen; drop the local alias */
	pen->desc_ptr = desc_ptr;
	desc_ptr = NULL;
	pen->desc_size = uclogic_rdesc_v2_pen_template_size;
	pen->id = UCLOGIC_RDESC_V2_PEN_ID;
	pen->inrange = UCLOGIC_PARAMS_PEN_INRANGE_NONE;
	pen->fragmented_hires = true;
	pen->tilt_y_flipped = true;
	found = true;
	/* Hand the raw parameters to the caller only if they were asked for */
	if (pparams_ptr != NULL) {
		*pparams_ptr = buf;
		buf = NULL;
	}
	if (pparams_len != NULL)
		*pparams_len = params_len;

finish:
	/* Reached both on success and on "not found"; neither is an error */
	*pfound = found;
	rc = 0;
cleanup:
	/* Both pointers are NULL unless still owned here */
	kfree(desc_ptr);
	kfree(buf);
	return rc;
}

/**
 * uclogic_params_frame_cleanup - free resources used by struct
 * uclogic_params_frame (tablet interface's frame controls input parameters).
 * Can be called repeatedly.
 *
 * @frame:	Frame controls input parameters to cleanup. Cannot be NULL.
*/ static void uclogic_params_frame_cleanup(struct uclogic_params_frame *frame) { kfree(frame->desc_ptr); memset(frame, 0, sizeof(*frame)); } /** * uclogic_params_frame_init_with_desc() - initialize tablet's frame control * parameters with a static report descriptor. * * @frame: Pointer to the frame parameters to initialize (to be cleaned * up with uclogic_params_frame_cleanup()). Not modified in case * of error. Cannot be NULL. * @desc_ptr: Report descriptor pointer. Can be NULL, if desc_size is zero. * @desc_size: Report descriptor size. * @id: Report ID used for frame reports, if they should be tweaked, * zero if not. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_frame_init_with_desc( struct uclogic_params_frame *frame, const __u8 *desc_ptr, size_t desc_size, unsigned int id) { __u8 *copy_desc_ptr; if (frame == NULL || (desc_ptr == NULL && desc_size != 0)) return -EINVAL; copy_desc_ptr = kmemdup(desc_ptr, desc_size, GFP_KERNEL); if (copy_desc_ptr == NULL) return -ENOMEM; memset(frame, 0, sizeof(*frame)); frame->desc_ptr = copy_desc_ptr; frame->desc_size = desc_size; frame->id = id; return 0; } /** * uclogic_params_frame_init_v1() - initialize v1 tablet interface frame * controls. * * @frame: Pointer to the frame parameters to initialize (to be cleaned * up with uclogic_params_frame_cleanup()). Not modified in case * of error, or if parameters are not found. Cannot be NULL. * @pfound: Location for a flag which is set to true if the parameters * were found, and to false if not (e.g. device was * incompatible). Not modified in case of error. Cannot be NULL. * @hdev: The HID device of the tablet interface to initialize and get * parameters from. Cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. 
*/ static int uclogic_params_frame_init_v1(struct uclogic_params_frame *frame, bool *pfound, struct hid_device *hdev) { int rc; bool found = false; struct usb_device *usb_dev; char *str_buf = NULL; const size_t str_len = 16; /* Check arguments */ if (frame == NULL || pfound == NULL || hdev == NULL) { rc = -EINVAL; goto cleanup; } usb_dev = hid_to_usb_dev(hdev); /* * Enable generic button mode */ str_buf = kzalloc(str_len, GFP_KERNEL); if (str_buf == NULL) { rc = -ENOMEM; goto cleanup; } rc = usb_string(usb_dev, 123, str_buf, str_len); if (rc == -EPIPE) { hid_dbg(hdev, "generic button -enabling string descriptor not found\n"); } else if (rc < 0) { goto cleanup; } else if (strncmp(str_buf, "HK On", rc) != 0) { hid_dbg(hdev, "invalid response to enabling generic buttons: \"%s\"\n", str_buf); } else { hid_dbg(hdev, "generic buttons enabled\n"); rc = uclogic_params_frame_init_with_desc( frame, uclogic_rdesc_v1_frame_arr, uclogic_rdesc_v1_frame_size, UCLOGIC_RDESC_V1_FRAME_ID); if (rc != 0) goto cleanup; found = true; } *pfound = found; rc = 0; cleanup: kfree(str_buf); return rc; } /** * uclogic_params_cleanup_event_hooks - free resources used by the list of raw * event hooks. * Can be called repeatedly. * * @params: Input parameters to cleanup. Cannot be NULL. */ static void uclogic_params_cleanup_event_hooks(struct uclogic_params *params) { struct uclogic_raw_event_hook *curr, *n; if (!params || !params->event_hooks) return; list_for_each_entry_safe(curr, n, ¶ms->event_hooks->list, list) { cancel_work_sync(&curr->work); list_del(&curr->list); kfree(curr->event); kfree(curr); } kfree(params->event_hooks); params->event_hooks = NULL; } /** * uclogic_params_cleanup - free resources used by struct uclogic_params * (tablet interface's parameters). * Can be called repeatedly. * * @params: Input parameters to cleanup. Cannot be NULL. 
*/
void uclogic_params_cleanup(struct uclogic_params *params)
{
	/* Parameters marked invalid are left untouched */
	if (!params->invalid) {
		size_t i;

		kfree(params->desc_ptr);
		uclogic_params_pen_cleanup(&params->pen);
		for (i = 0; i < ARRAY_SIZE(params->frame_list); i++)
			uclogic_params_frame_cleanup(&params->frame_list[i]);

		uclogic_params_cleanup_event_hooks(params);
		/* Zeroing everything is what makes repeated calls safe */
		memset(params, 0, sizeof(*params));
	}
}

/**
 * uclogic_params_get_desc() - Get a replacement report descriptor for a
 *                             tablet's interface.
 *
 * @params:	The parameters of a tablet interface to get report
 *		descriptor for. Cannot be NULL.
 * @pdesc:	Location for the resulting, kmalloc-allocated report
 *		descriptor pointer, or for NULL, if there's no replacement
 *		report descriptor. Not modified in case of error. Cannot be
 *		NULL.
 * @psize:	Location for the resulting report descriptor size, not set if
 *		there's no replacement report descriptor. Not modified in case
 *		of error. Cannot be NULL.
 *
 * Returns:
 *	Zero, if successful.
 *	-EINVAL, if invalid arguments are supplied.
 *	-ENOMEM, if failed to allocate memory.
*/ int uclogic_params_get_desc(const struct uclogic_params *params, const __u8 **pdesc, unsigned int *psize) { int rc = -ENOMEM; bool present = false; unsigned int size = 0; __u8 *desc = NULL; size_t i; /* Check arguments */ if (params == NULL || pdesc == NULL || psize == NULL) return -EINVAL; /* Concatenate descriptors */ #define ADD_DESC(_desc_ptr, _desc_size) \ do { \ unsigned int new_size; \ __u8 *new_desc; \ if ((_desc_ptr) == NULL) { \ break; \ } \ new_size = size + (_desc_size); \ new_desc = krealloc(desc, new_size, GFP_KERNEL); \ if (new_desc == NULL) { \ goto cleanup; \ } \ memcpy(new_desc + size, (_desc_ptr), (_desc_size)); \ desc = new_desc; \ size = new_size; \ present = true; \ } while (0) ADD_DESC(params->desc_ptr, params->desc_size); ADD_DESC(params->pen.desc_ptr, params->pen.desc_size); for (i = 0; i < ARRAY_SIZE(params->frame_list); i++) { ADD_DESC(params->frame_list[i].desc_ptr, params->frame_list[i].desc_size); } #undef ADD_DESC if (present) { *pdesc = desc; *psize = size; desc = NULL; } rc = 0; cleanup: kfree(desc); return rc; } /** * uclogic_params_init_invalid() - initialize tablet interface parameters, * specifying the interface is invalid. * * @params: Parameters to initialize (to be cleaned with * uclogic_params_cleanup()). Cannot be NULL. */ static void uclogic_params_init_invalid(struct uclogic_params *params) { params->invalid = true; } /** * uclogic_params_init_with_opt_desc() - initialize tablet interface * parameters with an optional replacement report descriptor. Only modify * report descriptor, if the original report descriptor matches the expected * size. * * @params: Parameters to initialize (to be cleaned with * uclogic_params_cleanup()). Not modified in case of * error. Cannot be NULL. * @hdev: The HID device of the tablet interface create the * parameters for. Cannot be NULL. * @orig_desc_size: Expected size of the original report descriptor to * be replaced. * @desc_ptr: Pointer to the replacement report descriptor. 
*			Can be NULL, if desc_size is zero.
 * @desc_size:		Size of the replacement report descriptor.
 *
 * Returns:
 *	Zero, if successful. -EINVAL if an invalid argument was passed.
 *	-ENOMEM, if failed to allocate memory.
 */
static int
uclogic_params_init_with_opt_desc(struct uclogic_params *params,
				  struct hid_device *hdev,
				  unsigned int orig_desc_size,
				  const __u8 *desc_ptr,
				  unsigned int desc_size)
{
	/* Stays NULL/zero when the original descriptor is preserved */
	__u8 *replacement = NULL;
	unsigned int replacement_size = 0;

	/* Check arguments */
	if (!params || !hdev || (!desc_ptr && desc_size != 0))
		return -EINVAL;

	/* Replace report descriptor, if it matches */
	if (hdev->dev_rsize == orig_desc_size) {
		hid_dbg(hdev,
			"device report descriptor matches the expected size, replacing\n");
		replacement = kmemdup(desc_ptr, desc_size, GFP_KERNEL);
		if (!replacement)
			return -ENOMEM;
		replacement_size = desc_size;
	} else {
		hid_dbg(hdev,
			"device report descriptor doesn't match the expected size (%u != %u), preserving\n",
			hdev->dev_rsize, orig_desc_size);
	}

	/* Output parameters; the copy is now owned by params */
	memset(params, 0, sizeof(*params));
	params->desc_size = replacement_size;
	params->desc_ptr = replacement;
	return 0;
}

/**
 * uclogic_params_huion_init() - initialize a Huion tablet interface and discover
 * its parameters.
 *
 * @params:	Parameters to fill in (to be cleaned with
 *		uclogic_params_cleanup()). Not modified in case of error.
 *		Cannot be NULL.
 * @hdev:	The HID device of the tablet interface to initialize and get
 *		parameters from. Cannot be NULL.
 *
 * Returns:
 *	Zero, if successful. A negative errno code on error.
*/ static int uclogic_params_huion_init(struct uclogic_params *params, struct hid_device *hdev) { int rc; struct usb_device *udev; struct usb_interface *iface; __u8 bInterfaceNumber; bool found; /* The resulting parameters (noop) */ struct uclogic_params p = {0, }; static const char transition_ver[] = "HUION_T153_160607"; char *ver_ptr = NULL; const size_t ver_len = sizeof(transition_ver) + 1; __u8 *params_ptr = NULL; size_t params_len = 0; /* Parameters string descriptor of a model with touch ring (HS610) */ static const __u8 touch_ring_model_params_buf[] = { 0x13, 0x03, 0x70, 0xC6, 0x00, 0x06, 0x7C, 0x00, 0xFF, 0x1F, 0xD8, 0x13, 0x03, 0x0D, 0x10, 0x01, 0x04, 0x3C, 0x3E }; /* Check arguments */ if (params == NULL || hdev == NULL) { rc = -EINVAL; goto cleanup; } udev = hid_to_usb_dev(hdev); iface = to_usb_interface(hdev->dev.parent); bInterfaceNumber = iface->cur_altsetting->desc.bInterfaceNumber; /* If it's a custom keyboard interface */ if (bInterfaceNumber == 1) { /* Keep everything intact, but mark pen usage invalid */ p.pen.usage_invalid = true; goto output; /* Else, if it's not a pen interface */ } else if (bInterfaceNumber != 0) { uclogic_params_init_invalid(&p); goto output; } /* Try to get firmware version */ ver_ptr = kzalloc(ver_len, GFP_KERNEL); if (ver_ptr == NULL) { rc = -ENOMEM; goto cleanup; } rc = usb_string(udev, 201, ver_ptr, ver_len); if (rc == -EPIPE) { *ver_ptr = '\0'; } else if (rc < 0) { hid_err(hdev, "failed retrieving Huion firmware version: %d\n", rc); goto cleanup; } /* The firmware is used in userspace as unique identifier */ strscpy(hdev->uniq, ver_ptr, sizeof(hdev->uniq)); /* If this is a transition firmware */ if (strcmp(ver_ptr, transition_ver) == 0) { hid_dbg(hdev, "transition firmware detected, not probing pen v2 parameters\n"); } else { /* Try to probe v2 pen parameters */ rc = uclogic_params_pen_init_v2(&p.pen, &found, ¶ms_ptr, ¶ms_len, hdev); if (rc != 0) { hid_err(hdev, "failed probing pen v2 parameters: %d\n", rc); goto 
cleanup; } else if (found) { hid_dbg(hdev, "pen v2 parameters found\n"); /* Create v2 frame button parameters */ rc = uclogic_params_frame_init_with_desc( &p.frame_list[0], uclogic_rdesc_v2_frame_buttons_arr, uclogic_rdesc_v2_frame_buttons_size, UCLOGIC_RDESC_V2_FRAME_BUTTONS_ID); if (rc != 0) { hid_err(hdev, "failed creating v2 frame button parameters: %d\n", rc); goto cleanup; } /* Link from pen sub-report */ p.pen.subreport_list[0].value = 0xe0; p.pen.subreport_list[0].id = UCLOGIC_RDESC_V2_FRAME_BUTTONS_ID; /* If this is the model with touch ring */ if (params_ptr != NULL && params_len == sizeof(touch_ring_model_params_buf) && memcmp(params_ptr, touch_ring_model_params_buf, params_len) == 0) { /* Create touch ring parameters */ rc = uclogic_params_frame_init_with_desc( &p.frame_list[1], uclogic_rdesc_v2_frame_touch_ring_arr, uclogic_rdesc_v2_frame_touch_ring_size, UCLOGIC_RDESC_V2_FRAME_TOUCH_ID); if (rc != 0) { hid_err(hdev, "failed creating v2 frame touch ring parameters: %d\n", rc); goto cleanup; } p.frame_list[1].suffix = "Touch Ring"; p.frame_list[1].dev_id_byte = UCLOGIC_RDESC_V2_FRAME_TOUCH_DEV_ID_BYTE; p.frame_list[1].touch_byte = 5; p.frame_list[1].touch_max = 12; p.frame_list[1].touch_flip_at = 7; } else { /* Create touch strip parameters */ rc = uclogic_params_frame_init_with_desc( &p.frame_list[1], uclogic_rdesc_v2_frame_touch_strip_arr, uclogic_rdesc_v2_frame_touch_strip_size, UCLOGIC_RDESC_V2_FRAME_TOUCH_ID); if (rc != 0) { hid_err(hdev, "failed creating v2 frame touch strip parameters: %d\n", rc); goto cleanup; } p.frame_list[1].suffix = "Touch Strip"; p.frame_list[1].dev_id_byte = UCLOGIC_RDESC_V2_FRAME_TOUCH_DEV_ID_BYTE; p.frame_list[1].touch_byte = 5; p.frame_list[1].touch_max = 8; } /* Link from pen sub-report */ p.pen.subreport_list[1].value = 0xf0; p.pen.subreport_list[1].id = UCLOGIC_RDESC_V2_FRAME_TOUCH_ID; /* Create v2 frame dial parameters */ rc = uclogic_params_frame_init_with_desc( &p.frame_list[2], uclogic_rdesc_v2_frame_dial_arr, 
uclogic_rdesc_v2_frame_dial_size, UCLOGIC_RDESC_V2_FRAME_DIAL_ID); if (rc != 0) { hid_err(hdev, "failed creating v2 frame dial parameters: %d\n", rc); goto cleanup; } p.frame_list[2].suffix = "Dial"; p.frame_list[2].dev_id_byte = UCLOGIC_RDESC_V2_FRAME_DIAL_DEV_ID_BYTE; p.frame_list[2].bitmap_dial_byte = 5; /* Link from pen sub-report */ p.pen.subreport_list[2].value = 0xf1; p.pen.subreport_list[2].id = UCLOGIC_RDESC_V2_FRAME_DIAL_ID; goto output; } hid_dbg(hdev, "pen v2 parameters not found\n"); } /* Try to probe v1 pen parameters */ rc = uclogic_params_pen_init_v1(&p.pen, &found, hdev); if (rc != 0) { hid_err(hdev, "failed probing pen v1 parameters: %d\n", rc); goto cleanup; } else if (found) { hid_dbg(hdev, "pen v1 parameters found\n"); /* Try to probe v1 frame */ rc = uclogic_params_frame_init_v1(&p.frame_list[0], &found, hdev); if (rc != 0) { hid_err(hdev, "v1 frame probing failed: %d\n", rc); goto cleanup; } hid_dbg(hdev, "frame v1 parameters%s found\n", (found ? "" : " not")); if (found) { /* Link frame button subreports from pen reports */ p.pen.subreport_list[0].value = 0xe0; p.pen.subreport_list[0].id = UCLOGIC_RDESC_V1_FRAME_ID; } goto output; } hid_dbg(hdev, "pen v1 parameters not found\n"); uclogic_params_init_invalid(&p); output: /* Output parameters */ memcpy(params, &p, sizeof(*params)); memset(&p, 0, sizeof(p)); rc = 0; cleanup: kfree(params_ptr); kfree(ver_ptr); uclogic_params_cleanup(&p); return rc; } /** * uclogic_probe_interface() - some tablets, like the Parblo A610 PLUS V2 or * the XP-PEN Deco Mini 7, need to be initialized by sending them magic data. * * @hdev: The HID device of the tablet interface to initialize and get * parameters from. Cannot be NULL. * @magic_arr: The magic data that should be sent to probe the interface. * Cannot be NULL. * @magic_size: Size of the magic data. * @endpoint: Endpoint where the magic data should be sent. * * Returns: * Zero, if successful. A negative errno code on error. 
*/
static int uclogic_probe_interface(struct hid_device *hdev,
				   const u8 *magic_arr,
				   size_t magic_size, int endpoint)
{
	struct usb_device *udev;
	unsigned int pipe = 0;
	/* Number of bytes the device actually accepted */
	int sent;
	/* Heap copy of the magic data, handed to the USB transfer */
	u8 *buf = NULL;
	int rc = 0;

	if (!hdev || !magic_arr) {
		rc = -EINVAL;
		goto cleanup;
	}

	buf = kmemdup(magic_arr, magic_size, GFP_KERNEL);
	if (!buf) {
		rc = -ENOMEM;
		goto cleanup;
	}

	udev = hid_to_usb_dev(hdev);
	pipe = usb_sndintpipe(udev, endpoint);

	rc = usb_interrupt_msg(udev, pipe, buf, magic_size, &sent, 1000);
	/*
	 * A transfer that completed without error but sent only part of the
	 * data is a failure too: report it as an I/O error. "sent" is only
	 * examined once the transfer itself has succeeded.
	 */
	if (rc == 0 && (size_t)sent != magic_size)
		rc = -EIO;
	if (rc) {
		/* Propagate the real errno rather than a bare -1 */
		hid_err(hdev, "Interface probing failed: %d\n", rc);
		goto cleanup;
	}

	rc = 0;
cleanup:
	kfree(buf);
	return rc;
}

/**
 * uclogic_params_parse_ugee_v2_desc - parse the string descriptor containing
 * pen and frame parameters returned by UGEE v2 devices.
 *
 * @str_desc:		String descriptor, cannot be NULL.
 * @str_desc_size:	Size of the string descriptor.
 * @desc_params:	Output description params list.
 * @desc_params_size:	Size of the output description params list.
 * @frame_type:		Output frame type.
 *
 * Returns:
 *	Zero, if successful. A negative errno code on error.
*/
static int uclogic_params_parse_ugee_v2_desc(const __u8 *str_desc,
					     size_t str_desc_size,
					     s32 *desc_params,
					     size_t desc_params_size,
					     enum uclogic_params_frame_type *frame_type)
{
	/* Minimum descriptor length required, maximum seen so far is 14 */
	const int min_str_desc_size = 12;
	s32 x_lm, y_lm;
	s32 x_pm, y_pm;
	s32 pressure_lm;
	s32 num_buttons;
	s32 resolution;

	if (!str_desc)
		return -EINVAL;
	if (str_desc_size < min_str_desc_size)
		return -EINVAL;
	if (desc_params_size != UCLOGIC_RDESC_PH_ID_NUM)
		return -EINVAL;

	/* X logical maximum: 16 bits, extended by byte 12 when present */
	x_lm = get_unaligned_le16(str_desc + 2);
	if (str_desc_size > 12)
		x_lm += (u8)str_desc[12] << 16;
	y_lm = get_unaligned_le16(str_desc + 4);
	num_buttons = str_desc[6];
	*frame_type = str_desc[7];
	pressure_lm = get_unaligned_le16(str_desc + 8);
	resolution = get_unaligned_le16(str_desc + 10);

	/* Physical maximums cannot be derived without a resolution */
	x_pm = (resolution == 0) ? 0 : x_lm * 1000 / resolution;
	y_pm = (resolution == 0) ? 0 : y_lm * 1000 / resolution;

	desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_LM] = x_lm;
	desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_PM] = x_pm;
	desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_LM] = y_lm;
	desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_PM] = y_pm;
	desc_params[UCLOGIC_RDESC_PEN_PH_ID_PRESSURE_LM] = pressure_lm;
	desc_params[UCLOGIC_RDESC_FRAME_PH_ID_UM] = num_buttons;

	return 0;
}

/**
 * uclogic_params_ugee_v2_init_frame_buttons() - initialize a UGEE v2 frame with
 * buttons.
 * @p:			Parameters to fill in, cannot be NULL.
 * @desc_params:	Device description params list.
 * @desc_params_size:	Size of the description params list.
 *
 * Returns:
 *	Zero, if successful. A negative errno code on error.
*/ static int uclogic_params_ugee_v2_init_frame_buttons(struct uclogic_params *p, const s32 *desc_params, size_t desc_params_size) { __u8 *rdesc_frame = NULL; int rc = 0; if (!p || desc_params_size != UCLOGIC_RDESC_PH_ID_NUM) return -EINVAL; rdesc_frame = uclogic_rdesc_template_apply( uclogic_rdesc_ugee_v2_frame_btn_template_arr, uclogic_rdesc_ugee_v2_frame_btn_template_size, desc_params, UCLOGIC_RDESC_PH_ID_NUM); if (!rdesc_frame) return -ENOMEM; rc = uclogic_params_frame_init_with_desc(&p->frame_list[0], rdesc_frame, uclogic_rdesc_ugee_v2_frame_btn_template_size, UCLOGIC_RDESC_V1_FRAME_ID); kfree(rdesc_frame); return rc; } /** * uclogic_params_ugee_v2_init_frame_dial() - initialize a UGEE v2 frame with a * bitmap dial. * @p: Parameters to fill in, cannot be NULL. * @desc_params: Device description params list. * @desc_params_size: Size of the description params list. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_ugee_v2_init_frame_dial(struct uclogic_params *p, const s32 *desc_params, size_t desc_params_size) { __u8 *rdesc_frame = NULL; int rc = 0; if (!p || desc_params_size != UCLOGIC_RDESC_PH_ID_NUM) return -EINVAL; rdesc_frame = uclogic_rdesc_template_apply( uclogic_rdesc_ugee_v2_frame_dial_template_arr, uclogic_rdesc_ugee_v2_frame_dial_template_size, desc_params, UCLOGIC_RDESC_PH_ID_NUM); if (!rdesc_frame) return -ENOMEM; rc = uclogic_params_frame_init_with_desc(&p->frame_list[0], rdesc_frame, uclogic_rdesc_ugee_v2_frame_dial_template_size, UCLOGIC_RDESC_V1_FRAME_ID); kfree(rdesc_frame); if (rc) return rc; p->frame_list[0].bitmap_dial_byte = 7; return 0; } /** * uclogic_params_ugee_v2_init_frame_mouse() - initialize a UGEE v2 frame with a * mouse. * @p: Parameters to fill in, cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. 
*/ static int uclogic_params_ugee_v2_init_frame_mouse(struct uclogic_params *p) { int rc = 0; if (!p) return -EINVAL; rc = uclogic_params_frame_init_with_desc(&p->frame_list[1], uclogic_rdesc_ugee_v2_frame_mouse_template_arr, uclogic_rdesc_ugee_v2_frame_mouse_template_size, UCLOGIC_RDESC_V1_FRAME_ID); return rc; } /** * uclogic_params_ugee_v2_has_battery() - check whether a UGEE v2 device has * battery or not. * @hdev: The HID device of the tablet interface. * * Returns: * True if the device has battery, false otherwise. */ static bool uclogic_params_ugee_v2_has_battery(struct hid_device *hdev) { struct uclogic_drvdata *drvdata = hid_get_drvdata(hdev); if (drvdata->quirks & UCLOGIC_BATTERY_QUIRK) return true; /* The XP-PEN Deco LW vendor, product and version are identical to the * Deco L. The only difference reported by their firmware is the product * name. Add a quirk to support battery reporting on the wireless * version. */ if (hdev->vendor == USB_VENDOR_ID_UGEE && hdev->product == USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO_L) { struct usb_device *udev = hid_to_usb_dev(hdev); if (strstarts(udev->product, "Deco LW")) return true; } return false; } /** * uclogic_params_ugee_v2_init_battery() - initialize UGEE v2 battery reporting. * @hdev: The HID device of the tablet interface, cannot be NULL. * @p: Parameters to fill in, cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_ugee_v2_init_battery(struct hid_device *hdev, struct uclogic_params *p) { int rc = 0; if (!hdev || !p) return -EINVAL; /* Some tablets contain invalid characters in hdev->uniq, throwing a * "hwmon: '<name>' is not a valid name attribute, please fix" error. * Use the device vendor and product IDs instead. 
*/ snprintf(hdev->uniq, sizeof(hdev->uniq), "%x-%x", hdev->vendor, hdev->product); rc = uclogic_params_frame_init_with_desc(&p->frame_list[1], uclogic_rdesc_ugee_v2_battery_template_arr, uclogic_rdesc_ugee_v2_battery_template_size, UCLOGIC_RDESC_UGEE_V2_BATTERY_ID); if (rc) return rc; p->frame_list[1].suffix = "Battery"; p->pen.subreport_list[1].value = 0xf2; p->pen.subreport_list[1].id = UCLOGIC_RDESC_UGEE_V2_BATTERY_ID; return rc; } /** * uclogic_params_ugee_v2_reconnect_work() - When a wireless tablet looses * connection to the USB dongle and reconnects, either because of its physical * distance or because it was switches off and on using the frame's switch, * uclogic_probe_interface() needs to be called again to enable the tablet. * * @work: The work that triggered this function. */ static void uclogic_params_ugee_v2_reconnect_work(struct work_struct *work) { struct uclogic_raw_event_hook *event_hook; event_hook = container_of(work, struct uclogic_raw_event_hook, work); uclogic_probe_interface(event_hook->hdev, uclogic_ugee_v2_probe_arr, uclogic_ugee_v2_probe_size, uclogic_ugee_v2_probe_endpoint); } /** * uclogic_params_ugee_v2_init_event_hooks() - initialize the list of events * to be hooked for UGEE v2 devices. * @hdev: The HID device of the tablet interface to initialize and get * parameters from. * @p: Parameters to fill in, cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. 
*/
static int uclogic_params_ugee_v2_init_event_hooks(struct hid_device *hdev,
						   struct uclogic_params *p)
{
	/* Event received on wireless tablet reconnection */
	static const __u8 reconnect_event[] = {
		0x02, 0xF8, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
	};
	struct uclogic_raw_event_hook *hook;

	if (p == NULL)
		return -EINVAL;

	/* The reconnection event is only received if the tablet has battery */
	if (!uclogic_params_ugee_v2_has_battery(hdev))
		return 0;

	/* List head first; it stays attached to @p even if a later step fails */
	p->event_hooks = kzalloc_obj(*p->event_hooks);
	if (p->event_hooks == NULL)
		return -ENOMEM;

	INIT_LIST_HEAD(&p->event_hooks->list);

	/* Single hook: re-probe the interface on the reconnection event */
	hook = kzalloc_obj(*hook);
	if (hook == NULL)
		return -ENOMEM;

	INIT_WORK(&hook->work, uclogic_params_ugee_v2_reconnect_work);
	hook->hdev = hdev;
	hook->size = ARRAY_SIZE(reconnect_event);
	hook->event = kmemdup(reconnect_event, hook->size, GFP_KERNEL);
	if (hook->event == NULL) {
		kfree(hook);
		return -ENOMEM;
	}

	list_add_tail(&hook->list, &p->event_hooks->list);

	return 0;
}

/**
 * uclogic_params_ugee_v2_init() - initialize UGEE graphics tablets by
 * discovering their parameters.
 *
 * These tablets, internally designated as v2 to differentiate them from older
 * models, expect a payload of magic data in order to be switched to the fully
 * functional mode and expose their parameters in a similar way to the
 * information present in uclogic_params_pen_init_v1() but with some
 * differences.
 *
 * @params:	Parameters to fill in (to be cleaned with
 *		uclogic_params_cleanup()). Not modified in case of error.
 *		Cannot be NULL.
 * @hdev:	The HID device of the tablet interface to initialize and get
 *		parameters from. Cannot be NULL.
 *
 * Returns:
 *	Zero, if successful. A negative errno code on error.
*/
static int uclogic_params_ugee_v2_init(struct uclogic_params *params,
				       struct hid_device *hdev)
{
	/* Expected length of the parameters string descriptor */
	const int str_desc_len = 12;
	/* Values substituted into the report descriptor templates */
	s32 desc_params[UCLOGIC_RDESC_PH_ID_NUM];
	/* The resulting parameters (noop) */
	struct uclogic_params p = {0, };
	enum uclogic_params_frame_type frame_type;
	struct uclogic_drvdata *drvdata;
	struct usb_interface *iface;
	__u8 bInterfaceNumber;
	__u8 *str_desc = NULL;
	__u8 *rdesc_pen = NULL;
	int rc = 0;

	if (!params || !hdev) {
		rc = -EINVAL;
		goto cleanup;
	}

	drvdata = hid_get_drvdata(hdev);
	iface = to_usb_interface(hdev->dev.parent);
	bInterfaceNumber = iface->cur_altsetting->desc.bInterfaceNumber;

	switch (bInterfaceNumber) {
	case 0:
		/* Interface 0 only gets the mouse frame */
		rc = uclogic_params_ugee_v2_init_frame_mouse(&p);
		if (rc)
			goto cleanup;
		goto output;
	case 2:
		/* The pen interface, handled below */
		break;
	default:
		uclogic_params_init_invalid(&p);
		goto output;
	}

	/*
	 * Initialize the interface by sending magic data.
	 * The specific data was discovered by sniffing the Windows driver
	 * traffic.
	 */
	rc = uclogic_probe_interface(hdev, uclogic_ugee_v2_probe_arr,
				     uclogic_ugee_v2_probe_size,
				     uclogic_ugee_v2_probe_endpoint);
	if (rc) {
		/* Probing failure disables the interface, it is not an error */
		uclogic_params_init_invalid(&p);
		goto output;
	}

	/*
	 * Read the string descriptor containing pen and frame parameters.
	 * The specific string descriptor and data were discovered by sniffing
	 * the Windows driver traffic.
	 */
	rc = uclogic_params_get_str_desc(&str_desc, hdev, 100, str_desc_len);
	if (rc != str_desc_len) {
		hid_err(hdev, "failed retrieving pen and frame parameters: %d\n", rc);
		uclogic_params_init_invalid(&p);
		goto output;
	}

	rc = uclogic_params_parse_ugee_v2_desc(str_desc, str_desc_len,
					       desc_params,
					       ARRAY_SIZE(desc_params),
					       &frame_type);
	if (rc)
		goto cleanup;

	/* The raw descriptor is no longer needed once parsed */
	kfree(str_desc);
	str_desc = NULL;

	/* Initialize the pen interface */
	rdesc_pen = uclogic_rdesc_template_apply(
				uclogic_rdesc_ugee_v2_pen_template_arr,
				uclogic_rdesc_ugee_v2_pen_template_size,
				desc_params, ARRAY_SIZE(desc_params));
	if (!rdesc_pen) {
		rc = -ENOMEM;
		goto cleanup;
	}

	/* From here on the descriptor is owned by p and freed with it */
	p.pen.desc_ptr = rdesc_pen;
	p.pen.desc_size = uclogic_rdesc_ugee_v2_pen_template_size;
	p.pen.id = 0x02;
	p.pen.subreport_list[0].value = 0xf0;
	p.pen.subreport_list[0].id = UCLOGIC_RDESC_V1_FRAME_ID;

	/* Initialize the frame interface */
	if (drvdata->quirks & UCLOGIC_MOUSE_FRAME_QUIRK)
		frame_type = UCLOGIC_PARAMS_FRAME_MOUSE;

	/* Dial and mouse frames share the dial setup; the rest get buttons */
	if (frame_type == UCLOGIC_PARAMS_FRAME_DIAL ||
	    frame_type == UCLOGIC_PARAMS_FRAME_MOUSE)
		rc = uclogic_params_ugee_v2_init_frame_dial(&p, desc_params,
							    ARRAY_SIZE(desc_params));
	else
		rc = uclogic_params_ugee_v2_init_frame_buttons(&p, desc_params,
							       ARRAY_SIZE(desc_params));

	if (rc)
		goto cleanup;

	/* Initialize the battery interface */
	if (uclogic_params_ugee_v2_has_battery(hdev)) {
		rc = uclogic_params_ugee_v2_init_battery(hdev, &p);
		if (rc) {
			hid_err(hdev, "error initializing battery: %d\n", rc);
			goto cleanup;
		}
	}

	/* Create the list of raw events to be hooked */
	rc = uclogic_params_ugee_v2_init_event_hooks(hdev, &p);
	if (rc) {
		hid_err(hdev, "error initializing event hook list: %d\n", rc);
		goto cleanup;
	}

output:
	/* Output parameters; wiping p keeps the cleanup below from freeing them */
	memcpy(params, &p, sizeof(*params));
	memset(&p, 0, sizeof(p));
	rc = 0;
cleanup:
	kfree(str_desc);
	uclogic_params_cleanup(&p);
	return rc;
}

/*
 * uclogic_params_init_ugee_xppen_pro() - Initializes a UGEE XP-Pen Pro tablet device.
* * @hdev: The HID device of the tablet interface to initialize and get * parameters from. Cannot be NULL. * @params: Parameters to fill in (to be cleaned with * uclogic_params_cleanup()). Not modified in case of error. * Cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_init_ugee_xppen_pro(struct uclogic_params *params, struct hid_device *hdev, const u8 rdesc_pen_arr[], const size_t rdesc_pen_size, const u8 rdesc_frame_arr[], const size_t rdesc_frame_size, size_t str_desc_len) { int rc = 0; struct usb_interface *iface; __u8 bInterfaceNumber; u8 *str_desc = NULL; __u8 *rdesc_pen = NULL; s32 desc_params[UCLOGIC_RDESC_PH_ID_NUM]; enum uclogic_params_frame_type frame_type; /* The resulting parameters (noop) */ struct uclogic_params p = {0, }; if (!hdev || !params) { rc = -EINVAL; goto cleanup; } iface = to_usb_interface(hdev->dev.parent); bInterfaceNumber = iface->cur_altsetting->desc.bInterfaceNumber; /* Ignore non-pen interfaces */ if (bInterfaceNumber != 2) { rc = -EINVAL; uclogic_params_init_invalid(&p); goto cleanup; } /* * Initialize the interface by sending magic data. * This magic data is the same as other UGEE v2 tablets. */ rc = uclogic_probe_interface(hdev, uclogic_ugee_v2_probe_arr, uclogic_ugee_v2_probe_size, uclogic_ugee_v2_probe_endpoint); if (rc) { uclogic_params_init_invalid(&p); goto cleanup; } /** * Read the string descriptor containing pen and frame parameters. * These are slightly different than typical UGEE v2 devices. */ rc = uclogic_params_get_str_desc(&str_desc, hdev, 100, str_desc_len); if (rc != str_desc_len) { rc = (rc < 0) ? 
rc : -EINVAL; hid_err(hdev, "failed retrieving pen and frame parameters: %d\n", rc); uclogic_params_init_invalid(&p); goto cleanup; } rc = uclogic_params_parse_ugee_v2_desc(str_desc, str_desc_len, desc_params, ARRAY_SIZE(desc_params), &frame_type); if (rc) goto cleanup; // str_desc doesn't report the correct amount of buttons, so manually fix it desc_params[UCLOGIC_RDESC_FRAME_PH_ID_UM] = 20; kfree(str_desc); str_desc = NULL; /* Initialize the pen interface */ rdesc_pen = uclogic_rdesc_template_apply( rdesc_pen_arr, rdesc_pen_size, desc_params, ARRAY_SIZE(desc_params)); if (!rdesc_pen) { rc = -ENOMEM; goto cleanup; } p.pen.desc_ptr = rdesc_pen; p.pen.desc_size = rdesc_pen_size; p.pen.id = 0x02; p.pen.subreport_list[0].value = 0xf0; p.pen.subreport_list[0].id = UCLOGIC_RDESC_V1_FRAME_ID; /* Initialize the frame interface */ rc = uclogic_params_frame_init_with_desc( &p.frame_list[0], rdesc_frame_arr, rdesc_frame_size, UCLOGIC_RDESC_V1_FRAME_ID); if (rc < 0) { hid_err(hdev, "initializing frame params failed: %d\n", rc); goto cleanup; } p.frame_list[0].bitmap_dial_byte = 7; p.frame_list[0].bitmap_second_dial_destination_byte = 8; /* Output parameters */ memcpy(params, &p, sizeof(*params)); memset(&p, 0, sizeof(p)); cleanup: kfree(str_desc); uclogic_params_cleanup(&p); return rc; } /** * uclogic_params_init() - initialize a tablet interface and discover its * parameters. * * @params: Parameters to fill in (to be cleaned with * uclogic_params_cleanup()). Not modified in case of error. * Cannot be NULL. * @hdev: The HID device of the tablet interface to initialize and get * parameters from. Cannot be NULL. Must be using the USB low-level * driver, i.e. be an actual USB tablet. * * Returns: * Zero, if successful. A negative errno code on error. 
*/ int uclogic_params_init(struct uclogic_params *params, struct hid_device *hdev) { int rc; struct usb_device *udev; __u8 bNumInterfaces; struct usb_interface *iface; __u8 bInterfaceNumber; bool found; /* The resulting parameters (noop) */ struct uclogic_params p = {0, }; /* Check arguments */ if (params == NULL || hdev == NULL || !hid_is_usb(hdev)) { rc = -EINVAL; goto cleanup; } udev = hid_to_usb_dev(hdev); bNumInterfaces = udev->config->desc.bNumInterfaces; iface = to_usb_interface(hdev->dev.parent); bInterfaceNumber = iface->cur_altsetting->desc.bInterfaceNumber; /* * Set replacement report descriptor if the original matches the * specified size. Otherwise keep interface unchanged. */ #define WITH_OPT_DESC(_orig_desc_token, _new_desc_token) \ uclogic_params_init_with_opt_desc( \ &p, hdev, \ UCLOGIC_RDESC_##_orig_desc_token##_SIZE, \ uclogic_rdesc_##_new_desc_token##_arr, \ uclogic_rdesc_##_new_desc_token##_size) #define VID_PID(_vid, _pid) \ (((__u32)(_vid) << 16) | ((__u32)(_pid) & U16_MAX)) /* * Handle specific interfaces for specific tablets. * * Observe the following logic: * * If the interface is recognized as producing certain useful input: * Mark interface as valid. * Output interface parameters. * Else, if the interface is recognized as *not* producing any useful * input: * Mark interface as invalid. * Else: * Mark interface as valid. * Output noop parameters. * * Rule of thumb: it is better to disable a broken interface than let * it spew garbage input. 
*/ switch (VID_PID(hdev->vendor, hdev->product)) { case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_PF1209): rc = WITH_OPT_DESC(PF1209_ORIG, pf1209_fixed); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP4030U): rc = WITH_OPT_DESC(WPXXXXU_ORIG, wp4030u_fixed); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP5540U): if (hdev->dev_rsize == UCLOGIC_RDESC_WP5540U_V2_ORIG_SIZE) { if (bInterfaceNumber == 0) { /* Try to probe v1 pen parameters */ rc = uclogic_params_pen_init_v1(&p.pen, &found, hdev); if (rc != 0) { hid_err(hdev, "pen probing failed: %d\n", rc); goto cleanup; } if (!found) { hid_warn(hdev, "pen parameters not found"); } } else { uclogic_params_init_invalid(&p); } } else { rc = WITH_OPT_DESC(WPXXXXU_ORIG, wp5540u_fixed); if (rc != 0) goto cleanup; } break; case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP8060U): rc = WITH_OPT_DESC(WPXXXXU_ORIG, wp8060u_fixed); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP1062): rc = WITH_OPT_DESC(WP1062_ORIG, wp1062_fixed); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_WIRELESS_TABLET_TWHL850): switch (bInterfaceNumber) { case 0: rc = WITH_OPT_DESC(TWHL850_ORIG0, twhl850_fixed0); if (rc != 0) goto cleanup; break; case 1: rc = WITH_OPT_DESC(TWHL850_ORIG1, twhl850_fixed1); if (rc != 0) goto cleanup; break; case 2: rc = WITH_OPT_DESC(TWHL850_ORIG2, twhl850_fixed2); if (rc != 0) goto cleanup; break; } break; case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_TWHA60): /* * If it is not a three-interface version, which is known to * respond to initialization. 
*/ if (bNumInterfaces != 3) { switch (bInterfaceNumber) { case 0: rc = WITH_OPT_DESC(TWHA60_ORIG0, twha60_fixed0); if (rc != 0) goto cleanup; break; case 1: rc = WITH_OPT_DESC(TWHA60_ORIG1, twha60_fixed1); if (rc != 0) goto cleanup; break; } break; } fallthrough; case VID_PID(USB_VENDOR_ID_HUION, USB_DEVICE_ID_HUION_TABLET): case VID_PID(USB_VENDOR_ID_HUION, USB_DEVICE_ID_HUION_TABLET2): case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_HUION_TABLET): case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_YIYNOVA_TABLET): case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_UGEE_TABLET_81): case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_DRAWIMAGE_G3): case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_UGEE_TABLET_45): case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_UGEE_TABLET_47): rc = uclogic_params_huion_init(&p, hdev); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_UGTIZER, USB_DEVICE_ID_UGTIZER_TABLET_GP0610): case VID_PID(USB_VENDOR_ID_UGTIZER, USB_DEVICE_ID_UGTIZER_TABLET_GT5040): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_G540): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_G640): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_STAR06): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_TABLET_RAINBOW_CV720): /* If this is the pen interface */ if (bInterfaceNumber == 1) { /* Probe v1 pen parameters */ rc = uclogic_params_pen_init_v1(&p.pen, &found, hdev); if (rc != 0) { hid_err(hdev, "pen probing failed: %d\n", rc); goto cleanup; } if (!found) { hid_warn(hdev, "pen parameters not found"); uclogic_params_init_invalid(&p); } } else { uclogic_params_init_invalid(&p); } break; case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO01): /* If this is the pen and frame interface */ if (bInterfaceNumber == 1) { /* Probe v1 pen parameters */ rc = uclogic_params_pen_init_v1(&p.pen, &found, hdev); if (rc != 0) { hid_err(hdev, "pen probing failed: %d\n", 
rc); goto cleanup; } /* Initialize frame parameters */ rc = uclogic_params_frame_init_with_desc( &p.frame_list[0], uclogic_rdesc_xppen_deco01_frame_arr, uclogic_rdesc_xppen_deco01_frame_size, 0); if (rc != 0) goto cleanup; } else { uclogic_params_init_invalid(&p); } break; case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_PARBLO_A610_PRO): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO01_V2): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO_L): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO_PRO_MW): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO_PRO_S): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO_PRO_SW): rc = uclogic_params_ugee_v2_init(&p, hdev); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_TRUST, USB_DEVICE_ID_TRUST_PANORA_TABLET): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_TABLET_G5): /* Ignore non-pen interfaces */ if (bInterfaceNumber != 1) { uclogic_params_init_invalid(&p); break; } rc = uclogic_params_pen_init_v1(&p.pen, &found, hdev); if (rc != 0) { hid_err(hdev, "pen probing failed: %d\n", rc); goto cleanup; } else if (found) { rc = uclogic_params_frame_init_with_desc( &p.frame_list[0], uclogic_rdesc_ugee_g5_frame_arr, uclogic_rdesc_ugee_g5_frame_size, UCLOGIC_RDESC_UGEE_G5_FRAME_ID); if (rc != 0) { hid_err(hdev, "failed creating frame parameters: %d\n", rc); goto cleanup; } p.frame_list[0].re_lsb = UCLOGIC_RDESC_UGEE_G5_FRAME_RE_LSB; p.frame_list[0].dev_id_byte = UCLOGIC_RDESC_UGEE_G5_FRAME_DEV_ID_BYTE; } else { hid_warn(hdev, "pen parameters not found"); uclogic_params_init_invalid(&p); } break; case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_TABLET_EX07S): /* Ignore non-pen interfaces */ if (bInterfaceNumber != 1) { uclogic_params_init_invalid(&p); break; } rc = uclogic_params_pen_init_v1(&p.pen, &found, hdev); if (rc != 0) { hid_err(hdev, "pen probing failed: %d\n", rc); goto cleanup; } else if (found) { rc 
= uclogic_params_frame_init_with_desc( &p.frame_list[0], uclogic_rdesc_ugee_ex07_frame_arr, uclogic_rdesc_ugee_ex07_frame_size, 0); if (rc != 0) { hid_err(hdev, "failed creating frame parameters: %d\n", rc); goto cleanup; } } else { hid_warn(hdev, "pen parameters not found"); uclogic_params_init_invalid(&p); } break; case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_22R_PRO): rc = uclogic_params_init_ugee_xppen_pro(&p, hdev, uclogic_rdesc_ugee_v2_pen_template_arr, uclogic_rdesc_ugee_v2_pen_template_size, uclogic_rdesc_xppen_artist_22r_pro_frame_arr, uclogic_rdesc_xppen_artist_22r_pro_frame_size, 12); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_24_PRO): rc = uclogic_params_init_ugee_xppen_pro(&p, hdev, uclogic_rdesc_xppen_artist_24_pro_pen_template_arr, uclogic_rdesc_xppen_artist_24_pro_pen_template_size, uclogic_rdesc_xppen_artist_24_pro_frame_arr, uclogic_rdesc_xppen_artist_24_pro_frame_size, 14); // The 24 Pro has a fragmented X Coord. p.pen.fragmented_hires2 = true; if (rc != 0) goto cleanup; break; } #undef VID_PID #undef WITH_OPT_DESC /* Output parameters */ memcpy(params, &p, sizeof(*params)); memset(&p, 0, sizeof(p)); rc = 0; cleanup: uclogic_params_cleanup(&p); return rc; } #ifdef CONFIG_HID_KUNIT_TEST #include "hid-uclogic-params-test.c" #endif |
| 104 5 50 12 21 37 9 1 1 13 27 1 17 2 8 2 33 31 29 9 7 25 24 3 1 2 1 2 1 1 2 5 3 3 3 3 1 4 3 1 1 3 8 2 2 1 1 1 2 2 2 1 3 3 4 6 3 3 3 9 4 11 10 1 31 31 1 1 3 2 27 1 20 98 89 1 1 2 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 | // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS compatible sequencer driver * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ 
#include "seq_oss_device.h" #include "seq_oss_synth.h" #include "seq_oss_midi.h" #include "seq_oss_event.h" #include "seq_oss_timer.h" #include <sound/seq_oss_legacy.h> #include "seq_oss_readq.h" #include "seq_oss_writeq.h" #include <linux/nospec.h> /* * prototypes */ static int extended_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev); static int chn_voice_event(struct seq_oss_devinfo *dp, union evrec *event_rec, struct snd_seq_event *ev); static int chn_common_event(struct seq_oss_devinfo *dp, union evrec *event_rec, struct snd_seq_event *ev); static int timing_event(struct seq_oss_devinfo *dp, union evrec *event_rec, struct snd_seq_event *ev); static int local_event(struct seq_oss_devinfo *dp, union evrec *event_rec, struct snd_seq_event *ev); static int old_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev); static int note_on_event(struct seq_oss_devinfo *dp, int dev, int ch, int note, int vel, struct snd_seq_event *ev); static int note_off_event(struct seq_oss_devinfo *dp, int dev, int ch, int note, int vel, struct snd_seq_event *ev); static int set_note_event(struct seq_oss_devinfo *dp, int dev, int type, int ch, int note, int vel, struct snd_seq_event *ev); static int set_control_event(struct seq_oss_devinfo *dp, int dev, int type, int ch, int param, int val, struct snd_seq_event *ev); static int set_echo_event(struct seq_oss_devinfo *dp, union evrec *rec, struct snd_seq_event *ev); /* * convert an OSS event to ALSA event * return 0 : enqueued * non-zero : invalid - ignored */ int snd_seq_oss_process_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { switch (q->s.code) { case SEQ_EXTENDED: return extended_event(dp, q, ev); case EV_CHN_VOICE: return chn_voice_event(dp, q, ev); case EV_CHN_COMMON: return chn_common_event(dp, q, ev); case EV_TIMING: return timing_event(dp, q, ev); case EV_SEQ_LOCAL: return local_event(dp, q, ev); case EV_SYSEX: return snd_seq_oss_synth_sysex(dp, 
q->x.dev, q->x.buf, ev); case SEQ_MIDIPUTC: if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) return -EINVAL; /* put a midi byte */ if (! is_write_mode(dp->file_mode)) break; if (snd_seq_oss_midi_open(dp, q->s.dev, SNDRV_SEQ_OSS_FILE_WRITE)) break; if (snd_seq_oss_midi_filemode(dp, q->s.dev) & SNDRV_SEQ_OSS_FILE_WRITE) return snd_seq_oss_midi_putc(dp, q->s.dev, q->s.parm1, ev); break; case SEQ_ECHO: if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) return -EINVAL; return set_echo_event(dp, q, ev); case SEQ_PRIVATE: if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) return -EINVAL; return snd_seq_oss_synth_raw_event(dp, q->c[1], q->c, ev); default: if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) return -EINVAL; return old_event(dp, q, ev); } return -EINVAL; } /* old type events: mode1 only */ static int old_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { switch (q->s.code) { case SEQ_NOTEOFF: return note_off_event(dp, 0, q->n.chn, q->n.note, q->n.vel, ev); case SEQ_NOTEON: return note_on_event(dp, 0, q->n.chn, q->n.note, q->n.vel, ev); case SEQ_WAIT: /* skip */ break; case SEQ_PGMCHANGE: return set_control_event(dp, 0, SNDRV_SEQ_EVENT_PGMCHANGE, q->n.chn, 0, q->n.note, ev); case SEQ_SYNCTIMER: return snd_seq_oss_timer_reset(dp->timer); } return -EINVAL; } /* 8bytes extended event: mode1 only */ static int extended_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { int val; switch (q->e.cmd) { case SEQ_NOTEOFF: return note_off_event(dp, q->e.dev, q->e.chn, q->e.p1, q->e.p2, ev); case SEQ_NOTEON: return note_on_event(dp, q->e.dev, q->e.chn, q->e.p1, q->e.p2, ev); case SEQ_PGMCHANGE: return set_control_event(dp, q->e.dev, SNDRV_SEQ_EVENT_PGMCHANGE, q->e.chn, 0, q->e.p1, ev); case SEQ_AFTERTOUCH: return set_control_event(dp, q->e.dev, SNDRV_SEQ_EVENT_CHANPRESS, q->e.chn, 0, q->e.p1, ev); case SEQ_BALANCE: /* convert -128:127 to 0:127 */ val = (char)q->e.p1; val = (val + 128) / 2; return set_control_event(dp, q->e.dev, 
SNDRV_SEQ_EVENT_CONTROLLER, q->e.chn, CTL_PAN, val, ev); case SEQ_CONTROLLER: val = ((short)q->e.p3 << 8) | (short)q->e.p2; switch (q->e.p1) { case CTRL_PITCH_BENDER: /* SEQ1 V2 control */ /* -0x2000:0x1fff */ return set_control_event(dp, q->e.dev, SNDRV_SEQ_EVENT_PITCHBEND, q->e.chn, 0, val, ev); case CTRL_PITCH_BENDER_RANGE: /* conversion: 100/semitone -> 128/semitone */ return set_control_event(dp, q->e.dev, SNDRV_SEQ_EVENT_REGPARAM, q->e.chn, 0, val*128/100, ev); default: return set_control_event(dp, q->e.dev, SNDRV_SEQ_EVENT_CONTROL14, q->e.chn, q->e.p1, val, ev); } case SEQ_VOLMODE: return snd_seq_oss_synth_raw_event(dp, q->e.dev, q->c, ev); } return -EINVAL; } /* channel voice events: mode1 and 2 */ static int chn_voice_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { if (q->v.chn >= 32) return -EINVAL; switch (q->v.cmd) { case MIDI_NOTEON: return note_on_event(dp, q->v.dev, q->v.chn, q->v.note, q->v.parm, ev); case MIDI_NOTEOFF: return note_off_event(dp, q->v.dev, q->v.chn, q->v.note, q->v.parm, ev); case MIDI_KEY_PRESSURE: return set_note_event(dp, q->v.dev, SNDRV_SEQ_EVENT_KEYPRESS, q->v.chn, q->v.note, q->v.parm, ev); } return -EINVAL; } /* channel common events: mode1 and 2 */ static int chn_common_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { if (q->l.chn >= 32) return -EINVAL; switch (q->l.cmd) { case MIDI_PGM_CHANGE: return set_control_event(dp, q->l.dev, SNDRV_SEQ_EVENT_PGMCHANGE, q->l.chn, 0, q->l.p1, ev); case MIDI_CTL_CHANGE: return set_control_event(dp, q->l.dev, SNDRV_SEQ_EVENT_CONTROLLER, q->l.chn, q->l.p1, q->l.val, ev); case MIDI_PITCH_BEND: /* conversion: 0:0x3fff -> -0x2000:0x1fff */ return set_control_event(dp, q->l.dev, SNDRV_SEQ_EVENT_PITCHBEND, q->l.chn, 0, q->l.val - 8192, ev); case MIDI_CHN_PRESSURE: return set_control_event(dp, q->l.dev, SNDRV_SEQ_EVENT_CHANPRESS, q->l.chn, 0, q->l.val, ev); } return -EINVAL; } /* timer events: mode1 and mode2 */ static int 
timing_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { switch (q->t.cmd) { case TMR_ECHO: if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) return set_echo_event(dp, q, ev); else { union evrec tmp; memset(&tmp, 0, sizeof(tmp)); /* XXX: only for little-endian! */ tmp.echo = (q->t.time << 8) | SEQ_ECHO; return set_echo_event(dp, &tmp, ev); } case TMR_STOP: if (dp->seq_mode) return snd_seq_oss_timer_stop(dp->timer); return 0; case TMR_CONTINUE: if (dp->seq_mode) return snd_seq_oss_timer_continue(dp->timer); return 0; case TMR_TEMPO: if (dp->seq_mode) return snd_seq_oss_timer_tempo(dp->timer, q->t.time); return 0; } return -EINVAL; } /* local events: mode1 and 2 */ static int local_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { return -EINVAL; } /* * process note-on event for OSS synth * three different modes are available: * - SNDRV_SEQ_OSS_PROCESS_EVENTS (for one-voice per channel mode) * Accept note 255 as volume change. * - SNDRV_SEQ_OSS_PASS_EVENTS * Pass all events to lowlevel driver anyway * - SNDRV_SEQ_OSS_PROCESS_KEYPRESS (mostly for Emu8000) * Use key-pressure if note >= 128 */ static int note_on_event(struct seq_oss_devinfo *dp, int dev, int ch, int note, int vel, struct snd_seq_event *ev) { struct seq_oss_synthinfo *info; info = snd_seq_oss_synth_info(dp, dev); if (!info) return -ENXIO; switch (info->arg.event_passing) { case SNDRV_SEQ_OSS_PROCESS_EVENTS: if (! 
info->ch || ch < 0 || ch >= info->nr_voices) { /* pass directly */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEON, ch, note, vel, ev); } ch = array_index_nospec(ch, info->nr_voices); if (note == 255 && info->ch[ch].note >= 0) { /* volume control */ int type; if (info->ch[ch].vel) /* sample already started -- volume change */ type = SNDRV_SEQ_EVENT_KEYPRESS; else /* sample not started -- start now */ type = SNDRV_SEQ_EVENT_NOTEON; info->ch[ch].vel = vel; return set_note_event(dp, dev, type, ch, info->ch[ch].note, vel, ev); } else if (note >= 128) return -EINVAL; /* invalid */ if (note != info->ch[ch].note && info->ch[ch].note >= 0) /* note changed - note off at beginning */ set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEOFF, ch, info->ch[ch].note, 0, ev); /* set current status */ info->ch[ch].note = note; info->ch[ch].vel = vel; if (vel) /* non-zero velocity - start the note now */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEON, ch, note, vel, ev); return -EINVAL; case SNDRV_SEQ_OSS_PASS_EVENTS: /* pass the event anyway */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEON, ch, note, vel, ev); case SNDRV_SEQ_OSS_PROCESS_KEYPRESS: if (note >= 128) /* key pressure: shifted by 128 */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_KEYPRESS, ch, note - 128, vel, ev); else /* normal note-on event */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEON, ch, note, vel, ev); } return -EINVAL; } /* * process note-off event for OSS synth */ static int note_off_event(struct seq_oss_devinfo *dp, int dev, int ch, int note, int vel, struct snd_seq_event *ev) { struct seq_oss_synthinfo *info; info = snd_seq_oss_synth_info(dp, dev); if (!info) return -ENXIO; switch (info->arg.event_passing) { case SNDRV_SEQ_OSS_PROCESS_EVENTS: if (! 
info->ch || ch < 0 || ch >= info->nr_voices) { /* pass directly */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEON, ch, note, vel, ev); } ch = array_index_nospec(ch, info->nr_voices); if (info->ch[ch].note >= 0) { note = info->ch[ch].note; info->ch[ch].vel = 0; info->ch[ch].note = -1; return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEOFF, ch, note, vel, ev); } return -EINVAL; /* invalid */ case SNDRV_SEQ_OSS_PASS_EVENTS: case SNDRV_SEQ_OSS_PROCESS_KEYPRESS: /* pass the event anyway */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEOFF, ch, note, vel, ev); } return -EINVAL; } /* * create a note event */ static int set_note_event(struct seq_oss_devinfo *dp, int dev, int type, int ch, int note, int vel, struct snd_seq_event *ev) { if (!snd_seq_oss_synth_info(dp, dev)) return -ENXIO; ev->type = type; snd_seq_oss_synth_addr(dp, dev, ev); ev->data.note.channel = ch; ev->data.note.note = note; ev->data.note.velocity = vel; return 0; } /* * create a control event */ static int set_control_event(struct seq_oss_devinfo *dp, int dev, int type, int ch, int param, int val, struct snd_seq_event *ev) { if (!snd_seq_oss_synth_info(dp, dev)) return -ENXIO; ev->type = type; snd_seq_oss_synth_addr(dp, dev, ev); ev->data.control.channel = ch; ev->data.control.param = param; ev->data.control.value = val; return 0; } /* * create an echo event */ static int set_echo_event(struct seq_oss_devinfo *dp, union evrec *rec, struct snd_seq_event *ev) { ev->type = SNDRV_SEQ_EVENT_ECHO; /* echo back to itself */ snd_seq_oss_fill_addr(dp, ev, dp->addr.client, dp->addr.port); memcpy(&ev->data, rec, LONG_EVENT_SIZE); return 0; } /* * event input callback from ALSA sequencer: * the echo event is processed here. 
*/ int snd_seq_oss_event_input(struct snd_seq_event *ev, int direct, void *private_data, int atomic, int hop) { struct seq_oss_devinfo *dp = (struct seq_oss_devinfo *)private_data; union evrec *rec; if (ev->type != SNDRV_SEQ_EVENT_ECHO) return snd_seq_oss_midi_input(ev, direct, private_data); if (ev->source.client != dp->cseq) return 0; /* ignored */ rec = (union evrec*)&ev->data; if (rec->s.code == SEQ_SYNCTIMER) { /* sync echo back */ snd_seq_oss_writeq_wakeup(dp->writeq, rec->t.time); } else { /* echo back event */ if (dp->readq == NULL) return 0; snd_seq_oss_readq_put_event(dp->readq, rec); } return 0; } |
| 3 2 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2021 Hans de Goede <hdegoede@redhat.com> * * Driver for the LetSketch / VSON WP9620N drawing tablet. * This drawing tablet is also sold under other brand names such as Case U, * presumably this driver will work for all of them. But it has only been * tested with a LetSketch WP9620N model. * * These tablets also work without a special HID driver, but then only part * of the active area works and both the pad and stylus buttons are hardwired * to special key-combos. E.g. the 2 stylus buttons send right mouse clicks / * resp. "e" key presses. 
* * This device has 4 USB interfaces: * * Interface 0 EP 0x81 bootclass mouse, rdesc len 18, report id 0x08, * Application(ff00.0001) * This interface sends raw event input reports in a custom format, but only * after doing the special dance from letsketch_probe(). After enabling this * interface the other 3 interfaces are disabled. * * Interface 1 EP 0x82 bootclass mouse, rdesc len 83, report id 0x0a, Tablet * This interface sends absolute events for the pen, including pressure, * but only for some part of the active area due to special "aspect ratio" * correction and only half by default since it assumes it will be used * with a phone in portraid mode, while using the tablet in landscape mode. * Also stylus + pad button events are not reported here. * * Interface 2 EP 0x83 bootclass keybd, rdesc len 64, report id none, Std Kbd * This interfaces send various hard-coded key-combos for the pad buttons * and "e" keypresses for the 2nd stylus button * * Interface 3 EP 0x84 bootclass mouse, rdesc len 75, report id 0x01, Std Mouse * This reports right-click mouse-button events for the 1st stylus button */ #include <linux/device.h> #include <linux/input.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/usb.h> #include <linux/unaligned.h> #include "hid-ids.h" #define LETSKETCH_RAW_IF 0 #define LETSKETCH_RAW_DATA_LEN 12 #define LETSKETCH_RAW_REPORT_ID 8 #define LETSKETCH_PAD_BUTTONS 5 #define LETSKETCH_INFO_STR_IDX_BEGIN 0xc8 #define LETSKETCH_INFO_STR_IDX_END 0xca #define LETSKETCH_GET_STRING_RETRIES 5 struct letsketch_data { struct hid_device *hdev; struct input_dev *input_tablet; struct input_dev *input_tablet_pad; struct timer_list inrange_timer; }; static int letsketch_open(struct input_dev *dev) { struct letsketch_data *data = input_get_drvdata(dev); return hid_hw_open(data->hdev); } static void letsketch_close(struct input_dev *dev) { struct letsketch_data *data = input_get_drvdata(dev); hid_hw_close(data->hdev); } static 
struct input_dev *letsketch_alloc_input_dev(struct letsketch_data *data) { struct input_dev *input; input = devm_input_allocate_device(&data->hdev->dev); if (!input) return NULL; input->id.bustype = data->hdev->bus; input->id.vendor = data->hdev->vendor; input->id.product = data->hdev->product; input->id.version = data->hdev->bus; input->phys = data->hdev->phys; input->uniq = data->hdev->uniq; input->open = letsketch_open; input->close = letsketch_close; input_set_drvdata(input, data); return input; } static int letsketch_setup_input_tablet(struct letsketch_data *data) { struct input_dev *input; input = letsketch_alloc_input_dev(data); if (!input) return -ENOMEM; input_set_abs_params(input, ABS_X, 0, 50800, 0, 0); input_set_abs_params(input, ABS_Y, 0, 31750, 0, 0); input_set_abs_params(input, ABS_PRESSURE, 0, 8192, 0, 0); input_abs_set_res(input, ABS_X, 240); input_abs_set_res(input, ABS_Y, 225); input_set_capability(input, EV_KEY, BTN_TOUCH); input_set_capability(input, EV_KEY, BTN_TOOL_PEN); input_set_capability(input, EV_KEY, BTN_STYLUS); input_set_capability(input, EV_KEY, BTN_STYLUS2); /* All known brands selling this tablet use WP9620[N] as model name */ input->name = "WP9620 Tablet"; data->input_tablet = input; return input_register_device(data->input_tablet); } static int letsketch_setup_input_tablet_pad(struct letsketch_data *data) { struct input_dev *input; int i; input = letsketch_alloc_input_dev(data); if (!input) return -ENOMEM; for (i = 0; i < LETSKETCH_PAD_BUTTONS; i++) input_set_capability(input, EV_KEY, BTN_0 + i); /* * These are never send on the pad input_dev, but must be set * on the Pad to make udev / libwacom happy. 
*/ input_set_abs_params(input, ABS_X, 0, 1, 0, 0); input_set_abs_params(input, ABS_Y, 0, 1, 0, 0); input_set_capability(input, EV_KEY, BTN_STYLUS); input->name = "WP9620 Pad"; data->input_tablet_pad = input; return input_register_device(data->input_tablet_pad); } static void letsketch_inrange_timeout(struct timer_list *t) { struct letsketch_data *data = timer_container_of(data, t, inrange_timer); struct input_dev *input = data->input_tablet; input_report_key(input, BTN_TOOL_PEN, 0); input_sync(input); } static int letsketch_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *raw_data, int size) { struct letsketch_data *data = hid_get_drvdata(hdev); struct input_dev *input; int i; if (size != LETSKETCH_RAW_DATA_LEN || raw_data[0] != LETSKETCH_RAW_REPORT_ID) return 0; switch (raw_data[1] & 0xf0) { case 0x80: /* Pen data */ input = data->input_tablet; input_report_key(input, BTN_TOOL_PEN, 1); input_report_key(input, BTN_TOUCH, raw_data[1] & 0x01); input_report_key(input, BTN_STYLUS, raw_data[1] & 0x02); input_report_key(input, BTN_STYLUS2, raw_data[1] & 0x04); input_report_abs(input, ABS_X, get_unaligned_le16(raw_data + 2)); input_report_abs(input, ABS_Y, get_unaligned_le16(raw_data + 4)); input_report_abs(input, ABS_PRESSURE, get_unaligned_le16(raw_data + 6)); /* * There is no out of range event, so use a timer for this * when in range we get an event approx. every 8 ms. */ mod_timer(&data->inrange_timer, jiffies + msecs_to_jiffies(100)); break; case 0xe0: /* Pad data */ input = data->input_tablet_pad; for (i = 0; i < LETSKETCH_PAD_BUTTONS; i++) input_report_key(input, BTN_0 + i, raw_data[4] == (i + 1)); break; default: hid_warn(data->hdev, "Warning unknown data header: 0x%02x\n", raw_data[0]); return 0; } input_sync(input); return 0; } /* * The tablets magic handshake to put it in raw mode relies on getting * string descriptors. But the firmware is buggy and does not like it if * we do this too fast. 
Even if we go slow sometimes the usb_string() call * fails. Ignore errors and retry it a couple of times if necessary. */ static int letsketch_get_string(struct usb_device *udev, int index, char *buf, int size) { int i, ret; for (i = 0; i < LETSKETCH_GET_STRING_RETRIES; i++) { usleep_range(5000, 7000); ret = usb_string(udev, index, buf, size); if (ret > 0) return 0; } dev_err(&udev->dev, "Max retries (%d) exceeded reading string descriptor %d\n", LETSKETCH_GET_STRING_RETRIES, index); return ret ? ret : -EIO; } static int letsketch_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct device *dev = &hdev->dev; struct letsketch_data *data; struct usb_interface *intf; struct usb_device *udev; char buf[256]; int i, ret; if (!hid_is_usb(hdev)) return -ENODEV; intf = to_usb_interface(hdev->dev.parent); if (intf->altsetting->desc.bInterfaceNumber != LETSKETCH_RAW_IF) return -ENODEV; /* Ignore the other interfaces */ udev = interface_to_usbdev(intf); /* * Instead of using a set-feature request, or even a custom USB ctrl * message the tablet needs this elaborate magic reading of USB * string descriptors to kick it into raw mode. This is what the * Windows drivers are seen doing in an USB trace under Windows. */ for (i = LETSKETCH_INFO_STR_IDX_BEGIN; i <= LETSKETCH_INFO_STR_IDX_END; i++) { ret = letsketch_get_string(udev, i, buf, sizeof(buf)); if (ret) return ret; hid_info(hdev, "Device info: %s\n", buf); } for (i = 1; i <= 250; i++) { ret = letsketch_get_string(udev, i, buf, sizeof(buf)); if (ret) return ret; } ret = letsketch_get_string(udev, 0x64, buf, sizeof(buf)); if (ret) return ret; ret = letsketch_get_string(udev, LETSKETCH_INFO_STR_IDX_BEGIN, buf, sizeof(buf)); if (ret) return ret; /* * The tablet should be in raw mode now, end with a final delay before * doing further IO to the device. 
*/ usleep_range(5000, 7000); ret = hid_parse(hdev); if (ret) return ret; data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->hdev = hdev; timer_setup(&data->inrange_timer, letsketch_inrange_timeout, 0); hid_set_drvdata(hdev, data); ret = letsketch_setup_input_tablet(data); if (ret) return ret; ret = letsketch_setup_input_tablet_pad(data); if (ret) return ret; return hid_hw_start(hdev, HID_CONNECT_HIDRAW); } static const struct hid_device_id letsketch_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_LETSKETCH, USB_DEVICE_ID_WP9620N) }, { } }; MODULE_DEVICE_TABLE(hid, letsketch_devices); static struct hid_driver letsketch_driver = { .name = "letsketch", .id_table = letsketch_devices, .probe = letsketch_probe, .raw_event = letsketch_raw_event, }; module_hid_driver(letsketch_driver); MODULE_AUTHOR("Hans de Goede <hdegoede@redhat.com>"); MODULE_DESCRIPTION("Driver for the LetSketch / VSON WP9620N drawing tablet"); MODULE_LICENSE("GPL"); |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
/* SPDX-License-Identifier: GPL-2.0 */
/* Trace system name under which the event below is grouped. */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vsyscall

/*
 * Guard: the body is processed once, or again whenever
 * TRACE_HEADER_MULTI_READ is defined.
 */
#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define __VSYSCALL_TRACE_H

#include <linux/tracepoint.h>

/*
 * emulate_vsyscall trace event: takes a single int argument 'nr', stores it
 * in the event record and prints it as "nr = %d".
 */
TRACE_EVENT(emulate_vsyscall,

	    TP_PROTO(int nr),

	    TP_ARGS(nr),

	    TP_STRUCT__entry(__field(int, nr)),

	    TP_fast_assign(
			   __entry->nr = nr;
			   ),

	    TP_printk("nr = %d", __entry->nr)
);

#endif

/* Tell define_trace.h where this header lives and what it is called. */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/
#define TRACE_INCLUDE_FILE vsyscall_trace
#include <trace/define_trace.h>
|
| 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 10 11 11 11 11 28 1 28 28 28 28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 | // SPDX-License-Identifier: GPL-2.0 #include <linux/smp.h> #include <linux/timex.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/cpufreq.h> #include <asm/prctl.h> #include <linux/proc_fs.h> #include "cpu.h" #ifdef CONFIG_X86_VMX_FEATURE_NAMES extern const char * const x86_vmx_flags[NVMXINTS*32]; #endif /* * Get CPU information for use by the procfs. 
*/ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, unsigned int cpu) { #ifdef CONFIG_SMP seq_printf(m, "physical id\t: %d\n", c->topo.pkg_id); seq_printf(m, "siblings\t: %d\n", cpumask_weight(topology_core_cpumask(cpu))); seq_printf(m, "core id\t\t: %d\n", c->topo.core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); seq_printf(m, "apicid\t\t: %d\n", c->topo.apicid); seq_printf(m, "initial apicid\t: %d\n", c->topo.initial_apicid); #endif } #ifdef CONFIG_X86_32 static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) { seq_printf(m, "fdiv_bug\t: %s\n" "f00f_bug\t: %s\n" "coma_bug\t: %s\n" "fpu\t\t: %s\n" "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", str_yes_no(boot_cpu_has_bug(X86_BUG_FDIV)), str_yes_no(boot_cpu_has_bug(X86_BUG_F00F)), str_yes_no(boot_cpu_has_bug(X86_BUG_COMA)), str_yes_no(boot_cpu_has(X86_FEATURE_FPU)), str_yes_no(boot_cpu_has(X86_FEATURE_FPU)), c->cpuid_level); } #else static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) { seq_printf(m, "fpu\t\t: yes\n" "fpu_exception\t: yes\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", c->cpuid_level); } #endif static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; unsigned int cpu; int i; cpu = c->cpu_index; seq_printf(m, "processor\t: %u\n" "vendor_id\t: %s\n" "cpu family\t: %d\n" "model\t\t: %u\n" "model name\t: %s\n", cpu, c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", c->x86, c->x86_model, c->x86_model_id[0] ? 
c->x86_model_id : "unknown"); if (c->x86_stepping || c->cpuid_level >= 0) seq_printf(m, "stepping\t: %d\n", c->x86_stepping); else seq_puts(m, "stepping\t: unknown\n"); if (c->microcode) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { int freq = arch_freq_get_on_cpu(cpu); if (freq < 0) seq_puts(m, "cpu MHz\t\t: Unknown\n"); else seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } /* Cache size */ if (c->x86_cache_size) seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size); show_cpuinfo_core(m, c, cpu); show_cpuinfo_misc(m, c); seq_puts(m, "flags\t\t:"); for (i = 0; i < 32*NCAPINTS; i++) if (cpu_has(c, i) && x86_cap_flags[i] != NULL) seq_printf(m, " %s", x86_cap_flags[i]); #ifdef CONFIG_X86_VMX_FEATURE_NAMES if (cpu_has(c, X86_FEATURE_VMX) && c->vmx_capability[0]) { seq_puts(m, "\nvmx flags\t:"); for (i = 0; i < 32*NVMXINTS; i++) { if (test_bit(i, (unsigned long *)c->vmx_capability) && x86_vmx_flags[i] != NULL) seq_printf(m, " %s", x86_vmx_flags[i]); } } #endif seq_puts(m, "\nbugs\t\t:"); for (i = 0; i < 32*NBUGINTS; i++) { unsigned int bug_bit = 32*NCAPINTS + i; if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i]) seq_printf(m, " %s", x86_bug_flags[i]); } seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); #ifdef CONFIG_X86_64 if (c->x86_tlbsize > 0) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); #endif seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); seq_puts(m, "power management:"); for (i = 0; i < 32; i++) { if (c->x86_power & (1 << i)) { if (i < ARRAY_SIZE(x86_power_flags) && x86_power_flags[i]) seq_printf(m, "%s%s", x86_power_flags[i][0] ? 
" " : "", x86_power_flags[i]); else seq_printf(m, " [%d]", i); } } seq_puts(m, "\n\n"); return 0; } static void *c_start(struct seq_file *m, loff_t *pos) { *pos = cpumask_next(*pos - 1, cpu_online_mask); if ((*pos) < nr_cpu_ids) return &cpu_data(*pos); return NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return c_start(m, pos); } static void c_stop(struct seq_file *m, void *v) { } const struct seq_operations cpuinfo_op = { .start = c_start, .next = c_next, .stop = c_stop, .show = show_cpuinfo, }; #ifdef CONFIG_X86_USER_SHADOW_STACK static void dump_x86_features(struct seq_file *m, unsigned long features) { if (features & ARCH_SHSTK_SHSTK) seq_puts(m, "shstk "); if (features & ARCH_SHSTK_WRSS) seq_puts(m, "wrss "); } void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task) { seq_puts(m, "x86_Thread_features:\t"); dump_x86_features(m, task->thread.features); seq_putc(m, '\n'); seq_puts(m, "x86_Thread_features_locked:\t"); dump_x86_features(m, task->thread.features_locked); seq_putc(m, '\n'); } #endif /* CONFIG_X86_USER_SHADOW_STACK */ |
| 1 10 11 10 1 1 1 7 6 6 2 2 10 10 10 10 10 10 10 13 3 10 1 10 10 9 9 9 1 3 5 3 6 8 1 12 1 2 2 1 10 10 6 4 2 2 12 1 12 4 8 11 11 11 11 11 11 8 3 3 10 5 1 4 4 3 2 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 
485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 
985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 
1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 
1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright(c) 1999 - 2004 Intel Corporation. All rights reserved. */ #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/pkt_sched.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_bonding.h> #include <linux/if_vlan.h> #include <linux/in.h> #include <net/arp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <asm/byteorder.h> #include <net/bonding.h> #include <net/bond_alb.h> static const u8 mac_v6_allmcast[ETH_ALEN + 2] __long_aligned = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x01 }; static const int alb_delta_in_ticks = HZ / ALB_TIMER_TICKS_PER_SEC; #pragma pack(1) struct learning_pkt { u8 mac_dst[ETH_ALEN]; u8 mac_src[ETH_ALEN]; __be16 type; u8 padding[ETH_ZLEN - ETH_HLEN]; }; struct arp_pkt { __be16 hw_addr_space; __be16 prot_addr_space; u8 hw_addr_len; u8 prot_addr_len; __be16 op_code; u8 mac_src[ETH_ALEN]; /* sender hardware address */ __be32 ip_src; /* sender IP address */ u8 mac_dst[ETH_ALEN]; /* target hardware address */ __be32 ip_dst; /* target IP address */ }; #pragma pack() /* Forward declaration */ static void alb_send_learning_packets(struct slave *slave, const u8 mac_addr[], bool strict_match); static void rlb_purge_src_ip(struct bonding *bond, struct arp_pkt *arp); static void rlb_src_unlink(struct bonding *bond, u32 index); static void rlb_src_link(struct bonding *bond, u32 ip_src_hash, u32 ip_dst_hash); static inline u8 _simple_hash(const u8 *hash_start, int hash_size) { int i; u8 hash = 0; for (i = 0; i < hash_size; i++) hash ^= hash_start[i]; return hash; } 
/*********************** tlb specific functions ***************************/ static inline void tlb_init_table_entry(struct tlb_client_info *entry, int save_load) { if (save_load) { entry->load_history = 1 + entry->tx_bytes / BOND_TLB_REBALANCE_INTERVAL; entry->tx_bytes = 0; } entry->tx_slave = NULL; entry->next = TLB_NULL_INDEX; entry->prev = TLB_NULL_INDEX; } static inline void tlb_init_slave(struct slave *slave) { SLAVE_TLB_INFO(slave).load = 0; SLAVE_TLB_INFO(slave).head = TLB_NULL_INDEX; } static void __tlb_clear_slave(struct bonding *bond, struct slave *slave, int save_load) { struct tlb_client_info *tx_hash_table; u32 index; /* clear slave from tx_hashtbl */ tx_hash_table = BOND_ALB_INFO(bond).tx_hashtbl; /* skip this if we've already freed the tx hash table */ if (tx_hash_table) { index = SLAVE_TLB_INFO(slave).head; while (index != TLB_NULL_INDEX) { u32 next_index = tx_hash_table[index].next; tlb_init_table_entry(&tx_hash_table[index], save_load); index = next_index; } } tlb_init_slave(slave); } static void tlb_clear_slave(struct bonding *bond, struct slave *slave, int save_load) { spin_lock_bh(&bond->mode_lock); __tlb_clear_slave(bond, slave, save_load); spin_unlock_bh(&bond->mode_lock); } /* Must be called before starting the monitor timer */ static int tlb_initialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); int size = TLB_HASH_TABLE_SIZE * sizeof(struct tlb_client_info); struct tlb_client_info *new_hashtbl; int i; new_hashtbl = kzalloc(size, GFP_KERNEL); if (!new_hashtbl) return -ENOMEM; spin_lock_bh(&bond->mode_lock); bond_info->tx_hashtbl = new_hashtbl; for (i = 0; i < TLB_HASH_TABLE_SIZE; i++) tlb_init_table_entry(&bond_info->tx_hashtbl[i], 0); spin_unlock_bh(&bond->mode_lock); return 0; } /* Must be called only after all slaves have been released */ static void tlb_deinitialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); spin_lock_bh(&bond->mode_lock); 
kfree(bond_info->tx_hashtbl); bond_info->tx_hashtbl = NULL; spin_unlock_bh(&bond->mode_lock); } static long long compute_gap(struct slave *slave) { return (s64) (slave->speed << 20) - /* Convert to Megabit per sec */ (s64) (SLAVE_TLB_INFO(slave).load << 3); /* Bytes to bits */ } static struct slave *tlb_get_least_loaded_slave(struct bonding *bond) { struct slave *slave, *least_loaded; struct list_head *iter; long long max_gap; least_loaded = NULL; max_gap = LLONG_MIN; /* Find the slave with the largest gap */ bond_for_each_slave_rcu(bond, slave, iter) { if (bond_slave_can_tx(slave)) { long long gap = compute_gap(slave); if (max_gap < gap) { least_loaded = slave; max_gap = gap; } } } return least_loaded; } static struct slave *__tlb_choose_channel(struct bonding *bond, u32 hash_index, u32 skb_len) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct tlb_client_info *hash_table; struct slave *assigned_slave; hash_table = bond_info->tx_hashtbl; assigned_slave = hash_table[hash_index].tx_slave; if (!assigned_slave) { assigned_slave = tlb_get_least_loaded_slave(bond); if (assigned_slave) { struct tlb_slave_info *slave_info = &(SLAVE_TLB_INFO(assigned_slave)); u32 next_index = slave_info->head; hash_table[hash_index].tx_slave = assigned_slave; hash_table[hash_index].next = next_index; hash_table[hash_index].prev = TLB_NULL_INDEX; if (next_index != TLB_NULL_INDEX) hash_table[next_index].prev = hash_index; slave_info->head = hash_index; slave_info->load += hash_table[hash_index].load_history; } } if (assigned_slave) hash_table[hash_index].tx_bytes += skb_len; return assigned_slave; } static struct slave *tlb_choose_channel(struct bonding *bond, u32 hash_index, u32 skb_len) { struct slave *tx_slave; /* We don't need to disable softirq here, because * tlb_choose_channel() is only called by bond_alb_xmit() * which already has softirq disabled. 
*/ spin_lock(&bond->mode_lock); tx_slave = __tlb_choose_channel(bond, hash_index, skb_len); spin_unlock(&bond->mode_lock); return tx_slave; } /*********************** rlb specific functions ***************************/ /* when an ARP REPLY is received from a client update its info * in the rx_hashtbl */ static void rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *client_info; u32 hash_index; spin_lock_bh(&bond->mode_lock); hash_index = _simple_hash((u8 *)&(arp->ip_src), sizeof(arp->ip_src)); client_info = &(bond_info->rx_hashtbl[hash_index]); if ((client_info->assigned) && (client_info->ip_src == arp->ip_dst) && (client_info->ip_dst == arp->ip_src) && (!ether_addr_equal_64bits(client_info->mac_dst, arp->mac_src))) { /* update the clients MAC address */ ether_addr_copy(client_info->mac_dst, arp->mac_src); client_info->ntt = 1; bond_info->rx_ntt = 1; } spin_unlock_bh(&bond->mode_lock); } static int rlb_arp_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct arp_pkt *arp, _arp; if (skb->protocol != cpu_to_be16(ETH_P_ARP)) goto out; arp = skb_header_pointer(skb, 0, sizeof(_arp), &_arp); if (!arp) goto out; /* We received an ARP from arp->ip_src. * We might have used this IP address previously (on the bonding host * itself or on a system that is bridged together with the bond). * However, if arp->mac_src is different than what is stored in * rx_hashtbl, some other host is now using the IP and we must prevent * sending out client updates with this IP address and the old MAC * address. * Clean up all hash table entries that have this address as ip_src but * have a different mac_src. 
*/ rlb_purge_src_ip(bond, arp); if (arp->op_code == htons(ARPOP_REPLY)) { /* update rx hash table for this ARP */ rlb_update_entry_from_arp(bond, arp); slave_dbg(bond->dev, slave->dev, "Server received an ARP Reply from client\n"); } out: return RX_HANDLER_ANOTHER; } /* Caller must hold rcu_read_lock() */ static struct slave *__rlb_next_rx_slave(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct slave *before = NULL, *rx_slave = NULL, *slave; struct list_head *iter; bool found = false; bond_for_each_slave_rcu(bond, slave, iter) { if (!bond_slave_can_tx(slave)) continue; if (!found) { if (!before || before->speed < slave->speed) before = slave; } else { if (!rx_slave || rx_slave->speed < slave->speed) rx_slave = slave; } if (slave == bond_info->rx_slave) found = true; } /* we didn't find anything after the current or we have something * better before and up to the current slave */ if (!rx_slave || (before && rx_slave->speed < before->speed)) rx_slave = before; if (rx_slave) bond_info->rx_slave = rx_slave; return rx_slave; } /* Caller must hold RTNL, rcu_read_lock is obtained only to silence checkers */ static struct slave *rlb_next_rx_slave(struct bonding *bond) { struct slave *rx_slave; ASSERT_RTNL(); rcu_read_lock(); rx_slave = __rlb_next_rx_slave(bond); rcu_read_unlock(); return rx_slave; } /* teach the switch the mac of a disabled slave * on the primary for fault tolerance * * Caller must hold RTNL */ static void rlb_teach_disabled_mac_on_primary(struct bonding *bond, const u8 addr[]) { struct slave *curr_active = rtnl_dereference(bond->curr_active_slave); if (!curr_active) return; if (!bond->alb_info.primary_is_promisc) { if (!dev_set_promiscuity(curr_active->dev, 1)) bond->alb_info.primary_is_promisc = 1; else bond->alb_info.primary_is_promisc = 0; } bond->alb_info.rlb_promisc_timeout_counter = 0; alb_send_learning_packets(curr_active, addr, true); } /* slave being removed should not be active at this point * * Caller must 
hold rtnl. */ static void rlb_clear_slave(struct bonding *bond, struct slave *slave) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *rx_hash_table; u32 index, next_index; /* clear slave from rx_hashtbl */ spin_lock_bh(&bond->mode_lock); rx_hash_table = bond_info->rx_hashtbl; index = bond_info->rx_hashtbl_used_head; for (; index != RLB_NULL_INDEX; index = next_index) { next_index = rx_hash_table[index].used_next; if (rx_hash_table[index].slave == slave) { struct slave *assigned_slave = rlb_next_rx_slave(bond); if (assigned_slave) { rx_hash_table[index].slave = assigned_slave; if (is_valid_ether_addr(rx_hash_table[index].mac_dst)) { bond_info->rx_hashtbl[index].ntt = 1; bond_info->rx_ntt = 1; /* A slave has been removed from the * table because it is either disabled * or being released. We must retry the * update to avoid clients from not * being updated & disconnecting when * there is stress */ bond_info->rlb_update_retry_counter = RLB_UPDATE_RETRY; } } else { /* there is no active slave */ rx_hash_table[index].slave = NULL; } } } spin_unlock_bh(&bond->mode_lock); if (slave != rtnl_dereference(bond->curr_active_slave)) rlb_teach_disabled_mac_on_primary(bond, slave->dev->dev_addr); } static void rlb_update_client(struct rlb_client_info *client_info) { int i; if (!client_info->slave || !is_valid_ether_addr(client_info->mac_dst)) return; for (i = 0; i < RLB_ARP_BURST_SIZE; i++) { struct sk_buff *skb; skb = arp_create(ARPOP_REPLY, ETH_P_ARP, client_info->ip_dst, client_info->slave->dev, client_info->ip_src, client_info->mac_dst, client_info->slave->dev->dev_addr, client_info->mac_dst); if (!skb) { slave_err(client_info->slave->bond->dev, client_info->slave->dev, "failed to create an ARP packet\n"); continue; } skb->dev = client_info->slave->dev; if (client_info->vlan_id) { __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), client_info->vlan_id); } arp_xmit(skb); } } /* sends ARP REPLIES that update the clients that need updating */ 
static void rlb_update_rx_clients(struct bonding *bond)
{
	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
	struct rlb_client_info *client_info;
	u32 hash_index;

	spin_lock_bh(&bond->mode_lock);

	/* Walk the list of used rx hash entries. */
	hash_index = bond_info->rx_hashtbl_used_head;
	for (; hash_index != RLB_NULL_INDEX;
	     hash_index = client_info->used_next) {
		client_info = &(bond_info->rx_hashtbl[hash_index]);
		if (client_info->ntt) {
			rlb_update_client(client_info);
			/* Keep ntt set while retries are still pending. */
			if (bond_info->rlb_update_retry_counter == 0)
				client_info->ntt = 0;
		}
	}

	/* do not update the entries again until this counter is zero so that
	 * not to confuse the clients.
	 */
	bond_info->rlb_update_delay_counter = RLB_UPDATE_DELAY;

	spin_unlock_bh(&bond->mode_lock);
}

/* The slave was assigned a new mac address - update the clients
 *
 * Flags every used entry assigned to @slave that has a valid client MAC,
 * and arms the retry counter if at least one entry was flagged.
 */
static void rlb_req_update_slave_clients(struct bonding *bond, struct slave *slave)
{
	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
	struct rlb_client_info *client_info;
	int ntt = 0;
	u32 hash_index;

	spin_lock_bh(&bond->mode_lock);

	hash_index = bond_info->rx_hashtbl_used_head;
	for (; hash_index != RLB_NULL_INDEX;
	     hash_index = client_info->used_next) {
		client_info = &(bond_info->rx_hashtbl[hash_index]);

		if ((client_info->slave == slave) &&
		    is_valid_ether_addr(client_info->mac_dst)) {
			client_info->ntt = 1;
			ntt = 1;
		}
	}

	/* update the team's flag only after the whole iteration */
	if (ntt) {
		bond_info->rx_ntt = 1;
		/* fasten the change */
		bond_info->rlb_update_retry_counter = RLB_UPDATE_RETRY;
	}

	spin_unlock_bh(&bond->mode_lock);
}

/* mark all clients using src_ip to be updated
 *
 * Uses spin_lock() without the _bh suffix, unlike the two functions above.
 */
static void rlb_req_update_subnet_clients(struct bonding *bond, __be32 src_ip)
{
	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
	struct rlb_client_info *client_info;
	u32 hash_index;

	spin_lock(&bond->mode_lock);

	hash_index = bond_info->rx_hashtbl_used_head;
	for (; hash_index != RLB_NULL_INDEX;
	     hash_index = client_info->used_next) {
		client_info = &(bond_info->rx_hashtbl[hash_index]);

		/* A used entry without a slave is reported and skipped. */
		if (!client_info->slave) {
			netdev_err(bond->dev, "found a client with no channel in the client's hash table\n");
			continue;
		}
		/* update all clients using this src_ip, that are not assigned
		 * to the team's address (curr_active_slave) and have a known
		 * unicast mac address.
		 */
		if ((client_info->ip_src == src_ip) &&
		    !ether_addr_equal_64bits(client_info->slave->dev->dev_addr,
					     bond->dev->dev_addr) &&
		    is_valid_ether_addr(client_info->mac_dst)) {
			client_info->ntt = 1;
			bond_info->rx_ntt = 1;
		}
	}

	spin_unlock(&bond->mode_lock);
}

/*
 * Choose the slave for the client addressed by @arp and record the choice
 * in the rx hash bucket selected by hashing arp->ip_dst.
 *
 * If the bucket already holds this exact (ip_src, ip_dst) pair and has a
 * slave, that slave is returned after refreshing the stored MACs. If the
 * bucket belongs to a different client, that client is first moved to the
 * current active slave. Otherwise a slave is picked with
 * __rlb_next_rx_slave() and the bucket is (re)initialised for this client.
 *
 * Returns the chosen slave, or NULL if no slave is available.
 */
static struct slave *rlb_choose_channel(struct sk_buff *skb,
					struct bonding *bond,
					const struct arp_pkt *arp)
{
	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
	struct slave *assigned_slave, *curr_active_slave;
	struct rlb_client_info *client_info;
	u32 hash_index = 0;

	spin_lock(&bond->mode_lock);

	curr_active_slave = rcu_dereference(bond->curr_active_slave);

	hash_index = _simple_hash((u8 *)&arp->ip_dst, sizeof(arp->ip_dst));
	client_info = &(bond_info->rx_hashtbl[hash_index]);

	if (client_info->assigned) {
		if ((client_info->ip_src == arp->ip_src) &&
		    (client_info->ip_dst == arp->ip_dst)) {
			/* the entry is already assigned to this client */
			if (!is_broadcast_ether_addr(arp->mac_dst)) {
				/* update mac address from arp */
				ether_addr_copy(client_info->mac_dst, arp->mac_dst);
			}
			ether_addr_copy(client_info->mac_src, arp->mac_src);

			assigned_slave = client_info->slave;
			if (assigned_slave) {
				spin_unlock(&bond->mode_lock);
				return assigned_slave;
			}
			/* No slave recorded: fall through and assign one below. */
		} else {
			/* the entry is already assigned to some other client,
			 * move the old client to primary (curr_active_slave) so
			 * that the new client can be assigned to this entry.
			 */
			if (curr_active_slave &&
			    client_info->slave != curr_active_slave) {
				client_info->slave = curr_active_slave;
				rlb_update_client(client_info);
			}
		}
	}
	/* assign a new slave */
	assigned_slave = __rlb_next_rx_slave(bond);

	if (assigned_slave) {
		if (!(client_info->assigned &&
		      client_info->ip_src == arp->ip_src)) {
			/* ip_src is going to be updated,
			 * fix the src hash list
			 */
			u32 hash_src = _simple_hash((u8 *)&arp->ip_src,
						    sizeof(arp->ip_src));
			rlb_src_unlink(bond, hash_index);
			rlb_src_link(bond, hash_src, hash_index);
		}

		client_info->ip_src = arp->ip_src;
		client_info->ip_dst = arp->ip_dst;
		/* arp->mac_dst is broadcast for arp requests.
		 * will be updated with clients actual unicast mac address
		 * upon receiving an arp reply.
		 */
		ether_addr_copy(client_info->mac_dst, arp->mac_dst);
		ether_addr_copy(client_info->mac_src, arp->mac_src);
		client_info->slave = assigned_slave;

		/* Only flag the entry for an update once the client MAC is valid. */
		if (is_valid_ether_addr(client_info->mac_dst)) {
			client_info->ntt = 1;
			bond->alb_info.rx_ntt = 1;
		} else {
			client_info->ntt = 0;
		}

		/* A non-zero return from vlan_get_tag() leaves the entry untagged. */
		if (vlan_get_tag(skb, &client_info->vlan_id))
			client_info->vlan_id = 0;

		/* First use of this bucket: push it onto the front of the used list. */
		if (!client_info->assigned) {
			u32 prev_tbl_head = bond_info->rx_hashtbl_used_head;

			bond_info->rx_hashtbl_used_head = hash_index;
			client_info->used_next = prev_tbl_head;
			if (prev_tbl_head != RLB_NULL_INDEX) {
				bond_info->rx_hashtbl[prev_tbl_head].used_prev =
					hash_index;
			}
			client_info->assigned = 1;
		}
	}

	spin_unlock(&bond->mode_lock);

	return assigned_slave;
}

/* chooses (and returns) transmit channel for arp reply
 * does not choose channel for other arp types since they are
 * sent on the curr_active_slave
 *
 * Returns NULL when the ARP header cannot be pulled, when the sender MAC
 * does not belong to the bond, when arp->ip_src resolves to a bridge master
 * device, or when no channel could be chosen.
 */
static struct slave *rlb_arp_xmit(struct sk_buff *skb, struct bonding *bond)
{
	struct slave *tx_slave = NULL;
	struct net_device *dev;
	struct arp_pkt *arp;

	if (!pskb_network_may_pull(skb, sizeof(*arp)))
		return NULL;
	arp = (struct arp_pkt *)skb_network_header(skb);

	/* Don't modify or load balance ARPs that do not originate
	 * from the bond itself or a VLAN directly above the bond.
	 */
	if (!bond_slave_has_mac_rcu(bond, arp->mac_src))
		return NULL;

	/* The device reference from ip_dev_find() is dropped on both paths. */
	dev = ip_dev_find(dev_net(bond->dev), arp->ip_src);
	if (dev) {
		if (netif_is_any_bridge_master(dev)) {
			dev_put(dev);
			return NULL;
		}
		dev_put(dev);
	}

	if (arp->op_code == htons(ARPOP_REPLY)) {
		/* the arp must be sent on the selected rx channel */
		tx_slave = rlb_choose_channel(skb, bond, arp);
		/* Rewrite the sender MAC in the packet to the chosen slave's address. */
		if (tx_slave)
			bond_hw_addr_copy(arp->mac_src, tx_slave->dev->dev_addr,
					  tx_slave->dev->addr_len);
		netdev_dbg(bond->dev, "(slave %s): Server sent ARP Reply packet\n",
			   tx_slave ? tx_slave->dev->name : "NULL");
	} else if (arp->op_code == htons(ARPOP_REQUEST)) {
		/* Create an entry in the rx_hashtbl for this client as a
		 * place holder.
		 * When the arp reply is received the entry will be updated
		 * with the correct unicast address of the client.
		 */
		tx_slave = rlb_choose_channel(skb, bond, arp);

		/* The ARP reply packets must be delayed so that
		 * they can cancel out the influence of the ARP request.
		 */
		bond->alb_info.rlb_update_delay_counter = RLB_UPDATE_DELAY;

		/* arp requests are broadcast and are sent on the primary
		 * the arp request will collapse all clients on the subnet to
		 * the primary slave. We must register these clients to be
		 * updated with their assigned mac.
		 */
		rlb_req_update_subnet_clients(bond, arp->ip_src);
		netdev_dbg(bond->dev, "(slave %s): Server sent ARP Request packet\n",
			   tx_slave ? tx_slave->dev->name : "NULL");
	}

	return tx_slave;
}

/*
 * Re-run slave selection for every used rx hash entry. Entries whose slave
 * changes and whose client MAC is non-zero are flagged for an update.
 */
static void rlb_rebalance(struct bonding *bond)
{
	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
	struct slave *assigned_slave;
	struct rlb_client_info *client_info;
	int ntt;
	u32 hash_index;

	spin_lock_bh(&bond->mode_lock);

	ntt = 0;
	hash_index = bond_info->rx_hashtbl_used_head;
	for (; hash_index != RLB_NULL_INDEX;
	     hash_index = client_info->used_next) {
		client_info = &(bond_info->rx_hashtbl[hash_index]);
		assigned_slave = __rlb_next_rx_slave(bond);
		if (assigned_slave && (client_info->slave != assigned_slave)) {
			client_info->slave = assigned_slave;
			if (!is_zero_ether_addr(client_info->mac_dst)) {
				client_info->ntt = 1;
				ntt = 1;
			}
		}
	}

	/* update the team's flag only after the whole iteration */
	if (ntt)
		bond_info->rx_ntt = 1;
	spin_unlock_bh(&bond->mode_lock);
}

/* Caller must hold mode_lock */
/* Reset the fields tied to the used list: links, assigned flag, slave, vlan id. */
static void rlb_init_table_entry_dst(struct rlb_client_info *entry)
{
	entry->used_next = RLB_NULL_INDEX;
	entry->used_prev = RLB_NULL_INDEX;
	entry->assigned = 0;
	entry->slave = NULL;
	entry->vlan_id = 0;
}

/* Reset the three src-list link fields of an entry. */
static void rlb_init_table_entry_src(struct rlb_client_info *entry)
{
	entry->src_first = RLB_NULL_INDEX;
	entry->src_prev = RLB_NULL_INDEX;
	entry->src_next = RLB_NULL_INDEX;
}

/* Zero the whole entry, then set both groups of link fields to their null index. */
static void rlb_init_table_entry(struct rlb_client_info *entry)
{
	memset(entry, 0, sizeof(struct rlb_client_info));
	rlb_init_table_entry_dst(entry);
	rlb_init_table_entry_src(entry);
}

/*
 * Unlink entry @index from the doubly linked used list, moving the list
 * head forward if @index was the head. The entry's own link fields are left
 * untouched.
 */
static void rlb_delete_table_entry_dst(struct bonding *bond, u32 index)
{
	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
	u32 next_index = bond_info->rx_hashtbl[index].used_next;
	u32 prev_index = bond_info->rx_hashtbl[index].used_prev;

	if (index == bond_info->rx_hashtbl_used_head)
		bond_info->rx_hashtbl_used_head = next_index;
	if (prev_index != RLB_NULL_INDEX)
		bond_info->rx_hashtbl[prev_index].used_next = next_index;
	if (next_index != RLB_NULL_INDEX)
		bond_info->rx_hashtbl[next_index].used_prev = prev_index;
}

/* unlink a rlb hash table entry from the src list */
static void rlb_src_unlink(struct bonding *bond, u32 index) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 next_index = bond_info->rx_hashtbl[index].src_next; u32 prev_index = bond_info->rx_hashtbl[index].src_prev; bond_info->rx_hashtbl[index].src_next = RLB_NULL_INDEX; bond_info->rx_hashtbl[index].src_prev = RLB_NULL_INDEX; if (next_index != RLB_NULL_INDEX) bond_info->rx_hashtbl[next_index].src_prev = prev_index; if (prev_index == RLB_NULL_INDEX) return; /* is prev_index pointing to the head of this list? */ if (bond_info->rx_hashtbl[prev_index].src_first == index) bond_info->rx_hashtbl[prev_index].src_first = next_index; else bond_info->rx_hashtbl[prev_index].src_next = next_index; } static void rlb_delete_table_entry(struct bonding *bond, u32 index) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *entry = &(bond_info->rx_hashtbl[index]); rlb_delete_table_entry_dst(bond, index); rlb_init_table_entry_dst(entry); rlb_src_unlink(bond, index); } /* add the rx_hashtbl[ip_dst_hash] entry to the list * of entries with identical ip_src_hash */ static void rlb_src_link(struct bonding *bond, u32 ip_src_hash, u32 ip_dst_hash) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 next; bond_info->rx_hashtbl[ip_dst_hash].src_prev = ip_src_hash; next = bond_info->rx_hashtbl[ip_src_hash].src_first; bond_info->rx_hashtbl[ip_dst_hash].src_next = next; if (next != RLB_NULL_INDEX) bond_info->rx_hashtbl[next].src_prev = ip_dst_hash; bond_info->rx_hashtbl[ip_src_hash].src_first = ip_dst_hash; } /* deletes all rx_hashtbl entries with arp->ip_src if their mac_src does * not match arp->mac_src */ static void rlb_purge_src_ip(struct bonding *bond, struct arp_pkt *arp) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 ip_src_hash = _simple_hash((u8 *)&(arp->ip_src), sizeof(arp->ip_src)); u32 index; spin_lock_bh(&bond->mode_lock); index = bond_info->rx_hashtbl[ip_src_hash].src_first; while (index != RLB_NULL_INDEX) 
{ struct rlb_client_info *entry = &(bond_info->rx_hashtbl[index]); u32 next_index = entry->src_next; if (entry->ip_src == arp->ip_src && !ether_addr_equal_64bits(arp->mac_src, entry->mac_src)) rlb_delete_table_entry(bond, index); index = next_index; } spin_unlock_bh(&bond->mode_lock); } static int rlb_initialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *new_hashtbl; int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info); int i; new_hashtbl = kmalloc(size, GFP_KERNEL); if (!new_hashtbl) return -1; spin_lock_bh(&bond->mode_lock); bond_info->rx_hashtbl = new_hashtbl; bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX; for (i = 0; i < RLB_HASH_TABLE_SIZE; i++) rlb_init_table_entry(bond_info->rx_hashtbl + i); spin_unlock_bh(&bond->mode_lock); /* register to receive ARPs */ bond->recv_probe = rlb_arp_recv; return 0; } static void rlb_deinitialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); spin_lock_bh(&bond->mode_lock); kfree(bond_info->rx_hashtbl); bond_info->rx_hashtbl = NULL; bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX; spin_unlock_bh(&bond->mode_lock); } static void rlb_clear_vlan(struct bonding *bond, unsigned short vlan_id) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 curr_index; spin_lock_bh(&bond->mode_lock); curr_index = bond_info->rx_hashtbl_used_head; while (curr_index != RLB_NULL_INDEX) { struct rlb_client_info *curr = &(bond_info->rx_hashtbl[curr_index]); u32 next_index = bond_info->rx_hashtbl[curr_index].used_next; if (curr->vlan_id == vlan_id) rlb_delete_table_entry(bond, curr_index); curr_index = next_index; } spin_unlock_bh(&bond->mode_lock); } /*********************** tlb/rlb shared functions *********************/ static void alb_send_lp_vid(struct slave *slave, const u8 mac_addr[], __be16 vlan_proto, u16 vid) { struct learning_pkt pkt; struct sk_buff *skb; int size = sizeof(struct learning_pkt); memset(&pkt, 0, size); 
ether_addr_copy(pkt.mac_dst, mac_addr); ether_addr_copy(pkt.mac_src, mac_addr); pkt.type = cpu_to_be16(ETH_P_LOOPBACK); skb = dev_alloc_skb(size); if (!skb) return; skb_put_data(skb, &pkt, size); skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = pkt.type; skb->priority = TC_PRIO_CONTROL; skb->dev = slave->dev; slave_dbg(slave->bond->dev, slave->dev, "Send learning packet: mac %pM vlan %d\n", mac_addr, vid); if (vid) __vlan_hwaccel_put_tag(skb, vlan_proto, vid); dev_queue_xmit(skb); } struct alb_walk_data { struct bonding *bond; struct slave *slave; const u8 *mac_addr; bool strict_match; }; static int alb_upper_dev_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct alb_walk_data *data = (struct alb_walk_data *)priv->data; bool strict_match = data->strict_match; const u8 *mac_addr = data->mac_addr; struct bonding *bond = data->bond; struct slave *slave = data->slave; struct bond_vlan_tag *tags; if (is_vlan_dev(upper) && bond->dev->lower_level == upper->lower_level - 1) { if (upper->addr_assign_type == NET_ADDR_STOLEN) { alb_send_lp_vid(slave, mac_addr, vlan_dev_vlan_proto(upper), vlan_dev_vlan_id(upper)); } else { alb_send_lp_vid(slave, upper->dev_addr, vlan_dev_vlan_proto(upper), vlan_dev_vlan_id(upper)); } } /* If this is a macvlan device, then only send updates * when strict_match is turned off. 
*/ if (netif_is_macvlan(upper) && !strict_match) { tags = bond_verify_device_path(bond->dev, upper, 0); if (IS_ERR_OR_NULL(tags)) return -ENOMEM; alb_send_lp_vid(slave, upper->dev_addr, tags[0].vlan_proto, tags[0].vlan_id); kfree(tags); } return 0; } static void alb_send_learning_packets(struct slave *slave, const u8 mac_addr[], bool strict_match) { struct bonding *bond = bond_get_bond_by_slave(slave); struct netdev_nested_priv priv; struct alb_walk_data data = { .strict_match = strict_match, .mac_addr = mac_addr, .slave = slave, .bond = bond, }; priv.data = (void *)&data; /* send untagged */ alb_send_lp_vid(slave, mac_addr, 0, 0); /* loop through all devices and see if we need to send a packet * for that device. */ rcu_read_lock(); netdev_walk_all_upper_dev_rcu(bond->dev, alb_upper_dev_walk, &priv); rcu_read_unlock(); } static int alb_set_slave_mac_addr(struct slave *slave, const u8 addr[], unsigned int len) { struct net_device *dev = slave->dev; struct sockaddr_storage ss; if (BOND_MODE(slave->bond) == BOND_MODE_TLB) { __dev_addr_set(dev, addr, len); return 0; } /* for rlb each slave must have a unique hw mac addresses so that * each slave will receive packets destined to a different mac */ memcpy(ss.__data, addr, len); ss.ss_family = dev->type; if (dev_set_mac_address(dev, &ss, NULL)) { slave_err(slave->bond->dev, dev, "dev_set_mac_address on slave failed! ALB mode requires that the base driver support setting the hw address also when the network device's interface is open\n"); return -EOPNOTSUPP; } return 0; } /* Swap MAC addresses between two slaves. * * Called with RTNL held, and no other locks. 
*/ static void alb_swap_mac_addr(struct slave *slave1, struct slave *slave2) { u8 tmp_mac_addr[MAX_ADDR_LEN]; bond_hw_addr_copy(tmp_mac_addr, slave1->dev->dev_addr, slave1->dev->addr_len); alb_set_slave_mac_addr(slave1, slave2->dev->dev_addr, slave2->dev->addr_len); alb_set_slave_mac_addr(slave2, tmp_mac_addr, slave1->dev->addr_len); } /* Send learning packets after MAC address swap. * * Called with RTNL and no other locks */ static void alb_fasten_mac_swap(struct bonding *bond, struct slave *slave1, struct slave *slave2) { int slaves_state_differ = (bond_slave_can_tx(slave1) != bond_slave_can_tx(slave2)); struct slave *disabled_slave = NULL; ASSERT_RTNL(); /* fasten the change in the switch */ if (bond_slave_can_tx(slave1)) { alb_send_learning_packets(slave1, slave1->dev->dev_addr, false); if (bond->alb_info.rlb_enabled) { /* inform the clients that the mac address * has changed */ rlb_req_update_slave_clients(bond, slave1); } } else { disabled_slave = slave1; } if (bond_slave_can_tx(slave2)) { alb_send_learning_packets(slave2, slave2->dev->dev_addr, false); if (bond->alb_info.rlb_enabled) { /* inform the clients that the mac address * has changed */ rlb_req_update_slave_clients(bond, slave2); } } else { disabled_slave = slave2; } if (bond->alb_info.rlb_enabled && slaves_state_differ) { /* A disabled slave was assigned an active mac addr */ rlb_teach_disabled_mac_on_primary(bond, disabled_slave->dev->dev_addr); } } /** * alb_change_hw_addr_on_detach * @bond: bonding we're working on * @slave: the slave that was just detached * * We assume that @slave was already detached from the slave list. * * If @slave's permanent hw address is different both from its current * address and from @bond's address, then somewhere in the bond there's * a slave that has @slave's permanet address as its current address. * We'll make sure that slave no longer uses @slave's permanent address. 
*
 * Caller must hold RTNL and no other locks
 */
static void alb_change_hw_addr_on_detach(struct bonding *bond, struct slave *slave)
{
	int perm_curr_diff;
	int perm_bond_diff;
	struct slave *found_slave;

	/* non-zero when the permanent address differs from the current one */
	perm_curr_diff = !ether_addr_equal_64bits(slave->perm_hwaddr,
						  slave->dev->dev_addr);
	/* non-zero when the permanent address differs from the bond's */
	perm_bond_diff = !ether_addr_equal_64bits(slave->perm_hwaddr,
						  bond->dev->dev_addr);

	if (perm_curr_diff && perm_bond_diff) {
		/* look for the slave currently using our permanent address */
		found_slave = bond_slave_has_mac(bond, slave->perm_hwaddr);

		if (found_slave) {
			alb_swap_mac_addr(slave, found_slave);
			alb_fasten_mac_swap(bond, slave, found_slave);
		}
	}
}

/**
 * alb_handle_addr_collision_on_attach
 * @bond: bonding we're working on
 * @slave: the slave that was just attached
 *
 * checks uniqueness of slave's mac address and handles the case the
 * new slave uses the bonds mac address.
 *
 * If the permanent hw address of @slave is @bond's hw address, we need to
 * find a different hw address to give @slave, that isn't in use by any other
 * slave in the bond. This address must be, of course, one of the permanent
 * addresses of the other slaves.
 *
 * We go over the slave list, and for each slave there we compare its
 * permanent hw address with the current address of all the other slaves.
 * If no match was found, then we've found a slave with a permanent address
 * that isn't used by any other slave in the bond, so we can assign it to
 * @slave.
 *
 * assumption: this function is called before @slave is attached to the
 *	       bond slave list.
 *
 * Return: 0 when no collision exists or a spare address was assigned,
 * -EFAULT when some slave holds the bond's address but no spare address
 * could be found.
 */
static int alb_handle_addr_collision_on_attach(struct bonding *bond, struct slave *slave)
{
	struct slave *has_bond_addr = rcu_access_pointer(bond->curr_active_slave);
	struct slave *tmp_slave1, *free_mac_slave = NULL;
	struct list_head *iter;

	if (!bond_has_slaves(bond)) {
		/* this is the first slave */
		return 0;
	}

	/* if slave's mac address differs from bond's mac address
	 * check uniqueness of slave's mac address against the other
	 * slaves in the bond.
	 */
	if (!ether_addr_equal_64bits(slave->perm_hwaddr, bond->dev->dev_addr)) {
		if (!bond_slave_has_mac(bond, slave->dev->dev_addr))
			return 0;

		/* Try setting slave mac to bond address and fall-through
		 * to code handling that situation below...
		 *
		 * NOTE(review): the return value of alb_set_slave_mac_addr()
		 * is not checked here.
		 */
		alb_set_slave_mac_addr(slave, bond->dev->dev_addr,
				       bond->dev->addr_len);
	}

	/* The slave's address is equal to the address of the bond.
	 * Search for a spare address in the bond for this slave.
	 */
	bond_for_each_slave(bond, tmp_slave1, iter) {
		if (!bond_slave_has_mac(bond, tmp_slave1->perm_hwaddr)) {
			/* no slave has tmp_slave1's perm addr
			 * as its curr addr
			 */
			free_mac_slave = tmp_slave1;
			break;
		}

		/* remember the first slave seen using the bond's address,
		 * unless the active slave was already recorded above
		 */
		if (!has_bond_addr) {
			if (ether_addr_equal_64bits(tmp_slave1->dev->dev_addr,
						    bond->dev->dev_addr)) {

				has_bond_addr = tmp_slave1;
			}
		}
	}

	if (free_mac_slave) {
		alb_set_slave_mac_addr(slave, free_mac_slave->perm_hwaddr,
				       free_mac_slave->dev->addr_len);

		slave_warn(bond->dev, slave->dev, "the slave hw address is in use by the bond; giving it the hw address of %s\n",
			   free_mac_slave->dev->name);

	} else if (has_bond_addr) {
		slave_err(bond->dev, slave->dev, "the slave hw address is in use by the bond; couldn't find a slave with a free hw address to give it (this should not have happened)\n");
		return -EFAULT;
	}

	return 0;
}

/**
 * alb_set_mac_address
 * @bond: bonding we're working on
 * @addr: MAC address to set
 *
 * In TLB mode all slaves are configured to the bond's hw address, but set
 * their dev_addr field to different addresses (based on their permanent hw
 * addresses).
 *
 * For each slave, this function sets the interface to the new address and then
 * changes its dev_addr field to its previous value.
 *
 * Unwinding assumes bond's mac address has not yet changed.
*/ static int alb_set_mac_address(struct bonding *bond, void *addr) { struct slave *slave, *rollback_slave; struct list_head *iter; struct sockaddr_storage ss; char tmp_addr[MAX_ADDR_LEN]; int res; if (bond->alb_info.rlb_enabled) return 0; bond_for_each_slave(bond, slave, iter) { /* save net_device's current hw address */ bond_hw_addr_copy(tmp_addr, slave->dev->dev_addr, slave->dev->addr_len); res = dev_set_mac_address(slave->dev, addr, NULL); /* restore net_device's hw address */ dev_addr_set(slave->dev, tmp_addr); if (res) goto unwind; } return 0; unwind: memcpy(ss.__data, bond->dev->dev_addr, bond->dev->addr_len); ss.ss_family = bond->dev->type; /* unwind from head to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { if (rollback_slave == slave) break; bond_hw_addr_copy(tmp_addr, rollback_slave->dev->dev_addr, rollback_slave->dev->addr_len); dev_set_mac_address(rollback_slave->dev, &ss, NULL); dev_addr_set(rollback_slave->dev, tmp_addr); } return res; } /* determine if the packet is NA or NS */ static bool alb_determine_nd(struct sk_buff *skb, struct bonding *bond) { struct ipv6hdr *ip6hdr; struct icmp6hdr *hdr; if (!pskb_network_may_pull(skb, sizeof(*ip6hdr))) return true; ip6hdr = ipv6_hdr(skb); if (ip6hdr->nexthdr != IPPROTO_ICMPV6) return false; if (!pskb_network_may_pull(skb, sizeof(*ip6hdr) + sizeof(*hdr))) return true; hdr = icmp6_hdr(skb); return hdr->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT || hdr->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION; } /************************ exported alb functions ************************/ int bond_alb_initialize(struct bonding *bond, int rlb_enabled) { int res; res = tlb_initialize(bond); if (res) return res; if (rlb_enabled) { res = rlb_initialize(bond); if (res) { tlb_deinitialize(bond); return res; } bond->alb_info.rlb_enabled = 1; } else { bond->alb_info.rlb_enabled = 0; } return 0; } void bond_alb_deinitialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); 
tlb_deinitialize(bond); if (bond_info->rlb_enabled) rlb_deinitialize(bond); } static netdev_tx_t bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond, struct slave *tx_slave) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct ethhdr *eth_data = eth_hdr(skb); if (!tx_slave) { /* unbalanced or unassigned, send through primary */ tx_slave = rcu_dereference(bond->curr_active_slave); if (bond->params.tlb_dynamic_lb) bond_info->unbalanced_load += skb->len; } if (tx_slave && bond_slave_can_tx(tx_slave)) { if (tx_slave != rcu_access_pointer(bond->curr_active_slave)) { ether_addr_copy(eth_data->h_source, tx_slave->dev->dev_addr); } return bond_dev_queue_xmit(bond, skb, tx_slave->dev); } if (tx_slave && bond->params.tlb_dynamic_lb) { spin_lock(&bond->mode_lock); __tlb_clear_slave(bond, tx_slave, 0); spin_unlock(&bond->mode_lock); } /* no suitable interface, frame not sent */ return bond_tx_drop(bond->dev, skb); } struct slave *bond_xmit_tlb_slave_get(struct bonding *bond, struct sk_buff *skb) { struct slave *tx_slave = NULL; struct ethhdr *eth_data; u32 hash_index; skb_reset_mac_header(skb); eth_data = eth_hdr(skb); /* Do not TX balance any multicast or broadcast */ if (!is_multicast_ether_addr(eth_data->h_dest)) { switch (skb->protocol) { case htons(ETH_P_IPV6): if (alb_determine_nd(skb, bond)) break; fallthrough; case htons(ETH_P_IP): hash_index = bond_xmit_hash(bond, skb); if (bond->params.tlb_dynamic_lb) { tx_slave = tlb_choose_channel(bond, hash_index & 0xFF, skb->len); } else { struct bond_up_slave *slaves; unsigned int count; slaves = rcu_dereference(bond->usable_slaves); count = slaves ? 
READ_ONCE(slaves->count) : 0; if (likely(count)) tx_slave = slaves->arr[hash_index % count]; } break; } } return tx_slave; } netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *tx_slave; tx_slave = bond_xmit_tlb_slave_get(bond, skb); return bond_do_alb_xmit(skb, bond, tx_slave); } struct slave *bond_xmit_alb_slave_get(struct bonding *bond, struct sk_buff *skb) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); static const __be32 ip_bcast = htonl(0xffffffff); struct slave *tx_slave = NULL; const u8 *hash_start = NULL; bool do_tx_balance = true; struct ethhdr *eth_data; u32 hash_index = 0; int hash_size = 0; skb_reset_mac_header(skb); eth_data = eth_hdr(skb); switch (ntohs(skb->protocol)) { case ETH_P_IP: { const struct iphdr *iph; if (is_broadcast_ether_addr(eth_data->h_dest) || !pskb_network_may_pull(skb, sizeof(*iph))) { do_tx_balance = false; break; } iph = ip_hdr(skb); if (iph->daddr == ip_bcast || iph->protocol == IPPROTO_IGMP) { do_tx_balance = false; break; } hash_start = (char *)&(iph->daddr); hash_size = sizeof(iph->daddr); break; } case ETH_P_IPV6: { const struct ipv6hdr *ip6hdr; /* IPv6 doesn't really use broadcast mac address, but leave * that here just in case. */ if (is_broadcast_ether_addr(eth_data->h_dest)) { do_tx_balance = false; break; } /* IPv6 uses all-nodes multicast as an equivalent to * broadcasts in IPv4. */ if (ether_addr_equal_64bits(eth_data->h_dest, mac_v6_allmcast)) { do_tx_balance = false; break; } if (alb_determine_nd(skb, bond)) { do_tx_balance = false; break; } /* The IPv6 header is pulled by alb_determine_nd */ /* Additionally, DAD probes should not be tx-balanced as that * will lead to false positives for duplicate addresses and * prevent address configuration from working. 
*/ ip6hdr = ipv6_hdr(skb); if (ipv6_addr_any(&ip6hdr->saddr)) { do_tx_balance = false; break; } hash_start = (char *)&ip6hdr->daddr; hash_size = sizeof(ip6hdr->daddr); break; } case ETH_P_ARP: do_tx_balance = false; if (bond_info->rlb_enabled) tx_slave = rlb_arp_xmit(skb, bond); break; default: do_tx_balance = false; break; } if (do_tx_balance) { if (bond->params.tlb_dynamic_lb) { hash_index = _simple_hash(hash_start, hash_size); tx_slave = tlb_choose_channel(bond, hash_index, skb->len); } else { /* * do_tx_balance means we are free to select the tx_slave * So we do exactly what tlb would do for hash selection */ struct bond_up_slave *slaves; unsigned int count; slaves = rcu_dereference(bond->usable_slaves); count = slaves ? READ_ONCE(slaves->count) : 0; if (likely(count)) tx_slave = slaves->arr[bond_xmit_hash(bond, skb) % count]; } } return tx_slave; } netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *tx_slave = NULL; tx_slave = bond_xmit_alb_slave_get(bond, skb); return bond_do_alb_xmit(skb, bond, tx_slave); } void bond_alb_monitor(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, alb_work.work); struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct list_head *iter; struct slave *slave; if (!bond_has_slaves(bond)) { atomic_set(&bond_info->tx_rebalance_counter, 0); bond_info->lp_counter = 0; goto re_arm; } rcu_read_lock(); atomic_inc(&bond_info->tx_rebalance_counter); bond_info->lp_counter++; /* send learning packets */ if (bond_info->lp_counter >= BOND_ALB_LP_TICKS(bond)) { bool strict_match; bond_for_each_slave_rcu(bond, slave, iter) { /* If updating current_active, use all currently * user mac addresses (!strict_match). Otherwise, only * use mac of the slave device. * In RLB mode, we always use strict matches. 
*/ strict_match = (slave != rcu_access_pointer(bond->curr_active_slave) || bond_info->rlb_enabled); alb_send_learning_packets(slave, slave->dev->dev_addr, strict_match); } bond_info->lp_counter = 0; } /* rebalance tx traffic */ if (atomic_read(&bond_info->tx_rebalance_counter) >= BOND_TLB_REBALANCE_TICKS) { bond_for_each_slave_rcu(bond, slave, iter) { tlb_clear_slave(bond, slave, 1); if (slave == rcu_access_pointer(bond->curr_active_slave)) { SLAVE_TLB_INFO(slave).load = bond_info->unbalanced_load / BOND_TLB_REBALANCE_INTERVAL; bond_info->unbalanced_load = 0; } } atomic_set(&bond_info->tx_rebalance_counter, 0); } if (bond_info->rlb_enabled) { if (bond_info->primary_is_promisc && (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) { /* dev_set_promiscuity requires rtnl and * nothing else. Avoid race with bond_close. */ rcu_read_unlock(); if (!rtnl_trylock()) goto re_arm; bond_info->rlb_promisc_timeout_counter = 0; /* If the primary was set to promiscuous mode * because a slave was disabled then * it can now leave promiscuous mode. 
*/ dev_set_promiscuity(rtnl_dereference(bond->curr_active_slave)->dev, -1); bond_info->primary_is_promisc = 0; rtnl_unlock(); rcu_read_lock(); } if (bond_info->rlb_rebalance) { bond_info->rlb_rebalance = 0; rlb_rebalance(bond); } /* check if clients need updating */ if (bond_info->rx_ntt) { if (bond_info->rlb_update_delay_counter) { --bond_info->rlb_update_delay_counter; } else { rlb_update_rx_clients(bond); if (bond_info->rlb_update_retry_counter) --bond_info->rlb_update_retry_counter; else bond_info->rx_ntt = 0; } } } rcu_read_unlock(); re_arm: queue_delayed_work(bond->wq, &bond->alb_work, alb_delta_in_ticks); } /* assumption: called before the slave is attached to the bond * and not locked by the bond lock */ int bond_alb_init_slave(struct bonding *bond, struct slave *slave) { int res; res = alb_set_slave_mac_addr(slave, slave->perm_hwaddr, slave->dev->addr_len); if (res) return res; res = alb_handle_addr_collision_on_attach(bond, slave); if (res) return res; tlb_init_slave(slave); /* order a rebalance ASAP */ atomic_set(&bond->alb_info.tx_rebalance_counter, BOND_TLB_REBALANCE_TICKS); if (bond->alb_info.rlb_enabled) bond->alb_info.rlb_rebalance = 1; return 0; } /* Remove slave from tlb and rlb hash tables, and fix up MAC addresses * if necessary. 
* * Caller must hold RTNL and no other locks */ void bond_alb_deinit_slave(struct bonding *bond, struct slave *slave) { if (bond_has_slaves(bond)) alb_change_hw_addr_on_detach(bond, slave); tlb_clear_slave(bond, slave, 0); if (bond->alb_info.rlb_enabled) { bond->alb_info.rx_slave = NULL; rlb_clear_slave(bond, slave); } } void bond_alb_handle_link_change(struct bonding *bond, struct slave *slave, char link) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); if (link == BOND_LINK_DOWN) { tlb_clear_slave(bond, slave, 0); if (bond->alb_info.rlb_enabled) rlb_clear_slave(bond, slave); } else if (link == BOND_LINK_UP) { /* order a rebalance ASAP */ atomic_set(&bond_info->tx_rebalance_counter, BOND_TLB_REBALANCE_TICKS); if (bond->alb_info.rlb_enabled) { bond->alb_info.rlb_rebalance = 1; /* If the updelay module parameter is smaller than the * forwarding delay of the switch the rebalance will * not work because the rebalance arp replies will * not be forwarded to the clients.. */ } } if (bond_is_nondyn_tlb(bond)) { if (bond_update_slave_arr(bond, NULL)) pr_err("Failed to build slave-array for TLB mode.\n"); } } /** * bond_alb_handle_active_change - assign new curr_active_slave * @bond: our bonding struct * @new_slave: new slave to assign * * Set the bond->curr_active_slave to @new_slave and handle * mac address swapping and promiscuity changes as needed. 
*
 * Caller must hold RTNL
 */
void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave)
{
	struct slave *swap_slave;
	struct slave *curr_active;

	curr_active = rtnl_dereference(bond->curr_active_slave);
	if (curr_active == new_slave)
		return;

	/* the outgoing active slave leaves promiscuous mode, if it was
	 * put there, and the related timeout counter is reset
	 */
	if (curr_active && bond->alb_info.primary_is_promisc) {
		dev_set_promiscuity(curr_active->dev, -1);
		bond->alb_info.primary_is_promisc = 0;
		bond->alb_info.rlb_promisc_timeout_counter = 0;
	}

	swap_slave = curr_active;
	rcu_assign_pointer(bond->curr_active_slave, new_slave);

	if (!new_slave || !bond_has_slaves(bond))
		return;

	/* set the new curr_active_slave to the bonds mac address
	 * i.e. swap mac addresses of old curr_active_slave and new curr_active_slave
	 */
	if (!swap_slave)
		swap_slave = bond_slave_has_mac(bond, bond->dev->dev_addr);

	/* Arrange for swap_slave and new_slave to temporarily be
	 * ignored so we can mess with their MAC addresses without
	 * fear of interference from transmit activity.
	 */
	if (swap_slave)
		tlb_clear_slave(bond, swap_slave, 1);
	tlb_clear_slave(bond, new_slave, 1);

	/* in TLB mode, the slave might flip down/up with the old dev_addr,
	 * and thus filter bond->dev_addr's packets, so force bond's mac
	 */
	if (BOND_MODE(bond) == BOND_MODE_TLB) {
		struct sockaddr_storage ss;
		u8 tmp_addr[MAX_ADDR_LEN];

		/* remember the slave's own address so it can be put back */
		bond_hw_addr_copy(tmp_addr, new_slave->dev->dev_addr,
				  new_slave->dev->addr_len);

		bond_hw_addr_copy(ss.__data, bond->dev->dev_addr,
				  bond->dev->addr_len);
		ss.ss_family = bond->dev->type;
		/* we don't care if it can't change its mac, best effort */
		dev_set_mac_address(new_slave->dev, &ss, NULL);

		dev_addr_set(new_slave->dev, tmp_addr);
	}

	/* curr_active_slave must be set before calling alb_swap_mac_addr */
	if (swap_slave) {
		/* swap mac address */
		alb_swap_mac_addr(swap_slave, new_slave);
		alb_fasten_mac_swap(bond, swap_slave, new_slave);
	} else {
		/* set the new_slave to the bond mac address */
		alb_set_slave_mac_addr(new_slave, bond->dev->dev_addr,
				       bond->dev->addr_len);
		alb_send_learning_packets(new_slave, bond->dev->dev_addr,
					  false);
	}
}

/* Change the bond's MAC address to the one carried in @addr.
 *
 * Returns -EADDRNOTAVAIL if the address is not a valid Ethernet address,
 * the error from alb_set_mac_address() if that fails, and 0 otherwise.
 *
 * Called with RTNL
 */
int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct sockaddr_storage *ss = addr;
	struct slave *curr_active;
	struct slave *swap_slave;
	int res;

	if (!is_valid_ether_addr(ss->__data))
		return -EADDRNOTAVAIL;

	res = alb_set_mac_address(bond, addr);
	if (res)
		return res;

	dev_addr_set(bond_dev, ss->__data);

	/* If there is no curr_active_slave there is nothing else to do.
	 * Otherwise we'll need to pass the new address to it and handle
	 * duplications.
	 */
	curr_active = rtnl_dereference(bond->curr_active_slave);
	if (!curr_active)
		return 0;

	/* is some slave already using the address the bond just took? */
	swap_slave = bond_slave_has_mac(bond, bond_dev->dev_addr);

	if (swap_slave) {
		alb_swap_mac_addr(swap_slave, curr_active);
		alb_fasten_mac_swap(bond, swap_slave, curr_active);
	} else {
		alb_set_slave_mac_addr(curr_active, bond_dev->dev_addr,
				       bond_dev->addr_len);

		alb_send_learning_packets(curr_active,
					  bond_dev->dev_addr, false);
		if (bond->alb_info.rlb_enabled) {
			/* inform clients mac address has changed */
			rlb_req_update_slave_clients(bond, curr_active);
		}
	}

	return 0;
}

/* Drop RX hash table entries for @vlan_id; a no-op unless RLB is enabled. */
void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
{
	if (bond->alb_info.rlb_enabled)
		rlb_clear_vlan(bond, vlan_id);
}
|
| 5 161 162 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 | /* * Copyright (C) 2011-2013 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef DRM_RECT_H
#define DRM_RECT_H

#include <linux/types.h>

/**
 * DOC: rect utils
 *
 * Utility functions to help manage rectangular areas for
 * clipping, scaling, etc. calculations.
 */

/**
 * struct drm_rect - two dimensional rectangle
 * @x1: horizontal starting coordinate (inclusive)
 * @y1: vertical starting coordinate (inclusive)
 * @x2: horizontal ending coordinate (exclusive)
 * @y2: vertical ending coordinate (exclusive)
 *
 * Note that this must match the layout of struct drm_mode_rect or the damage
 * helpers like drm_atomic_helper_damage_iter_init() break.
 */
struct drm_rect {
	int x1, y1, x2, y2;
};

/**
 * DRM_RECT_INIT - initialize a rectangle from x/y/w/h
 * @x: x coordinate
 * @y: y coordinate
 * @w: width
 * @h: height
 *
 * Note that @x and @y each appear twice in the expansion, so arguments
 * with side effects are evaluated twice.
 *
 * RETURNS:
 * A new rectangle of the specified size.
 */
#define DRM_RECT_INIT(x, y, w, h) ((struct drm_rect){ \
		.x1 = (x), \
		.y1 = (y), \
		.x2 = (x) + (w), \
		.y2 = (y) + (h) })

/**
 * DRM_RECT_FMT - printf string for &struct drm_rect
 *
 * Prints width x height followed by the signed x1 and y1 offsets; meant
 * to be paired with DRM_RECT_ARG().
 */
#define DRM_RECT_FMT "%dx%d%+d%+d"

/**
 * DRM_RECT_ARG - printf arguments for &struct drm_rect
 * @r: rectangle struct
 *
 * Expands to four comma-separated arguments; @r is evaluated more than
 * once.
 */
#define DRM_RECT_ARG(r) drm_rect_width(r), drm_rect_height(r), (r)->x1, (r)->y1

/**
 * DRM_RECT_FP_FMT - printf string for &struct drm_rect in 16.16 fixed point
 *
 * Each of the four values is printed as an integer part followed by six
 * fractional digits; meant to be paired with DRM_RECT_FP_ARG().
 */
#define DRM_RECT_FP_FMT "%d.%06ux%d.%06u%+d.%06u%+d.%06u"

/**
 * DRM_RECT_FP_ARG - printf arguments for &struct drm_rect in 16.16 fixed point
 * @r: rectangle struct
 *
 * This is useful for e.g. printing plane source rectangles, which are in 16.16
 * fixed point.
*/ #define DRM_RECT_FP_ARG(r) \ drm_rect_width(r) >> 16, ((drm_rect_width(r) & 0xffff) * 15625) >> 10, \ drm_rect_height(r) >> 16, ((drm_rect_height(r) & 0xffff) * 15625) >> 10, \ (r)->x1 >> 16, (((r)->x1 & 0xffff) * 15625) >> 10, \ (r)->y1 >> 16, (((r)->y1 & 0xffff) * 15625) >> 10 /** * drm_rect_init - initialize the rectangle from x/y/w/h * @r: rectangle * @x: x coordinate * @y: y coordinate * @width: width * @height: height */ static inline void drm_rect_init(struct drm_rect *r, int x, int y, int width, int height) { r->x1 = x; r->y1 = y; r->x2 = x + width; r->y2 = y + height; } /** * drm_rect_adjust_size - adjust the size of the rectangle * @r: rectangle to be adjusted * @dw: horizontal adjustment * @dh: vertical adjustment * * Change the size of rectangle @r by @dw in the horizontal direction, * and by @dh in the vertical direction, while keeping the center * of @r stationary. * * Positive @dw and @dh increase the size, negative values decrease it. */ static inline void drm_rect_adjust_size(struct drm_rect *r, int dw, int dh) { r->x1 -= dw >> 1; r->y1 -= dh >> 1; r->x2 += (dw + 1) >> 1; r->y2 += (dh + 1) >> 1; } /** * drm_rect_translate - translate the rectangle * @r: rectangle to be translated * @dx: horizontal translation * @dy: vertical translation * * Move rectangle @r by @dx in the horizontal direction, * and by @dy in the vertical direction. */ static inline void drm_rect_translate(struct drm_rect *r, int dx, int dy) { r->x1 += dx; r->y1 += dy; r->x2 += dx; r->y2 += dy; } /** * drm_rect_translate_to - translate the rectangle to an absolute position * @r: rectangle to be translated * @x: horizontal position * @y: vertical position * * Move rectangle @r to @x in the horizontal direction, * and to @y in the vertical direction. 
*/ static inline void drm_rect_translate_to(struct drm_rect *r, int x, int y) { drm_rect_translate(r, x - r->x1, y - r->y1); } /** * drm_rect_downscale - downscale a rectangle * @r: rectangle to be downscaled * @horz: horizontal downscale factor * @vert: vertical downscale factor * * Divide the coordinates of rectangle @r by @horz and @vert. */ static inline void drm_rect_downscale(struct drm_rect *r, int horz, int vert) { r->x1 /= horz; r->y1 /= vert; r->x2 /= horz; r->y2 /= vert; } /** * drm_rect_width - determine the rectangle width * @r: rectangle whose width is returned * * RETURNS: * The width of the rectangle. */ static inline int drm_rect_width(const struct drm_rect *r) { return r->x2 - r->x1; } /** * drm_rect_height - determine the rectangle height * @r: rectangle whose height is returned * * RETURNS: * The height of the rectangle. */ static inline int drm_rect_height(const struct drm_rect *r) { return r->y2 - r->y1; } /** * drm_rect_visible - determine if the rectangle is visible * @r: rectangle whose visibility is returned * * RETURNS: * %true if the rectangle is visible, %false otherwise. */ static inline bool drm_rect_visible(const struct drm_rect *r) { return drm_rect_width(r) > 0 && drm_rect_height(r) > 0; } /** * drm_rect_equals - determine if two rectangles are equal * @r1: first rectangle * @r2: second rectangle * * RETURNS: * %true if the rectangles are equal, %false otherwise. */ static inline bool drm_rect_equals(const struct drm_rect *r1, const struct drm_rect *r2) { return r1->x1 == r2->x1 && r1->x2 == r2->x2 && r1->y1 == r2->y1 && r1->y2 == r2->y2; } /** * drm_rect_fp_to_int - Convert a rect in 16.16 fixed point form to int form. 
* @dst: rect to be stored the converted value * @src: rect in 16.16 fixed point form */ static inline void drm_rect_fp_to_int(struct drm_rect *dst, const struct drm_rect *src) { drm_rect_init(dst, src->x1 >> 16, src->y1 >> 16, drm_rect_width(src) >> 16, drm_rect_height(src) >> 16); } /** * drm_rect_overlap - Check if two rectangles overlap * @a: first rectangle * @b: second rectangle * * RETURNS: * %true if the rectangles overlap, %false otherwise. */ static inline bool drm_rect_overlap(const struct drm_rect *a, const struct drm_rect *b) { return (a->x2 > b->x1 && b->x2 > a->x1 && a->y2 > b->y1 && b->y2 > a->y1); } bool drm_rect_intersect(struct drm_rect *r, const struct drm_rect *clip); bool drm_rect_clip_scaled(struct drm_rect *src, struct drm_rect *dst, const struct drm_rect *clip); int drm_rect_calc_hscale(const struct drm_rect *src, const struct drm_rect *dst, int min_hscale, int max_hscale); int drm_rect_calc_vscale(const struct drm_rect *src, const struct drm_rect *dst, int min_vscale, int max_vscale); void drm_rect_debug_print(const char *prefix, const struct drm_rect *r, bool fixed_point); void drm_rect_rotate(struct drm_rect *r, int width, int height, unsigned int rotation); void drm_rect_rotate_inv(struct drm_rect *r, int width, int height, unsigned int rotation); #endif |
| 549 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 | /* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#ifndef __XFS_BTREE_H__
#define __XFS_BTREE_H__

/* Forward declarations; this header only uses these types through pointers. */
struct xfs_buf;
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
struct xfs_ifork;
struct xfs_perag;

/*
 * Generic key, ptr and record wrapper structures.
 *
 * These are disk format structures, and are converted where necessary
 * by the btree specific code that needs to interpret them.
 */
union xfs_btree_ptr {
	__be32			s;	/* short form ptr */
	__be64			l;	/* long form ptr */
};

/*
 * The in-core btree key.  Overlapping btrees actually store two keys
 * per pointer, so we reserve enough memory to hold both.  The __*bigkey
 * items should never be accessed directly.
 *
 * __rmap_bigkey is a two-element array of struct xfs_rmap_key, so the union
 * is at least large enough for two rmap keys.
 */
union xfs_btree_key {
	struct xfs_bmbt_key		bmbt;
	xfs_bmdr_key_t			bmbr;	/* bmbt root block */
	xfs_alloc_key_t			alloc;
	struct xfs_inobt_key		inobt;
	struct xfs_rmap_key		rmap;
	struct xfs_rmap_key		__rmap_bigkey[2];
	struct xfs_refcount_key		refc;
};

/* Record wrapper: one member per record format listed below. */
union xfs_btree_rec {
	struct xfs_bmbt_rec		bmbt;
	xfs_bmdr_rec_t			bmbr;	/* bmbt root block */
	struct xfs_alloc_rec		alloc;
	struct xfs_inobt_rec		inobt;
	struct xfs_rmap_rec		rmap;
	struct xfs_refcount_rec		refc;
};

/*
 * This nonsense is to make -wlint happy.
*/
/* Each XFS_LOOKUP_* name is the matching XFS_LOOKUP_*i value cast to xfs_lookup_t. */
#define	XFS_LOOKUP_EQ	((xfs_lookup_t)XFS_LOOKUP_EQi)
#define	XFS_LOOKUP_LE	((xfs_lookup_t)XFS_LOOKUP_LEi)
#define	XFS_LOOKUP_GE	((xfs_lookup_t)XFS_LOOKUP_GEi)

struct xfs_btree_ops;
uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops);

/*
 * For logging record fields.
 *
 * One bit per field, in bit positions 0 through 8.
 */
#define	XFS_BB_MAGIC		(1u << 0)
#define	XFS_BB_LEVEL		(1u << 1)
#define	XFS_BB_NUMRECS		(1u << 2)
#define	XFS_BB_LEFTSIB		(1u << 3)
#define	XFS_BB_RIGHTSIB		(1u << 4)
#define	XFS_BB_BLKNO		(1u << 5)
#define	XFS_BB_LSN		(1u << 6)
#define	XFS_BB_UUID		(1u << 7)
#define	XFS_BB_OWNER		(1u << 8)
/* The first five bits (MAGIC through RIGHTSIB). */
#define	XFS_BB_NUM_BITS		5
#define	XFS_BB_ALL_BITS		((1u << XFS_BB_NUM_BITS) - 1)
/* All nine bits (MAGIC through OWNER). */
#define	XFS_BB_NUM_BITS_CRC	9
#define	XFS_BB_ALL_BITS_CRC	((1u << XFS_BB_NUM_BITS_CRC) - 1)

/*
 * Generic stats interface
 *
 * @stat is token-pasted onto __XBTS_ and added to the cursor's
 * bc_ops->statoff to form the offset handed to XFS_STATS_*_OFF().
 */
#define XFS_BTREE_STATS_INC(cur, stat)	\
	XFS_STATS_INC_OFF((cur)->bc_mp, \
		(cur)->bc_ops->statoff + __XBTS_ ## stat)
#define XFS_BTREE_STATS_ADD(cur, stat, val)	\
	XFS_STATS_ADD_OFF((cur)->bc_mp, \
		(cur)->bc_ops->statoff + __XBTS_ ## stat, val)

/* Relationship between two key fields, as reported by xbtree_key_contig(). */
enum xbtree_key_contig {
	XBTREE_KEY_GAP = 0,
	XBTREE_KEY_CONTIGUOUS,
	XBTREE_KEY_OVERLAP,
};

/*
 * Decide if these two numeric btree key fields are contiguous, overlapping,
 * or if there's a gap between them.  @x should be the field from the high
 * key and @y should be the field from the low key.
*/ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) { x++; if (x < y) return XBTREE_KEY_GAP; if (x == y) return XBTREE_KEY_CONTIGUOUS; return XBTREE_KEY_OVERLAP; } #define XFS_BTREE_LONG_PTR_LEN (sizeof(__be64)) #define XFS_BTREE_SHORT_PTR_LEN (sizeof(__be32)) enum xfs_btree_type { XFS_BTREE_TYPE_AG, XFS_BTREE_TYPE_INODE, XFS_BTREE_TYPE_MEM, }; struct xfs_btree_ops { const char *name; /* Type of btree - AG-rooted or inode-rooted */ enum xfs_btree_type type; /* XFS_BTGEO_* flags that determine the geometry of the btree */ unsigned int geom_flags; /* size of the key, pointer, and record structures */ size_t key_len; size_t ptr_len; size_t rec_len; /* LRU refcount to set on each btree buffer created */ unsigned int lru_refs; /* offset of btree stats array */ unsigned int statoff; /* sick mask for health reporting (not for bmap btrees) */ unsigned int sick_mask; /* cursor operations */ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); void (*update_cursor)(struct xfs_btree_cur *src, struct xfs_btree_cur *dst); /* update btree root pointer */ void (*set_root)(struct xfs_btree_cur *cur, const union xfs_btree_ptr *nptr, int level_change); /* block allocation / freeing */ int (*alloc_block)(struct xfs_btree_cur *cur, const union xfs_btree_ptr *start_bno, union xfs_btree_ptr *new_bno, int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); /* records in block/level */ int (*get_minrecs)(struct xfs_btree_cur *cur, int level); int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); /* records on disk. Matter for the root in inode case. 
*/ int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level); /* init values of btree structures */ void (*init_key_from_rec)(union xfs_btree_key *key, const union xfs_btree_rec *rec); void (*init_rec_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec); void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); void (*init_high_key_from_rec)(union xfs_btree_key *key, const union xfs_btree_rec *rec); /* * Compare key value and cursor value -- positive if key > cur, * negative if key < cur, and zero if equal. */ int (*cmp_key_with_cur)(struct xfs_btree_cur *cur, const union xfs_btree_key *key); /* * Compare key1 and key2 -- positive if key1 > key2, negative if * key1 < key2, and zero if equal. If the @mask parameter is non NULL, * each key field to be used in the comparison must contain a nonzero * value. */ int (*cmp_two_keys)(struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask); const struct xfs_buf_ops *buf_ops; /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2); /* check that r1 is lower than r2 */ int (*recs_inorder)(struct xfs_btree_cur *cur, const union xfs_btree_rec *r1, const union xfs_btree_rec *r2); /* * Are these two btree keys immediately adjacent? * * Given two btree keys @key1 and @key2, decide if it is impossible for * there to be a third btree key K satisfying the relationship * @key1 < K < @key2. To determine if two btree records are * immediately adjacent, @key1 should be the high key of the first * record and @key2 should be the low key of the second record. * If the @mask parameter is non NULL, each key field to be used in the * comparison must contain a nonzero value. 
*/ enum xbtree_key_contig (*keys_contiguous)(struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask); /* * Reallocate the space for if_broot to fit the number of records. * Move the records and pointers in if_broot to fit the new size. When * shrinking this will eliminate holes between the records and pointers * created by the caller. When growing this will create holes to be * filled in by the caller. * * The caller must not request to add more records than would fit in * the on-disk inode root. If the if_broot is currently NULL, then if * we are adding records, one will be allocated. The caller must also * not request that the number of records go below zero, although it * can go to zero. */ struct xfs_btree_block *(*broot_realloc)(struct xfs_btree_cur *cur, unsigned int new_numrecs); }; /* btree geometry flags */ #define XFS_BTGEO_OVERLAPPING (1U << 0) /* overlapping intervals */ #define XFS_BTGEO_IROOT_RECORDS (1U << 1) /* iroot can store records */ union xfs_btree_irec { struct xfs_alloc_rec_incore a; struct xfs_bmbt_irec b; struct xfs_inobt_rec_incore i; struct xfs_rmap_irec r; struct xfs_refcount_irec rc; }; struct xfs_btree_level { /* buffer pointer */ struct xfs_buf *bp; /* key/record number */ uint16_t ptr; /* readahead info */ #define XFS_BTCUR_LEFTRA (1 << 0) /* left sibling has been read-ahead */ #define XFS_BTCUR_RIGHTRA (1 << 1) /* right sibling has been read-ahead */ uint16_t ra; }; /* * Btree cursor structure. * This collects all information needed by the btree code in one place. 
*/
struct xfs_btree_cur
{
	struct xfs_trans	*bc_tp;	/* transaction we're in, if any */
	struct xfs_mount	*bc_mp;	/* file system mount struct */
	const struct xfs_btree_ops *bc_ops;
	struct kmem_cache	*bc_cache; /* cursor cache */
	unsigned int		bc_flags; /* btree features - below */
	union xfs_btree_irec	bc_rec;	/* current insert/search record value */
	uint8_t			bc_nlevels; /* number of levels in the tree */
	uint8_t			bc_maxlevels; /* maximum levels for this btree type */
	struct xfs_group	*bc_group;

	/* per-type information */
	union {
		struct {
			struct xfs_inode	*ip;
			short			forksize;
			char			whichfork;
			struct xbtree_ifakeroot	*ifake;	/* for staging cursor */
		} bc_ino;
		struct {
			struct xfs_buf		*agbp;
			struct xbtree_afakeroot	*afake;	/* for staging cursor */
		} bc_ag;
		struct {
			struct xfbtree		*xfbtree;
		} bc_mem;
	};

	/* per-format private data */
	union {
		struct {
			int		allocated;
		} bc_bmap;	/* bmapbt */
		struct {
			unsigned int	nr_ops;		/* # record updates */
			unsigned int	shape_changes;	/* # of extent splits */
		} bc_refc;	/* refcountbt/rtrefcountbt */
	};

	/*
	 * Must be at the end of the struct!
	 * This is a flexible array member; xfs_btree_cur_sizeof() computes the
	 * allocation size for a given number of elements.
	 */
	struct xfs_btree_level	bc_levels[];
};

/*
 * Compute the size of a btree cursor that can handle a btree of a given
 * height.  The bc_levels array handles node and leaf blocks, so its size
 * is exactly nlevels.
 *
 * The result is struct_size_t() of the cursor with @nlevels bc_levels entries.
 */
static inline size_t
xfs_btree_cur_sizeof(unsigned int nlevels)
{
	return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels);
}

/* cursor state flags */
/*
 * The root of this btree is a fakeroot structure so that we can stage a btree
 * rebuild without leaving it accessible via primary metadata.  The ops struct
 * is dynamically allocated and must be freed when the cursor is deleted.
*/
#define XFS_BTREE_STAGING		(1U << 0)

/* We are converting a delalloc reservation (only for bmbt btrees) */
#define	XFS_BTREE_BMBT_WASDEL		(1U << 1)

/* For extent swap, ignore owner check in verifier (only for bmbt btrees) */
#define	XFS_BTREE_BMBT_INVALID_OWNER	(1U << 2)

/* Cursor is active (only for allocbt btrees) */
#define	XFS_BTREE_ALLOCBT_ACTIVE	(1U << 3)

/*
 * NOTE(review): presumably the values passed as the @error argument of
 * xfs_btree_del_cursor() declared below - confirm against callers.
 */
#define	XFS_BTREE_NOERROR	0
#define	XFS_BTREE_ERROR		1

/*
 * Convert from buffer to btree block header.
 *
 * This is a plain cast of the buffer's b_addr pointer.
 */
#define	XFS_BUF_TO_BLOCK(bp)	((struct xfs_btree_block *)((bp)->b_addr))

/* Lower-level block and pointer checks; defined outside this header. */
xfs_failaddr_t __xfs_btree_check_block(struct xfs_btree_cur *cur,
		struct xfs_btree_block *block, int level, struct xfs_buf *bp);
int __xfs_btree_check_ptr(struct xfs_btree_cur *cur,
		const union xfs_btree_ptr *ptr, int index, int level);

/*
 * Check that block header is ok.
 */
int
xfs_btree_check_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_btree_block	*block,	/* generic btree block pointer */
	int			level,	/* level of the btree block */
	struct xfs_buf		*bp);	/* buffer containing block, if any */

/*
 * Delete the btree cursor.
 */
void
xfs_btree_del_cursor(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			error);	/* del because of error */

/*
 * Duplicate the btree cursor.
 * Allocate a new one, copy the record, re-get the buffers.
 */
int					/* error */
xfs_btree_dup_cursor(
	struct xfs_btree_cur	*cur,	/* input cursor */
	struct xfs_btree_cur	**ncur);/* output cursor */

/*
 * Compute first and last byte offsets for the fields given.
 * Interprets the offsets table, which contains struct field offsets.
*/
void
xfs_btree_offsets(
	uint32_t		fields,	/* bitmask of fields */
	const short		*offsets,/* table of field offsets */
	int			nbits,	/* number of bits to inspect */
	int			*first,	/* output: first byte offset */
	int			*last);	/* output: last byte offset */

/*
 * Initialise a new btree block header
 *
 * Two variants with the same trailing arguments: one takes the containing
 * struct xfs_buf, the other takes the struct xfs_btree_block directly.
 */
void xfs_btree_init_buf(struct xfs_mount *mp, struct xfs_buf *bp,
		const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs,
		__u64 owner);
void xfs_btree_init_block(struct xfs_mount *mp,
		struct xfs_btree_block *buf, const struct xfs_btree_ops *ops,
		__u16 level, __u16 numrecs, __u64 owner);

/*
 * Common btree core entry points.
 *
 * All of these take the cursor as their first argument and return an int.
 */
int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
int xfs_btree_insert(struct xfs_btree_cur *, int *);
int xfs_btree_delete(struct xfs_btree_cur *, int *);
int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner,
			   struct list_head *buffer_list);

/*
 * btree block CRC helpers
 *
 * A calc/verify pair for each of the fsblock and agblock flavours.
 */
void xfs_btree_fsblock_calc_crc(struct xfs_buf *);
bool xfs_btree_fsblock_verify_crc(struct xfs_buf *);
void xfs_btree_agblock_calc_crc(struct xfs_buf *);
bool xfs_btree_agblock_verify_crc(struct xfs_buf *);

/*
 * Internal btree helpers also used by xfs_bmap.c.
 */
void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t);
void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);

/*
 * Helpers.
*/ static inline int xfs_btree_get_numrecs(const struct xfs_btree_block *block) { return be16_to_cpu(block->bb_numrecs); } static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, uint16_t numrecs) { block->bb_numrecs = cpu_to_be16(numrecs); } static inline int xfs_btree_get_level(const struct xfs_btree_block *block) { return be16_to_cpu(block->bb_level); } /* * Min and max functions for extlen, agblock, fileoff, and filblks types. */ #define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b)) #define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b)) #define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b)) #define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b)) #define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b)) #define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b)) #define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) xfs_failaddr_t xfs_btree_agblock_v5hdr_verify(struct xfs_buf *bp); xfs_failaddr_t xfs_btree_agblock_verify(struct xfs_buf *bp, unsigned int max_recs); xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp, uint64_t owner); xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp, unsigned int max_recs); xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp, unsigned int max_recs); unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits, unsigned long long records); unsigned long long xfs_btree_calc_size(const unsigned int *limits, unsigned long long records); unsigned int xfs_btree_space_to_height(const unsigned int *limits, unsigned long long blocks); /* * Return codes for the query range iterator function are 0 to continue * iterating, and non-zero to stop iterating. Any non-zero value will be * passed up to the _query_range caller. The special value -ECANCELED can be * used to stop iteration, because _query_range never generates that error * code on its own. 
*/
typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
		const union xfs_btree_rec *rec, void *priv);

/* Both query functions take the iterator callback @fn and an opaque @priv. */
int xfs_btree_query_range(struct xfs_btree_cur *cur,
		const union xfs_btree_irec *low_rec,
		const union xfs_btree_irec *high_rec,
		xfs_btree_query_range_fn fn, void *priv);
int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn,
		void *priv);

/* Callback type accepted by xfs_btree_visit_blocks(). */
typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
		void *data);
/* Visit record blocks. */
#define XFS_BTREE_VISIT_RECORDS		(1 << 0)
/* Visit leaf blocks. */
#define XFS_BTREE_VISIT_LEAVES		(1 << 1)
/* Visit all blocks. */
#define XFS_BTREE_VISIT_ALL		(XFS_BTREE_VISIT_RECORDS | \
					 XFS_BTREE_VISIT_LEAVES)
/* @flags is an unsigned int; presumably a mask of XFS_BTREE_VISIT_* - confirm. */
int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
		xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data);

int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_filblks_t *blocks);

/* Accessors returning the address of an item @n within @block. */
union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n,
		struct xfs_btree_block *block);
union xfs_btree_key *xfs_btree_key_addr(struct xfs_btree_cur *cur, int n,
		struct xfs_btree_block *block);
union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n,
		struct xfs_btree_block *block);
union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n,
		struct xfs_btree_block *block);
int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level,
		const union xfs_btree_ptr *pp, struct xfs_btree_block **blkp);
struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
		int level, struct xfs_buf **bpp);
bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur,
		const union xfs_btree_ptr *ptr);
int xfs_btree_cmp_two_ptrs(struct xfs_btree_cur *cur,
		const union xfs_btree_ptr *a,
		const union xfs_btree_ptr *b);
void xfs_btree_get_sibling(struct xfs_btree_cur *cur,
			   struct xfs_btree_block *block,
			   union xfs_btree_ptr *ptr, int lr);
void xfs_btree_get_keys(struct xfs_btree_cur *cur,
		struct xfs_btree_block *block, union xfs_btree_key *key);
union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); typedef bool (*xfs_btree_key_gap_fn)(struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2); int xfs_btree_has_records(struct xfs_btree_cur *cur, const union xfs_btree_irec *low, const union xfs_btree_irec *high, const union xfs_btree_key *mask, enum xbtree_recpacking *outcome); bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); /* Key comparison helpers */ static inline bool xfs_btree_keycmp_lt( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) < 0; } static inline bool xfs_btree_keycmp_gt( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) > 0; } static inline bool xfs_btree_keycmp_eq( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) == 0; } static inline bool xfs_btree_keycmp_le( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return !xfs_btree_keycmp_gt(cur, key1, key2); } static inline bool xfs_btree_keycmp_ge( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return !xfs_btree_keycmp_lt(cur, key1, key2); } static inline bool xfs_btree_keycmp_ne( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return !xfs_btree_keycmp_eq(cur, key1, key2); } /* Masked key comparison helpers */ static inline bool xfs_btree_masked_keycmp_lt( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask) { return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) < 0; } 
static inline bool xfs_btree_masked_keycmp_gt( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask) { return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) > 0; } static inline bool xfs_btree_masked_keycmp_ge( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask) { return !xfs_btree_masked_keycmp_lt(cur, key1, key2, mask); } /* Does this cursor point to the last block in the given level? */ static inline bool xfs_btree_islastblock( struct xfs_btree_cur *cur, int level) { struct xfs_btree_block *block; struct xfs_buf *bp; block = xfs_btree_get_block(cur, level, &bp); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, struct xfs_buf **bpp); int xfs_btree_read_buf_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int flags, struct xfs_btree_block **block, struct xfs_buf **bpp); void xfs_btree_set_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr); void xfs_btree_init_block_cur(struct xfs_btree_cur *cur, struct xfs_buf *bp, int level, int numrecs); void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, union xfs_btree_ptr *dst_ptr, const union xfs_btree_ptr *src_ptr, int numptrs); void xfs_btree_copy_keys(struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, const union xfs_btree_key *src_key, int numkeys); void xfs_btree_init_ptr_from_cur(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); static inline struct xfs_btree_cur * xfs_btree_alloc_cursor( struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_btree_ops *ops, 
uint8_t maxlevels, struct kmem_cache *cache) { struct xfs_btree_cur *cur; ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN || ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN); /* BMBT allocations can come through from non-transactional context. */ cur = kmem_cache_zalloc(cache, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); cur->bc_ops = ops; cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_maxlevels = maxlevels; cur->bc_cache = cache; return cur; } int __init xfs_btree_init_cur_caches(void); void xfs_btree_destroy_cur_caches(void); int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur); /* Does this level of the cursor point to the inode root (and not a block)? */ static inline bool xfs_btree_at_iroot( const struct xfs_btree_cur *cur, int level) { return cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == cur->bc_nlevels - 1; } int xfs_btree_alloc_metafile_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *start, union xfs_btree_ptr *newp, int *stat); int xfs_btree_free_metafile_block(struct xfs_btree_cur *cur, struct xfs_buf *bp); #endif /* __XFS_BTREE_H__ */ |
| 1 1 9 5 2 107 103 107 11 2 4 107 107 19 19 107 107 19 16 16 107 107 107 107 22 1 9 97 97 7 10 10 103 5 103 14 15 19 19 19 103 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 
494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 
994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 | // SPDX-License-Identifier: GPL-2.0+ /* * NILFS recovery logic * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi. */ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/crc32.h> #include "nilfs.h" #include "segment.h" #include "sufile.h" #include "page.h" #include "segbuf.h" /* * Segment check result */ enum { NILFS_SEG_VALID, NILFS_SEG_NO_SUPER_ROOT, NILFS_SEG_FAIL_IO, NILFS_SEG_FAIL_MAGIC, NILFS_SEG_FAIL_SEQ, NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, NILFS_SEG_FAIL_CHECKSUM_FULL, NILFS_SEG_FAIL_CONSISTENCY, }; /* work structure for recovery */ struct nilfs_recovery_block { ino_t ino; /* * Inode number of the file that this block * belongs to */ sector_t blocknr; /* block number */ __u64 vblocknr; /* virtual block number */ unsigned long blkoff; /* File offset of the data block (per block) */ struct list_head list; }; static int nilfs_warn_segment_error(struct super_block *sb, int err) { const char *msg = NULL; switch (err) { case NILFS_SEG_FAIL_IO: nilfs_err(sb, "I/O error reading segment"); return -EIO; case NILFS_SEG_FAIL_MAGIC: msg = "Magic number mismatch"; break; case NILFS_SEG_FAIL_SEQ: msg = "Sequence number mismatch"; break; case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: msg = "Checksum error in super root"; break; case NILFS_SEG_FAIL_CHECKSUM_FULL: msg = "Checksum error in segment payload"; break; case NILFS_SEG_FAIL_CONSISTENCY: msg = "Inconsistency found"; break; case NILFS_SEG_NO_SUPER_ROOT: msg = "No super root in the last segment"; break; default: nilfs_err(sb, "unrecognized segment error %d", err); return -EINVAL; } nilfs_warn(sb, "invalid segment: %s", msg); return -EINVAL; } /** * nilfs_compute_checksum - compute checksum of blocks continuously * @nilfs: nilfs object * @bhs: buffer head of start block * @sum: place to store 
result * @offset: offset bytes in the first block * @check_bytes: number of bytes to be checked * @start: DBN of start block * @nblock: number of blocks to be checked * * Return: 0 on success, or %-EIO if an I/O error occurs. */ static int nilfs_compute_checksum(struct the_nilfs *nilfs, struct buffer_head *bhs, u32 *sum, unsigned long offset, u64 check_bytes, sector_t start, unsigned long nblock) { unsigned int blocksize = nilfs->ns_blocksize; unsigned long size; u32 crc; BUG_ON(offset >= blocksize); check_bytes -= offset; size = min_t(u64, check_bytes, blocksize - offset); crc = crc32_le(nilfs->ns_crc_seed, (unsigned char *)bhs->b_data + offset, size); if (--nblock > 0) { do { struct buffer_head *bh; bh = __bread(nilfs->ns_bdev, ++start, blocksize); if (!bh) return -EIO; check_bytes -= size; size = min_t(u64, check_bytes, blocksize); crc = crc32_le(crc, bh->b_data, size); brelse(bh); } while (--nblock > 0); } *sum = crc; return 0; } /** * nilfs_read_super_root_block - read super root block * @nilfs: nilfs object * @sr_block: disk block number of the super root block * @pbh: address of a buffer_head pointer to return super root buffer * @check: CRC check flag * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Super root block corrupted. * * %-EIO - I/O error. 
*/ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, struct buffer_head **pbh, int check) { struct buffer_head *bh_sr; struct nilfs_super_root *sr; u32 crc; int ret; *pbh = NULL; bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize); if (unlikely(!bh_sr)) { ret = NILFS_SEG_FAIL_IO; goto failed; } sr = (struct nilfs_super_root *)bh_sr->b_data; if (check) { unsigned int bytes = le16_to_cpu(sr->sr_bytes); if (bytes == 0 || bytes > nilfs->ns_blocksize) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; goto failed_bh; } if (nilfs_compute_checksum( nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes, sr_block, 1)) { ret = NILFS_SEG_FAIL_IO; goto failed_bh; } if (crc != le32_to_cpu(sr->sr_sum)) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; goto failed_bh; } } *pbh = bh_sr; return 0; failed_bh: brelse(bh_sr); failed: return nilfs_warn_segment_error(nilfs->ns_sb, ret); } /** * nilfs_read_log_header - read summary header of the specified log * @nilfs: nilfs object * @start_blocknr: start block number of the log * @sum: pointer to return segment summary structure * * Return: Buffer head pointer, or NULL if an I/O error occurs. */ static struct buffer_head * nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, struct nilfs_segment_summary **sum) { struct buffer_head *bh_sum; bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); if (bh_sum) *sum = (struct nilfs_segment_summary *)bh_sum->b_data; return bh_sum; } /** * nilfs_validate_log - verify consistency of log * @nilfs: nilfs object * @seg_seq: sequence number of segment * @bh_sum: buffer head of summary block * @sum: segment summary struct * * Return: 0 on success, or one of the following internal codes on failure: * * %NILFS_SEG_FAIL_MAGIC - Magic number mismatch. * * %NILFS_SEG_FAIL_SEQ - Sequence number mismatch. * * %NIFLS_SEG_FAIL_CONSISTENCY - Block count out of range. * * %NILFS_SEG_FAIL_IO - I/O error. 
* * %NILFS_SEG_FAIL_CHECKSUM_FULL - Full log checksum verification failed. */ static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq, struct buffer_head *bh_sum, struct nilfs_segment_summary *sum) { unsigned long nblock; u32 crc; int ret; ret = NILFS_SEG_FAIL_MAGIC; if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) goto out; ret = NILFS_SEG_FAIL_SEQ; if (le64_to_cpu(sum->ss_seq) != seg_seq) goto out; nblock = le32_to_cpu(sum->ss_nblocks); ret = NILFS_SEG_FAIL_CONSISTENCY; if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment)) /* This limits the number of blocks read in the CRC check */ goto out; ret = NILFS_SEG_FAIL_IO; if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum), ((u64)nblock << nilfs->ns_blocksize_bits), bh_sum->b_blocknr, nblock)) goto out; ret = NILFS_SEG_FAIL_CHECKSUM_FULL; if (crc != le32_to_cpu(sum->ss_datasum)) goto out; ret = 0; out: return ret; } /** * nilfs_read_summary_info - read an item on summary blocks of a log * @nilfs: nilfs object * @pbh: the current buffer head on summary blocks [in, out] * @offset: the current byte offset on summary blocks [in, out] * @bytes: byte size of the item to be read * * Return: Kernel space address of current segment summary entry, or * NULL if an I/O error occurs. 
*/ static void *nilfs_read_summary_info(struct the_nilfs *nilfs, struct buffer_head **pbh, unsigned int *offset, unsigned int bytes) { void *ptr; sector_t blocknr; BUG_ON((*pbh)->b_size < *offset); if (bytes > (*pbh)->b_size - *offset) { blocknr = (*pbh)->b_blocknr; brelse(*pbh); *pbh = __bread(nilfs->ns_bdev, blocknr + 1, nilfs->ns_blocksize); if (unlikely(!*pbh)) return NULL; *offset = 0; } ptr = (*pbh)->b_data + *offset; *offset += bytes; return ptr; } /** * nilfs_skip_summary_info - skip items on summary blocks of a log * @nilfs: nilfs object * @pbh: the current buffer head on summary blocks [in, out] * @offset: the current byte offset on summary blocks [in, out] * @bytes: byte size of the item to be skipped * @count: number of items to be skipped */ static void nilfs_skip_summary_info(struct the_nilfs *nilfs, struct buffer_head **pbh, unsigned int *offset, unsigned int bytes, unsigned long count) { unsigned int rest_item_in_current_block = ((*pbh)->b_size - *offset) / bytes; if (count <= rest_item_in_current_block) { *offset += bytes * count; } else { sector_t blocknr = (*pbh)->b_blocknr; unsigned int nitem_per_block = (*pbh)->b_size / bytes; unsigned int bcnt; count -= rest_item_in_current_block; bcnt = DIV_ROUND_UP(count, nitem_per_block); *offset = bytes * (count - (bcnt - 1) * nitem_per_block); brelse(*pbh); *pbh = __bread(nilfs->ns_bdev, blocknr + bcnt, nilfs->ns_blocksize); } } /** * nilfs_scan_dsync_log - get block information of a log written for data sync * @nilfs: nilfs object * @start_blocknr: start block number of the log * @sum: log summary information * @head: list head to add nilfs_recovery_block struct * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. 
*/ static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, struct nilfs_segment_summary *sum, struct list_head *head) { struct buffer_head *bh; unsigned int offset; u32 nfinfo, sumbytes; sector_t blocknr; ino_t ino; int err = -EIO; nfinfo = le32_to_cpu(sum->ss_nfinfo); if (!nfinfo) return 0; sumbytes = le32_to_cpu(sum->ss_sumbytes); blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize); bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); if (unlikely(!bh)) goto out; offset = le16_to_cpu(sum->ss_bytes); for (;;) { unsigned long nblocks, ndatablk, nnodeblk; struct nilfs_finfo *finfo; finfo = nilfs_read_summary_info(nilfs, &bh, &offset, sizeof(*finfo)); if (unlikely(!finfo)) goto out; ino = le64_to_cpu(finfo->fi_ino); nblocks = le32_to_cpu(finfo->fi_nblocks); ndatablk = le32_to_cpu(finfo->fi_ndatablk); nnodeblk = nblocks - ndatablk; while (ndatablk-- > 0) { struct nilfs_recovery_block *rb; struct nilfs_binfo_v *binfo; binfo = nilfs_read_summary_info(nilfs, &bh, &offset, sizeof(*binfo)); if (unlikely(!binfo)) goto out; rb = kmalloc_obj(*rb, GFP_NOFS); if (unlikely(!rb)) { err = -ENOMEM; goto out; } rb->ino = ino; rb->blocknr = blocknr++; rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr); rb->blkoff = le64_to_cpu(binfo->bi_blkoff); /* INIT_LIST_HEAD(&rb->list); */ list_add_tail(&rb->list, head); } if (--nfinfo == 0) break; blocknr += nnodeblk; /* always 0 for data sync logs */ nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64), nnodeblk); if (unlikely(!bh)) goto out; } err = 0; out: brelse(bh); /* brelse(NULL) is just ignored */ return err; } static void dispose_recovery_list(struct list_head *head) { while (!list_empty(head)) { struct nilfs_recovery_block *rb; rb = list_first_entry(head, struct nilfs_recovery_block, list); list_del(&rb->list); kfree(rb); } } struct nilfs_segment_entry { struct list_head list; __u64 segnum; }; static int nilfs_segment_list_add(struct list_head *head, __u64 segnum) { struct 
nilfs_segment_entry *ent = kmalloc_obj(*ent, GFP_NOFS); if (unlikely(!ent)) return -ENOMEM; ent->segnum = segnum; INIT_LIST_HEAD(&ent->list); list_add_tail(&ent->list, head); return 0; } void nilfs_dispose_segment_list(struct list_head *head) { while (!list_empty(head)) { struct nilfs_segment_entry *ent; ent = list_first_entry(head, struct nilfs_segment_entry, list); list_del(&ent->list); kfree(ent); } } static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_recovery_info *ri) { struct list_head *head = &ri->ri_used_segments; struct nilfs_segment_entry *ent, *n; struct inode *sufile = nilfs->ns_sufile; __u64 segnum[4]; int err; int i; segnum[0] = nilfs->ns_segnum; segnum[1] = nilfs->ns_nextnum; segnum[2] = ri->ri_segnum; segnum[3] = ri->ri_nextnum; /* * Releasing the next segment of the latest super root. * The next segment is invalidated by this recovery. */ err = nilfs_sufile_free(sufile, segnum[1]); if (unlikely(err)) { if (err == -ENOENT) { nilfs_err(sb, "checkpoint log inconsistency at block %llu (segment %llu): next segment %llu is unallocated", (unsigned long long)nilfs->ns_last_pseg, (unsigned long long)nilfs->ns_segnum, (unsigned long long)segnum[1]); err = -EINVAL; } goto failed; } for (i = 1; i < 4; i++) { err = nilfs_segment_list_add(head, segnum[i]); if (unlikely(err)) goto failed; } /* * Collecting segments written after the latest super root. * These are marked dirty to avoid being reallocated in the next write. 
*/ list_for_each_entry_safe(ent, n, head, list) { if (ent->segnum != segnum[0]) { err = nilfs_sufile_scrap(sufile, ent->segnum); if (unlikely(err)) goto failed; } list_del(&ent->list); kfree(ent); } /* Allocate new segments for recovery */ err = nilfs_sufile_alloc(sufile, &segnum[0]); if (unlikely(err)) goto failed; nilfs->ns_pseg_offset = 0; nilfs->ns_seg_seq = ri->ri_seq + 2; nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0]; failed: /* No need to recover sufile because it will be destroyed on error */ return err; } static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, loff_t pos, struct folio *folio) { struct buffer_head *bh_org; size_t from = offset_in_folio(folio, pos); bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); if (unlikely(!bh_org)) return -EIO; memcpy_to_folio(folio, from, bh_org->b_data, bh_org->b_size); brelse(bh_org); return 0; } static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_root *root, struct list_head *head, unsigned long *nr_salvaged_blocks) { struct inode *inode; struct nilfs_recovery_block *rb, *n; unsigned int blocksize = nilfs->ns_blocksize; struct folio *folio; loff_t pos; int err = 0, err2 = 0; list_for_each_entry_safe(rb, n, head, list) { inode = nilfs_iget(sb, root, rb->ino); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; goto failed_inode; } pos = rb->blkoff << inode->i_blkbits; err = block_write_begin(inode->i_mapping, pos, blocksize, &folio, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; if (pos + blocksize > isize) nilfs_write_failed(inode->i_mapping, pos + blocksize); goto failed_inode; } err = nilfs_recovery_copy_block(nilfs, rb, pos, folio); if (unlikely(err)) goto failed_folio; err = nilfs_set_file_dirty(inode, 1); if (unlikely(err)) goto failed_folio; block_write_end(pos, blocksize, blocksize, folio); folio_unlock(folio); folio_put(folio); (*nr_salvaged_blocks)++; goto next; 
failed_folio: folio_unlock(folio); folio_put(folio); failed_inode: nilfs_warn(sb, "error %d recovering data block (ino=%lu, block-offset=%llu)", err, (unsigned long)rb->ino, (unsigned long long)rb->blkoff); if (!err2) err2 = err; next: iput(inode); /* iput(NULL) is just ignored */ list_del_init(&rb->list); kfree(rb); } return err2; } /** * nilfs_do_roll_forward - salvage logical segments newer than the latest * checkpoint * @nilfs: nilfs object * @sb: super block instance * @root: NILFS root instance * @ri: pointer to a nilfs_recovery_info * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Log format error. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. */ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_root *root, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; struct nilfs_segment_summary *sum = NULL; sector_t pseg_start; sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ unsigned long nsalvaged_blocks = 0; unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; int empty_seg = 0; int err = 0, ret; LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */ enum { RF_INIT_ST, RF_DSYNC_ST, /* scanning data-sync segments */ }; int state = RF_INIT_ST; pseg_start = ri->ri_lsegs_start; seg_seq = ri->ri_lsegs_start_seq; segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { brelse(bh_sum); bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); if (!bh_sum) { err = -EIO; goto failed; } ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) { err = -EIO; goto failed; } goto strayed; } flags = le16_to_cpu(sum->ss_flags); if (flags & NILFS_SS_SR) goto confused; /* Found a valid partial segment; do recovery actions */ nextnum = 
nilfs_get_segnum_of_block(nilfs, le64_to_cpu(sum->ss_next)); empty_seg = 0; nilfs->ns_ctime = le64_to_cpu(sum->ss_create); if (!(flags & NILFS_SS_GC)) nilfs->ns_nongc_ctime = nilfs->ns_ctime; switch (state) { case RF_INIT_ST: if (!(flags & NILFS_SS_LOGBGN) || !(flags & NILFS_SS_SYNDT)) goto try_next_pseg; state = RF_DSYNC_ST; fallthrough; case RF_DSYNC_ST: if (!(flags & NILFS_SS_SYNDT)) goto confused; err = nilfs_scan_dsync_log(nilfs, pseg_start, sum, &dsync_blocks); if (unlikely(err)) goto failed; if (flags & NILFS_SS_LOGEND) { err = nilfs_recover_dsync_blocks( nilfs, sb, root, &dsync_blocks, &nsalvaged_blocks); if (unlikely(err)) goto failed; state = RF_INIT_ST; } break; /* Fall through to try_next_pseg */ } try_next_pseg: if (pseg_start == ri->ri_lsegs_end) break; pseg_start += le32_to_cpu(sum->ss_nblocks); if (pseg_start < seg_end) continue; goto feed_segment; strayed: if (pseg_start == ri->ri_lsegs_end) break; feed_segment: /* Looking to the next full segment */ if (empty_seg++) break; seg_seq++; segnum = nextnum; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); pseg_start = seg_start; } if (nsalvaged_blocks) { nilfs_info(sb, "salvaged %lu blocks", nsalvaged_blocks); ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; } out: brelse(bh_sum); dispose_recovery_list(&dsync_blocks); return err; confused: err = -EINVAL; failed: nilfs_err(sb, "error %d roll-forwarding partial segment at blocknr = %llu", err, (unsigned long long)pseg_start); goto out; } static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh; int err; if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) != nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) return; bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize); if (WARN_ON(!bh)) return; /* should never happen */ lock_buffer(bh); memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); set_buffer_dirty(bh); unlock_buffer(bh); err = 
sync_dirty_buffer(bh); if (unlikely(err)) nilfs_warn(nilfs->ns_sb, "buffer sync write failed during post-cleaning of recovery."); brelse(bh); } /** * nilfs_abort_roll_forward - cleaning up after a failed rollforward recovery * @nilfs: nilfs object */ static void nilfs_abort_roll_forward(struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; LIST_HEAD(head); /* Abandon inodes that have read recovery data */ spin_lock(&nilfs->ns_inode_lock); list_splice_init(&nilfs->ns_dirty_files, &head); spin_unlock(&nilfs->ns_inode_lock); if (list_empty(&head)) return; set_nilfs_purging(nilfs); list_for_each_entry_safe(ii, n, &head, i_dirty) { spin_lock(&nilfs->ns_inode_lock); list_del_init(&ii->i_dirty); spin_unlock(&nilfs->ns_inode_lock); iput(&ii->vfs_inode); } clear_nilfs_purging(nilfs); } /** * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint * @nilfs: nilfs object * @sb: super block instance * @ri: pointer to a nilfs_recovery_info struct to store search results. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Inconsistent filesystem state. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No space left on device (only in a panic state). * * %-ERESTARTSYS - Interrupted. 
*/ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_recovery_info *ri) { struct nilfs_root *root; int err; if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) return 0; err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root); if (unlikely(err)) { nilfs_err(sb, "error %d loading the latest checkpoint", err); return err; } err = nilfs_do_roll_forward(nilfs, sb, root, ri); if (unlikely(err)) goto failed; if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri); if (unlikely(err)) { nilfs_err(sb, "error %d preparing segment for recovery", err); goto failed; } err = nilfs_attach_log_writer(sb, root); if (unlikely(err)) goto failed; set_nilfs_discontinued(nilfs); err = nilfs_construct_segment(sb); nilfs_detach_log_writer(sb); if (unlikely(err)) { nilfs_err(sb, "error %d writing segment for recovery", err); goto put_root; } nilfs_finish_roll_forward(nilfs, ri); } put_root: nilfs_put_root(root); return err; failed: nilfs_abort_roll_forward(nilfs); goto put_root; } /** * nilfs_search_super_root - search the latest valid super root * @nilfs: the_nilfs * @ri: pointer to a nilfs_recovery_info struct to store search results. * * nilfs_search_super_root() looks for the latest super-root from a partial * segment pointed by the superblock. It sets up struct the_nilfs through * this search. It fills nilfs_recovery_info (ri) required for recovery. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - No valid segment found. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; struct nilfs_segment_summary *sum = NULL; sector_t pseg_start, pseg_end, sr_pseg_start = 0; sector_t seg_start, seg_end; /* range of full segment (block number) */ sector_t b, end; unsigned long nblocks; unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; __u64 cno; LIST_HEAD(segments); int empty_seg = 0, scan_newer = 0; int ret; pseg_start = nilfs->ns_last_pseg; seg_seq = nilfs->ns_last_seq; cno = nilfs->ns_last_cno; segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); /* Calculate range of segment */ nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); /* Read ahead segment */ b = seg_start; while (b <= seg_end) __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); for (;;) { brelse(bh_sum); ret = NILFS_SEG_FAIL_IO; bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); if (!bh_sum) goto failed; ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) goto failed; goto strayed; } nblocks = le32_to_cpu(sum->ss_nblocks); pseg_end = pseg_start + nblocks - 1; if (unlikely(pseg_end > seg_end)) { ret = NILFS_SEG_FAIL_CONSISTENCY; goto strayed; } /* A valid partial segment */ ri->ri_pseg_start = pseg_start; ri->ri_seq = seg_seq; ri->ri_segnum = segnum; nextnum = nilfs_get_segnum_of_block(nilfs, le64_to_cpu(sum->ss_next)); ri->ri_nextnum = nextnum; empty_seg = 0; flags = le16_to_cpu(sum->ss_flags); if (!(flags & NILFS_SS_SR) && !scan_newer) { /* * This will never happen because a superblock * (last_segment) always points to a pseg with * a super root. 
*/ ret = NILFS_SEG_FAIL_CONSISTENCY; goto failed; } if (pseg_start == seg_start) { nilfs_get_segment_range(nilfs, nextnum, &b, &end); while (b <= end) __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); } if (!(flags & NILFS_SS_SR)) { if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) { ri->ri_lsegs_start = pseg_start; ri->ri_lsegs_start_seq = seg_seq; } if (flags & NILFS_SS_LOGEND) ri->ri_lsegs_end = pseg_start; goto try_next_pseg; } /* A valid super root was found. */ ri->ri_cno = cno++; ri->ri_super_root = pseg_end; ri->ri_lsegs_start = ri->ri_lsegs_end = 0; nilfs_dispose_segment_list(&segments); sr_pseg_start = pseg_start; nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start; nilfs->ns_seg_seq = seg_seq; nilfs->ns_segnum = segnum; nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ nilfs->ns_ctime = le64_to_cpu(sum->ss_create); nilfs->ns_nextnum = nextnum; if (scan_newer) ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; else { if (nilfs->ns_mount_state & NILFS_VALID_FS) goto super_root_found; scan_newer = 1; } try_next_pseg: /* Standing on a course, or met an inconsistent state */ pseg_start += nblocks; if (pseg_start < seg_end) continue; goto feed_segment; strayed: /* Off the trail */ if (!scan_newer) /* * This can happen if a checkpoint was written without * barriers, or as a result of an I/O failure. 
*/ goto failed; feed_segment: /* Looking to the next full segment */ if (empty_seg++) goto super_root_found; /* found a valid super root */ ret = nilfs_segment_list_add(&segments, segnum); if (unlikely(ret)) goto failed; seg_seq++; segnum = nextnum; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); pseg_start = seg_start; } super_root_found: /* Updating pointers relating to the latest checkpoint */ brelse(bh_sum); list_splice_tail(&segments, &ri->ri_used_segments); nilfs->ns_last_pseg = sr_pseg_start; nilfs->ns_last_seq = nilfs->ns_seg_seq; nilfs->ns_last_cno = ri->ri_cno; return 0; failed: brelse(bh_sum); nilfs_dispose_segment_list(&segments); return ret < 0 ? ret : nilfs_warn_segment_error(nilfs->ns_sb, ret); } |
| 10 10 2 4 2 3 3 2 3 2 11 2 2 2 2 7 7 10 9 10 10 3 1 3 3 5 3 4 2 2 6 6 6 6 6 6 1 1 6 2 10 6 10 10 5 5 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 5 10 10 10 10 9 4 6 6 10 10 5 5 5 3 5 5 5 3 5 5 11 1 17 17 1 1 17 9 6 11 4 15 1 10 1 1 1 18 1 1 6 5 6 10 11 11 11 11 10 1 10 10 5 5 11 23 1 1 2 19 14 5 7 5 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 
451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 
951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 
1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 | // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> */ #include "xfs_platform.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_btree.h" #include "xfs_rmap_btree.h" #include "xfs_trace.h" #include "xfs_rmap.h" #include "xfs_alloc.h" #include "xfs_bit.h" #include <linux/fsmap.h> #include "xfs_fsmap.h" #include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_alloc_btree.h" #include "xfs_rtbitmap.h" #include "xfs_ag.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" /* Convert an xfs_fsmap to an fsmap. */ static void xfs_fsmap_from_internal( struct fsmap *dest, struct xfs_fsmap *src) { dest->fmr_device = src->fmr_device; dest->fmr_flags = src->fmr_flags; dest->fmr_physical = BBTOB(src->fmr_physical); dest->fmr_owner = src->fmr_owner; dest->fmr_offset = BBTOB(src->fmr_offset); dest->fmr_length = BBTOB(src->fmr_length); dest->fmr_reserved[0] = 0; dest->fmr_reserved[1] = 0; dest->fmr_reserved[2] = 0; } /* Convert an fsmap to an xfs_fsmap. */ static void xfs_fsmap_to_internal( struct xfs_fsmap *dest, struct fsmap *src) { dest->fmr_device = src->fmr_device; dest->fmr_flags = src->fmr_flags; dest->fmr_physical = BTOBBT(src->fmr_physical); dest->fmr_owner = src->fmr_owner; dest->fmr_offset = BTOBBT(src->fmr_offset); dest->fmr_length = BTOBBT(src->fmr_length); } /* Convert an fsmap owner into an rmapbt owner. 
*/ static int xfs_fsmap_owner_to_rmap( struct xfs_rmap_irec *dest, const struct xfs_fsmap *src) { if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) { dest->rm_owner = src->fmr_owner; return 0; } switch (src->fmr_owner) { case 0: /* "lowest owner id possible" */ case -1ULL: /* "highest owner id possible" */ dest->rm_owner = src->fmr_owner; break; case XFS_FMR_OWN_FREE: dest->rm_owner = XFS_RMAP_OWN_NULL; break; case XFS_FMR_OWN_UNKNOWN: dest->rm_owner = XFS_RMAP_OWN_UNKNOWN; break; case XFS_FMR_OWN_FS: dest->rm_owner = XFS_RMAP_OWN_FS; break; case XFS_FMR_OWN_LOG: dest->rm_owner = XFS_RMAP_OWN_LOG; break; case XFS_FMR_OWN_AG: dest->rm_owner = XFS_RMAP_OWN_AG; break; case XFS_FMR_OWN_INOBT: dest->rm_owner = XFS_RMAP_OWN_INOBT; break; case XFS_FMR_OWN_INODES: dest->rm_owner = XFS_RMAP_OWN_INODES; break; case XFS_FMR_OWN_REFC: dest->rm_owner = XFS_RMAP_OWN_REFC; break; case XFS_FMR_OWN_COW: dest->rm_owner = XFS_RMAP_OWN_COW; break; case XFS_FMR_OWN_DEFECTIVE: /* not implemented */ /* fall through */ default: return -EINVAL; } return 0; } /* Convert an rmapbt owner into an fsmap owner. 
*/ static int xfs_fsmap_owner_from_frec( struct xfs_fsmap *dest, const struct xfs_fsmap_irec *frec) { dest->fmr_flags = 0; if (!XFS_RMAP_NON_INODE_OWNER(frec->owner)) { dest->fmr_owner = frec->owner; return 0; } dest->fmr_flags |= FMR_OF_SPECIAL_OWNER; switch (frec->owner) { case XFS_RMAP_OWN_FS: dest->fmr_owner = XFS_FMR_OWN_FS; break; case XFS_RMAP_OWN_LOG: dest->fmr_owner = XFS_FMR_OWN_LOG; break; case XFS_RMAP_OWN_AG: dest->fmr_owner = XFS_FMR_OWN_AG; break; case XFS_RMAP_OWN_INOBT: dest->fmr_owner = XFS_FMR_OWN_INOBT; break; case XFS_RMAP_OWN_INODES: dest->fmr_owner = XFS_FMR_OWN_INODES; break; case XFS_RMAP_OWN_REFC: dest->fmr_owner = XFS_FMR_OWN_REFC; break; case XFS_RMAP_OWN_COW: dest->fmr_owner = XFS_FMR_OWN_COW; break; case XFS_RMAP_OWN_NULL: /* "free" */ dest->fmr_owner = XFS_FMR_OWN_FREE; break; default: ASSERT(0); return -EFSCORRUPTED; } return 0; } /* getfsmap query state */ struct xfs_getfsmap_info { struct xfs_fsmap_head *head; struct fsmap *fsmap_recs; /* mapping records */ struct xfs_buf *agf_bp; /* AGF, for refcount queries */ struct xfs_group *group; /* group info, if applicable */ xfs_daddr_t next_daddr; /* next daddr we expect */ /* daddr of low fsmap key when we're using the rtbitmap */ xfs_daddr_t low_daddr; /* daddr of high fsmap key, or the last daddr on the device */ xfs_daddr_t end_daddr; u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ /* * Low rmap key for the query. If low.rm_blockcount is nonzero, this * is the second (or later) call to retrieve the recordset in pieces. * xfs_getfsmap_rec_before_start will compare all records retrieved * by the rmapbt query to filter out any records that start before * the last record. */ struct xfs_rmap_irec low; struct xfs_rmap_irec high; /* high rmap key */ bool last; /* last extent? */ }; /* Associate a device with a getfsmap handler. 
*/ struct xfs_getfsmap_dev { u32 dev; int (*fn)(struct xfs_trans *tp, const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info); sector_t nr_sectors; }; /* Compare two getfsmap device handlers. */ static int xfs_getfsmap_dev_compare( const void *p1, const void *p2) { const struct xfs_getfsmap_dev *d1 = p1; const struct xfs_getfsmap_dev *d2 = p2; return d1->dev - d2->dev; } /* Decide if this mapping is shared. */ STATIC int xfs_getfsmap_is_shared( struct xfs_trans *tp, struct xfs_getfsmap_info *info, const struct xfs_fsmap_irec *frec, bool *stat) { struct xfs_mount *mp = tp->t_mountp; struct xfs_btree_cur *cur; xfs_agblock_t fbno; xfs_extlen_t flen = 0; int error; *stat = false; if (!xfs_has_reflink(mp) || !info->group) return 0; if (info->group->xg_type == XG_TYPE_RTG) cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(info->group)); else cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, to_perag(info->group)); /* Are there any shared blocks here? */ error = xfs_refcount_find_shared(cur, frec->rec_key, XFS_BB_TO_FSBT(mp, frec->len_daddr), &fbno, &flen, false); xfs_btree_del_cursor(cur, error); if (error) return error; *stat = flen > 0; return 0; } static inline void xfs_getfsmap_format( struct xfs_mount *mp, struct xfs_fsmap *xfm, struct xfs_getfsmap_info *info) { struct fsmap *rec; trace_xfs_getfsmap_mapping(mp, xfm); rec = &info->fsmap_recs[info->head->fmh_entries++]; xfs_fsmap_from_internal(rec, xfm); } static inline bool xfs_getfsmap_frec_before_start( struct xfs_getfsmap_info *info, const struct xfs_fsmap_irec *frec) { if (info->low_daddr != XFS_BUF_DADDR_NULL) return frec->start_daddr < info->low_daddr; if (info->low.rm_blockcount) { struct xfs_rmap_irec rec = { .rm_startblock = frec->rec_key, .rm_owner = frec->owner, .rm_flags = frec->rm_flags, }; return xfs_rmap_compare(&rec, &info->low) < 0; } return false; } /* * Format a reverse mapping for getfsmap, having translated rm_startblock * into the appropriate daddr units. 
Pass in a nonzero @len_daddr if the * length could be larger than rm_blockcount in struct xfs_rmap_irec. */ STATIC int xfs_getfsmap_helper( struct xfs_trans *tp, struct xfs_getfsmap_info *info, const struct xfs_fsmap_irec *frec) { struct xfs_fsmap fmr; struct xfs_mount *mp = tp->t_mountp; bool shared; int error = 0; if (fatal_signal_pending(current)) return -EINTR; /* * Filter out records that start before our startpoint, if the * caller requested that. */ if (xfs_getfsmap_frec_before_start(info, frec)) goto out; /* Are we just counting mappings? */ if (info->head->fmh_count == 0) { if (info->head->fmh_entries == UINT_MAX) return -ECANCELED; if (frec->start_daddr > info->next_daddr) info->head->fmh_entries++; if (info->last) return 0; info->head->fmh_entries++; goto out; } /* * If the record starts past the last physical block we saw, * then we've found a gap. Report the gap as being owned by * whatever the caller specified is the missing owner. */ if (frec->start_daddr > info->next_daddr) { if (info->head->fmh_entries >= info->head->fmh_count) return -ECANCELED; fmr.fmr_device = info->dev; fmr.fmr_physical = info->next_daddr; fmr.fmr_owner = info->missing_owner; fmr.fmr_offset = 0; fmr.fmr_length = frec->start_daddr - info->next_daddr; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; xfs_getfsmap_format(mp, &fmr, info); } if (info->last) goto out; /* Fill out the extent we found */ if (info->head->fmh_entries >= info->head->fmh_count) return -ECANCELED; trace_xfs_fsmap_mapping(mp, info->dev, info->group ? 
info->group->xg_gno : NULLAGNUMBER, frec); fmr.fmr_device = info->dev; fmr.fmr_physical = frec->start_daddr; error = xfs_fsmap_owner_from_frec(&fmr, frec); if (error) return error; fmr.fmr_offset = XFS_FSB_TO_BB(mp, frec->offset); fmr.fmr_length = frec->len_daddr; if (frec->rm_flags & XFS_RMAP_UNWRITTEN) fmr.fmr_flags |= FMR_OF_PREALLOC; if (frec->rm_flags & XFS_RMAP_ATTR_FORK) fmr.fmr_flags |= FMR_OF_ATTR_FORK; if (frec->rm_flags & XFS_RMAP_BMBT_BLOCK) fmr.fmr_flags |= FMR_OF_EXTENT_MAP; if (fmr.fmr_flags == 0) { error = xfs_getfsmap_is_shared(tp, info, frec, &shared); if (error) return error; if (shared) fmr.fmr_flags |= FMR_OF_SHARED; } xfs_getfsmap_format(mp, &fmr, info); out: info->next_daddr = max(info->next_daddr, frec->start_daddr + frec->len_daddr); return 0; } static inline int xfs_getfsmap_group_helper( struct xfs_getfsmap_info *info, struct xfs_trans *tp, struct xfs_group *xg, xfs_agblock_t startblock, xfs_extlen_t blockcount, struct xfs_fsmap_irec *frec) { /* * For an info->last query, we're looking for a gap between the last * mapping emitted and the high key specified by userspace. If the * user's query spans less than 1 fsblock, then info->high and * info->low will have the same rm_startblock, which causes rec_daddr * and next_daddr to be the same. Therefore, use the end_daddr that * we calculated from userspace's high key to synthesize the record. * Note that if the btree query found a mapping, there won't be a gap. 
*/ if (info->last) frec->start_daddr = info->end_daddr + 1; else frec->start_daddr = xfs_gbno_to_daddr(xg, startblock); frec->len_daddr = XFS_FSB_TO_BB(xg->xg_mount, blockcount); return xfs_getfsmap_helper(tp, info, frec); } /* Transform a rmapbt irec into a fsmap */ STATIC int xfs_getfsmap_rmapbt_helper( struct xfs_btree_cur *cur, const struct xfs_rmap_irec *rec, void *priv) { struct xfs_fsmap_irec frec = { .owner = rec->rm_owner, .offset = rec->rm_offset, .rm_flags = rec->rm_flags, .rec_key = rec->rm_startblock, }; struct xfs_getfsmap_info *info = priv; return xfs_getfsmap_group_helper(info, cur->bc_tp, cur->bc_group, rec->rm_startblock, rec->rm_blockcount, &frec); } /* Transform a bnobt irec into a fsmap */ STATIC int xfs_getfsmap_datadev_bnobt_helper( struct xfs_btree_cur *cur, const struct xfs_alloc_rec_incore *rec, void *priv) { struct xfs_fsmap_irec frec = { .owner = XFS_RMAP_OWN_NULL, /* "free" */ .rec_key = rec->ar_startblock, }; struct xfs_getfsmap_info *info = priv; return xfs_getfsmap_group_helper(info, cur->bc_tp, cur->bc_group, rec->ar_startblock, rec->ar_blockcount, &frec); } /* Set rmap flags based on the getfsmap flags */ static void xfs_getfsmap_set_irec_flags( struct xfs_rmap_irec *irec, const struct xfs_fsmap *fmr) { irec->rm_flags = 0; if (fmr->fmr_flags & FMR_OF_ATTR_FORK) irec->rm_flags |= XFS_RMAP_ATTR_FORK; if (fmr->fmr_flags & FMR_OF_EXTENT_MAP) irec->rm_flags |= XFS_RMAP_BMBT_BLOCK; if (fmr->fmr_flags & FMR_OF_PREALLOC) irec->rm_flags |= XFS_RMAP_UNWRITTEN; } static inline bool rmap_not_shareable(struct xfs_mount *mp, const struct xfs_rmap_irec *r) { if (!xfs_has_reflink(mp)) return true; if (XFS_RMAP_NON_INODE_OWNER(r->rm_owner)) return true; if (r->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | XFS_RMAP_UNWRITTEN)) return true; return false; } /* Execute a getfsmap query against the regular data device. 
*/ STATIC int __xfs_getfsmap_datadev( struct xfs_trans *tp, const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info, int (*query_fn)(struct xfs_trans *, struct xfs_getfsmap_info *, struct xfs_btree_cur **, void *), void *priv) { struct xfs_mount *mp = tp->t_mountp; struct xfs_perag *pag = NULL; struct xfs_btree_cur *bt_cur = NULL; xfs_fsblock_t start_fsb; xfs_fsblock_t end_fsb; xfs_agnumber_t start_ag, end_ag; uint64_t eofs; int error = 0; eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (keys[0].fmr_physical >= eofs) return 0; start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical); end_fsb = XFS_DADDR_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); /* * Convert the fsmap low/high keys to AG based keys. Initialize * low to the fsmap low key and max out the high key to the end * of the AG. */ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); if (error) return error; info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length); xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); /* Adjust the low key if we are continuing from where we left off. */ if (info->low.rm_blockcount == 0) { /* No previous record from which to continue */ } else if (rmap_not_shareable(mp, &info->low)) { /* Last record seen was an unshareable extent */ info->low.rm_owner = 0; info->low.rm_offset = 0; start_fsb += info->low.rm_blockcount; if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs) return 0; } else { /* Last record seen was a shareable file data extent */ info->low.rm_offset += info->low.rm_blockcount; } < |